1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
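// (With PreserveFramePointer the rbp saved below also serves as a conventional
// frame pointer, so external tools that unwind via rbp-chained frames can walk
// through this compiled frame.)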
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. These methods would accept arguments as 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 188 // indications in the icc.ZFlag. 
fast_lock and fast_unlock would simply
189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
190 // In practice, however, the # of lock sites is bounded and is usually small.
191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
192 // if the processor uses simple bimodal branch predictors keyed by EIP,
193 // since the helper routines would be called from multiple synchronization
194 // sites.
195 //
196 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
198 // to those specialized methods. That'd give us a mostly platform-independent
199 // implementation that the JITs could optimize and inline at their pleasure.
200 // Done correctly, the only time we'd need to cross to native code would be
201 // to park() or unpark() threads. We'd also need a few more unsafe operators
202 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
203 // (b) explicit barriers or fence operations.
204 //
205 // TODO:
206 //
207 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
208 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
209 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
210 // the lock operators would typically be faster than reifying Self.
211 //
212 // * Ideally I'd define the primitives as:
213 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
214 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
215 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
216 // Instead, we're stuck with the rather awkward and brittle register assignments below.
217 // Furthermore the register assignments are overconstrained, possibly resulting in
218 // sub-optimal code near the synchronization site.
219 //
220 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
221 // Alternately, use a better sp-proximity test.
222 //
223 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
224 // Either one is sufficient to uniquely identify a thread.
225 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
226 //
227 // * Intrinsify notify() and notifyAll() for the common cases where the
228 // object is locked by the calling thread but the waitlist is empty.
229 // Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
230 //
231 // * Use jccb and jmpb instead of jcc and jmp to improve code density.
232 // But beware of excessive branch density on AMD Opterons.
233 //
234 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
235 // or failure of the fast path. If the fast path fails then we pass
236 // control to the slow path, typically in C. In fast_lock and
237 // fast_unlock we often branch to DONE_LABEL, just to find that C2
238 // will emit a conditional branch immediately after the node.
239 // So we have branches to branches and lots of ICC.ZF games.
240 // Instead, it might be better to have C2 pass a "FailureLabel"
241 // into fast_lock and fast_unlock. In the case of success, control
242 // will drop through the node. ICC.ZF is undefined at exit.
243 // In the case of failure, the node will branch directly to the
244 // FailureLabel.
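//
// Illustrative sketch (not emitted code): the cmpFastLock/cmpFastUnlock nodes
// consume the ZF produced here roughly as if C2 had generated
//
//   fast_lock(obj, box, ...)   // ZF = 1 on fast-path success, ZF = 0 on failure
//   jne   slow_path            // ZF == 0  =>  call the runtime monitorenter/monitorexit helper
//   ...                        // otherwise continue into the critical section
//
// where "slow_path" is an illustrative label only; the actual branch is emitted
// by the ADLC patterns that match cmpFastLock/cmpFastUnlock.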
245
246
247 // obj: object to lock
248 // box: on-stack box address (displaced header location) - KILLED
249 // rax: tmp -- KILLED
250 // scr: tmp -- KILLED
251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
252 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
253 Metadata* method_data) {
254 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
255 // Ensure the register assignments are disjoint
256 assert(tmpReg == rax, "");
257 assert(cx1Reg == noreg, "");
258 assert(cx2Reg == noreg, "");
259 assert_different_registers(objReg, boxReg, tmpReg, scrReg);
260
261 // Possible cases that we'll encounter in fast_lock
262 // ------------------------------------------------
263 // * Inflated
264 // -- unlocked
265 // -- Locked
266 // = by self
267 // = by other
268 // * neutral
269 // * stack-locked
270 // -- by self
271 // = sp-proximity test hits
272 // = sp-proximity test generates false-negative
273 // -- by other
274 //
275
276 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;
277
278 if (DiagnoseSyncOnValueBasedClasses != 0) {
279 load_klass(tmpReg, objReg, scrReg);
280 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
281 jcc(Assembler::notZero, DONE_LABEL);
282 }
283
284 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
285 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
286 jcc(Assembler::notZero, IsInflated);
287
288 if (LockingMode == LM_MONITOR) {
289 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
290 testptr(objReg, objReg);
291 } else {
292 assert(LockingMode == LM_LEGACY, "must be");
293 // Attempt stack-locking ...
294 orptr (tmpReg, markWord::unlocked_value);
295 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
296 lock();
297 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
298 jcc(Assembler::equal, COUNT); // Success
299
300 // Recursive locking.
301 // The object is stack-locked: markword contains stack pointer to BasicLock.
302 // Locked by current thread if difference with current SP is less than one page.
303 subptr(tmpReg, rsp);
304 // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
305 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
306 movptr(Address(boxReg, 0), tmpReg);
307 }
308 jmp(DONE_LABEL);
309
310 bind(IsInflated);
311 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
312
313 #ifndef _LP64
314 // The object is inflated.
315
316 // boxReg refers to the on-stack BasicLock in the current frame.
317 // We'd like to write:
318 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
319 // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
320 // additional latency as we have another ST in the store buffer that must drain.
321
322 // avoid ST-before-CAS
323 // register juggle because we need tmpReg for cmpxchgptr below
324 movptr(scrReg, boxReg);
325 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
326
327 // Optimistic form: consider XORL tmpReg,tmpReg
328 movptr(tmpReg, NULL_WORD);
329
330 // Appears unlocked - try to swing _owner from null to non-null.
331 // Ideally, I'd manifest "Self" with get_thread and then attempt 332 // to CAS the register containing Self into m->Owner. 333 // But we don't have enough registers, so instead we can either try to CAS 334 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 335 // we later store "Self" into m->Owner. Transiently storing a stack address 336 // (rsp or the address of the box) into m->owner is harmless. 337 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 338 lock(); 339 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 340 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 341 // If we weren't able to swing _owner from null to the BasicLock 342 // then take the slow path. 343 jccb (Assembler::notZero, NO_COUNT); 344 // update _owner from BasicLock to thread 345 get_thread (scrReg); // beware: clobbers ICCs 346 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 347 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 348 349 // If the CAS fails we can either retry or pass control to the slow path. 350 // We use the latter tactic. 351 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 352 // If the CAS was successful ... 353 // Self has acquired the lock 354 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 355 // Intentional fall-through into DONE_LABEL ... 356 #else // _LP64 357 // It's inflated and we use scrReg for ObjectMonitor* in this section. 358 movq(scrReg, tmpReg); 359 xorq(tmpReg, tmpReg); 360 lock(); 361 cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 362 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 363 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 364 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 365 // Propagate ICC.ZF from CAS above into DONE_LABEL. 366 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 367 368 cmpptr(thread, rax); // Check if we are already the owner (recursive lock) 369 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 370 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 371 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 372 #endif // _LP64 373 bind(DONE_LABEL); 374 375 // ZFlag == 1 count in fast path 376 // ZFlag == 0 count in slow path 377 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 378 379 bind(COUNT); 380 // Count monitors in fast path 381 increment(Address(thread, JavaThread::held_monitor_count_offset())); 382 383 xorl(tmpReg, tmpReg); // Set ZF == 1 384 385 bind(NO_COUNT); 386 387 // At NO_COUNT the icc ZFlag is set as follows ... 388 // fast_unlock uses the same protocol. 389 // ZFlag == 1 -> Success 390 // ZFlag == 0 -> Failure - force control through the slow path 391 } 392 393 // obj: object to unlock 394 // box: box address (displaced header location), killed. Must be EAX. 395 // tmp: killed, cannot be obj nor box. 396 // 397 // Some commentary on balanced locking: 398 // 399 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 400 // Methods that don't have provably balanced locking are forced to run in the 401 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 
402 // The interpreter provides two properties:
403 // I1: At return-time the interpreter automatically and quietly unlocks any
404 // objects acquired by the current activation (frame). Recall that the
405 // interpreter maintains an on-stack list of locks currently held by
406 // a frame.
407 // I2: If a method attempts to unlock an object that is not held by the
408 // frame, the interpreter throws IMSX.
409 //
410 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
411 // B() doesn't have provably balanced locking so it runs in the interpreter.
412 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
413 // is still locked by A().
414 //
415 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
416 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
417 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
418 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
419 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
420 // could reasonably *avoid* checking owner in fast_unlock().
421 // In the interest of performance we elide the m->Owner == Self check in unlock.
422 // A perfectly viable alternative is to elide the owner check except when
423 // Xcheck:jni is enabled.
424
425 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
426 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
427 assert(boxReg == rax, "");
428 assert_different_registers(objReg, boxReg, tmpReg);
429
430 Label DONE_LABEL, Stacked, COUNT, NO_COUNT;
431
432 if (LockingMode == LM_LEGACY) {
433 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header
434 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock
435 }
436 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
437 if (LockingMode != LM_MONITOR) {
438 testptr(tmpReg, markWord::monitor_value); // Inflated?
439 jcc(Assembler::zero, Stacked);
440 }
441
442 // It's inflated.
443
444 // Despite our balanced locking property we still check that m->_owner == Self
445 // as java routines or native JNI code called by this thread might
446 // have released the lock.
447 // Refer to the comments in synchronizer.cpp for how we might encode extra
448 // state in _succ so we can avoid fetching EntryList|cxq.
449 //
450 // If there's no contention try a 1-0 exit. That is, exit without
451 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
452 // we detect and recover from the race that the 1-0 exit admits.
453 //
454 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
455 // before it STs null into _owner, releasing the lock. Updates
456 // to data protected by the critical section must be visible before
457 // we drop the lock (and thus before any other thread could acquire
458 // the lock and observe the fields protected by the lock).
459 // IA32's memory-model is SPO, so STs are ordered with respect to
460 // each other and there's no need for an explicit barrier (fence).
461 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
462 #ifndef _LP64
463 // Note that we could employ various encoding schemes to reduce
464 // the number of loads below (currently 4) to just 2 or 3.
465 // Refer to the comments in synchronizer.cpp.
466 // In practice the chain of fetches doesn't seem to impact performance, however. 467 xorptr(boxReg, boxReg); 468 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 469 jccb (Assembler::notZero, DONE_LABEL); 470 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 471 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 472 jccb (Assembler::notZero, DONE_LABEL); 473 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 474 jmpb (DONE_LABEL); 475 #else // _LP64 476 // It's inflated 477 Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath; 478 479 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 480 jccb(Assembler::equal, LNotRecursive); 481 482 // Recursive inflated unlock 483 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 484 jmpb(LSuccess); 485 486 bind(LNotRecursive); 487 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 488 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 489 jccb (Assembler::notZero, CheckSucc); 490 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 491 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 492 jmpb (DONE_LABEL); 493 494 // Try to avoid passing control into the slow_path ... 495 bind (CheckSucc); 496 497 // The following optional optimization can be elided if necessary 498 // Effectively: if (succ == null) goto slow path 499 // The code reduces the window for a race, however, 500 // and thus benefits performance. 501 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 502 jccb (Assembler::zero, LGoSlowPath); 503 504 xorptr(boxReg, boxReg); 505 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 506 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 507 508 // Memory barrier/fence 509 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 510 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 511 // This is faster on Nehalem and AMD Shanghai/Barcelona. 512 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 513 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 514 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 515 lock(); addl(Address(rsp, 0), 0); 516 517 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 518 jccb (Assembler::notZero, LSuccess); 519 520 // Rare inopportune interleaving - race. 521 // The successor vanished in the small window above. 522 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 523 // We need to ensure progress and succession. 524 // Try to reacquire the lock. 525 // If that fails then the new owner is responsible for succession and this 526 // thread needs to take no further action and can exit via the fast path (success). 527 // If the re-acquire succeeds then pass control into the slow path. 528 // As implemented, this latter mode is horrible because we generated more 529 // coherence traffic on the lock *and* artificially extended the critical section 530 // length while by virtue of passing control into the slow path. 
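// Roughly, the recovery sequence emitted below is:
//   if (CAS(&m->_owner, null, Self) != null)  goto LSuccess;    // somebody else took the lock; our exit stands
//   else                                      goto LGoSlowPath; // we re-acquired it; let the runtime hand it off
// (illustrative pseudo-code only; the actual instructions follow)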
531 532 // box is really RAX -- the following CMPXCHG depends on that binding 533 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 534 lock(); 535 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 536 // There's no successor so we tried to regrab the lock. 537 // If that didn't work, then another thread grabbed the 538 // lock so we're done (and exit was a success). 539 jccb (Assembler::notEqual, LSuccess); 540 // Intentional fall-through into slow path 541 542 bind (LGoSlowPath); 543 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 544 jmpb (DONE_LABEL); 545 546 bind (LSuccess); 547 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 548 jmpb (DONE_LABEL); 549 550 #endif 551 if (LockingMode == LM_LEGACY) { 552 bind (Stacked); 553 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 554 lock(); 555 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 556 // Intentional fall-thru into DONE_LABEL 557 } 558 559 bind(DONE_LABEL); 560 561 // ZFlag == 1 count in fast path 562 // ZFlag == 0 count in slow path 563 jccb(Assembler::notZero, NO_COUNT); 564 565 bind(COUNT); 566 // Count monitors in fast path 567 #ifndef _LP64 568 get_thread(tmpReg); 569 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 570 #else // _LP64 571 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 572 #endif 573 574 xorl(tmpReg, tmpReg); // Set ZF == 1 575 576 bind(NO_COUNT); 577 } 578 579 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 580 Register t, Register thread) { 581 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 582 assert(rax_reg == rax, "Used for CAS"); 583 assert_different_registers(obj, box, rax_reg, t, thread); 584 585 // Handle inflated monitor. 586 Label inflated; 587 // Finish fast lock successfully. ZF value is irrelevant. 588 Label locked; 589 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 590 Label slow_path; 591 592 if (UseObjectMonitorTable) { 593 // Clear cache in case fast locking succeeds. 594 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 595 } 596 597 if (DiagnoseSyncOnValueBasedClasses != 0) { 598 load_klass(rax_reg, obj, t); 599 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 600 jcc(Assembler::notZero, slow_path); 601 } 602 603 const Register mark = t; 604 605 { // Lightweight Lock 606 607 Label push; 608 609 const Register top = UseObjectMonitorTable ? rax_reg : box; 610 611 // Load the mark. 612 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 613 614 // Prefetch top. 615 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 616 617 // Check for monitor (0b10). 618 testptr(mark, markWord::monitor_value); 619 jcc(Assembler::notZero, inflated); 620 621 // Check if lock-stack is full. 622 cmpl(top, LockStack::end_offset() - 1); 623 jcc(Assembler::greater, slow_path); 624 625 // Check if recursive. 626 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 627 jccb(Assembler::equal, push); 628 629 // Try to lock. Transition lock bits 0b01 => 0b00 630 movptr(rax_reg, mark); 631 orptr(rax_reg, markWord::unlocked_value); 632 andptr(mark, ~(int32_t)markWord::unlocked_value); 633 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 634 jcc(Assembler::notEqual, slow_path); 635 636 if (UseObjectMonitorTable) { 637 // Need to reload top, clobbered by CAS. 
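// (When UseObjectMonitorTable is enabled, top aliases rax_reg, and rax was just
// used as the cmpxchg comparand above, hence the reload.)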
638 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 639 } 640 bind(push); 641 // After successful lock, push object on lock-stack. 642 movptr(Address(thread, top), obj); 643 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 644 jmpb(locked); 645 } 646 647 { // Handle inflated monitor. 648 bind(inflated); 649 650 const Register monitor = t; 651 652 if (!UseObjectMonitorTable) { 653 assert(mark == monitor, "should be the same here"); 654 } else { 655 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 656 // Fetch ObjectMonitor* from the cache or take the slow-path. 657 Label monitor_found; 658 659 // Load cache address 660 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 661 662 const int num_unrolled = 2; 663 for (int i = 0; i < num_unrolled; i++) { 664 cmpptr(obj, Address(t)); 665 jccb(Assembler::equal, monitor_found); 666 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 667 } 668 669 Label loop; 670 671 // Search for obj in cache. 672 bind(loop); 673 674 // Check for match. 675 cmpptr(obj, Address(t)); 676 jccb(Assembler::equal, monitor_found); 677 678 // Search until null encountered, guaranteed _null_sentinel at end. 679 cmpptr(Address(t), 1); 680 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 681 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 682 jmpb(loop); 683 684 // Cache hit. 685 bind(monitor_found); 686 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 687 } 688 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 689 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 690 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 691 692 Label monitor_locked; 693 // Lock the monitor. 694 695 // CAS owner (null => current thread). 696 xorptr(rax_reg, rax_reg); 697 lock(); cmpxchgptr(thread, owner_address); 698 jccb(Assembler::equal, monitor_locked); 699 700 // Check if recursive. 701 cmpptr(thread, rax_reg); 702 jccb(Assembler::notEqual, slow_path); 703 704 // Recursive. 705 increment(recursions_address); 706 707 bind(monitor_locked); 708 if (UseObjectMonitorTable) { 709 // Cache the monitor for unlock 710 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 711 } 712 } 713 714 bind(locked); 715 increment(Address(thread, JavaThread::held_monitor_count_offset())); 716 // Set ZF = 1 717 xorl(rax_reg, rax_reg); 718 719 #ifdef ASSERT 720 // Check that locked label is reached with ZF set. 721 Label zf_correct; 722 Label zf_bad_zero; 723 jcc(Assembler::zero, zf_correct); 724 jmp(zf_bad_zero); 725 #endif 726 727 bind(slow_path); 728 #ifdef ASSERT 729 // Check that slow_path label is reached with ZF not set. 730 jcc(Assembler::notZero, zf_correct); 731 stop("Fast Lock ZF != 0"); 732 bind(zf_bad_zero); 733 stop("Fast Lock ZF != 1"); 734 bind(zf_correct); 735 #endif 736 // C2 uses the value of ZF to determine the continuation. 737 } 738 739 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 740 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 741 assert(reg_rax == rax, "Used for CAS"); 742 assert_different_registers(obj, reg_rax, t); 743 744 // Handle inflated monitor. 745 Label inflated, inflated_check_lock_stack; 746 // Finish fast unlock successfully. MUST jump with ZF == 1 747 Label unlocked; 748 749 // Assume success. 
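// (The count is decremented optimistically; if the unlock falls back to the
// slow-path stub, that stub is expected to restore it.)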
750 decrement(Address(thread, JavaThread::held_monitor_count_offset())); 751 752 const Register mark = t; 753 const Register monitor = t; 754 const Register top = UseObjectMonitorTable ? t : reg_rax; 755 const Register box = reg_rax; 756 757 Label dummy; 758 C2FastUnlockLightweightStub* stub = nullptr; 759 760 if (!Compile::current()->output()->in_scratch_emit_size()) { 761 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 762 Compile::current()->output()->add_stub(stub); 763 } 764 765 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 766 Label& check_successor = stub == nullptr ? dummy : stub->check_successor(); 767 Label& slow_path = stub == nullptr ? dummy : stub->slow_path(); 768 769 { // Lightweight Unlock 770 771 // Load top. 772 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 773 774 if (!UseObjectMonitorTable) { 775 // Prefetch mark. 776 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 777 } 778 779 // Check if obj is top of lock-stack. 780 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 781 // Top of lock stack was not obj. Must be monitor. 782 jcc(Assembler::notEqual, inflated_check_lock_stack); 783 784 // Pop lock-stack. 785 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 786 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 787 788 // Check if recursive. 789 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 790 jcc(Assembler::equal, unlocked); 791 792 // We elide the monitor check, let the CAS fail instead. 793 794 if (UseObjectMonitorTable) { 795 // Load mark. 796 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 797 } 798 799 // Try to unlock. Transition lock bits 0b00 => 0b01 800 movptr(reg_rax, mark); 801 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 802 orptr(mark, markWord::unlocked_value); 803 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 804 jcc(Assembler::notEqual, push_and_slow_path); 805 jmp(unlocked); 806 } 807 808 809 { // Handle inflated monitor. 810 bind(inflated_check_lock_stack); 811 #ifdef ASSERT 812 Label check_done; 813 subl(top, oopSize); 814 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 815 jcc(Assembler::below, check_done); 816 cmpptr(obj, Address(thread, top)); 817 jccb(Assembler::notEqual, inflated_check_lock_stack); 818 stop("Fast Unlock lock on stack"); 819 bind(check_done); 820 if (UseObjectMonitorTable) { 821 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 822 } 823 testptr(mark, markWord::monitor_value); 824 jccb(Assembler::notZero, inflated); 825 stop("Fast Unlock not monitor"); 826 #endif 827 828 bind(inflated); 829 830 if (!UseObjectMonitorTable) { 831 assert(mark == monitor, "should be the same here"); 832 } else { 833 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 834 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 835 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 836 cmpptr(monitor, alignof(ObjectMonitor*)); 837 jcc(Assembler::below, slow_path); 838 } 839 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 
0 : checked_cast<int>(markWord::monitor_value)); 840 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 841 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 842 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 843 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 844 845 Label recursive; 846 847 // Check if recursive. 848 cmpptr(recursions_address, 0); 849 jccb(Assembler::notEqual, recursive); 850 851 // Check if the entry lists are empty. 852 movptr(reg_rax, cxq_address); 853 orptr(reg_rax, EntryList_address); 854 jcc(Assembler::notZero, check_successor); 855 856 // Release lock. 857 movptr(owner_address, NULL_WORD); 858 jmpb(unlocked); 859 860 // Recursive unlock. 861 bind(recursive); 862 decrement(recursions_address); 863 xorl(t, t); 864 } 865 866 bind(unlocked); 867 if (stub != nullptr) { 868 bind(stub->unlocked_continuation()); 869 } 870 871 #ifdef ASSERT 872 // Check that unlocked label is reached with ZF set. 873 Label zf_correct; 874 jcc(Assembler::zero, zf_correct); 875 stop("Fast Unlock ZF != 1"); 876 #endif 877 878 if (stub != nullptr) { 879 bind(stub->slow_path_continuation()); 880 } 881 #ifdef ASSERT 882 // Check that stub->continuation() label is reached with ZF not set. 883 jccb(Assembler::notZero, zf_correct); 884 stop("Fast Unlock ZF != 0"); 885 bind(zf_correct); 886 #endif 887 // C2 uses the value of ZF to determine the continuation. 888 } 889 890 //------------------------------------------------------------------------------------------- 891 // Generic instructions support for use in .ad files C2 code generation 892 893 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 894 if (dst != src) { 895 movdqu(dst, src); 896 } 897 if (opcode == Op_AbsVD) { 898 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 899 } else { 900 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 901 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 902 } 903 } 904 905 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 906 if (opcode == Op_AbsVD) { 907 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 908 } else { 909 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 910 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 911 } 912 } 913 914 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 915 if (dst != src) { 916 movdqu(dst, src); 917 } 918 if (opcode == Op_AbsVF) { 919 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 920 } else { 921 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 922 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 923 } 924 } 925 926 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 927 if (opcode == Op_AbsVF) { 928 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 929 } else { 930 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 931 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 932 } 933 } 934 935 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 936 assert(opcode 
== Op_MinV || opcode == Op_MaxV, "sanity"); 937 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 938 939 if (opcode == Op_MinV) { 940 if (elem_bt == T_BYTE) { 941 pminsb(dst, src); 942 } else if (elem_bt == T_SHORT) { 943 pminsw(dst, src); 944 } else if (elem_bt == T_INT) { 945 pminsd(dst, src); 946 } else { 947 assert(elem_bt == T_LONG, "required"); 948 assert(tmp == xmm0, "required"); 949 assert_different_registers(dst, src, tmp); 950 movdqu(xmm0, dst); 951 pcmpgtq(xmm0, src); 952 blendvpd(dst, src); // xmm0 as mask 953 } 954 } else { // opcode == Op_MaxV 955 if (elem_bt == T_BYTE) { 956 pmaxsb(dst, src); 957 } else if (elem_bt == T_SHORT) { 958 pmaxsw(dst, src); 959 } else if (elem_bt == T_INT) { 960 pmaxsd(dst, src); 961 } else { 962 assert(elem_bt == T_LONG, "required"); 963 assert(tmp == xmm0, "required"); 964 assert_different_registers(dst, src, tmp); 965 movdqu(xmm0, src); 966 pcmpgtq(xmm0, dst); 967 blendvpd(dst, src); // xmm0 as mask 968 } 969 } 970 } 971 972 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 973 XMMRegister dst, XMMRegister src1, XMMRegister src2, 974 int vlen_enc) { 975 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 976 977 if (opcode == Op_MinV) { 978 if (elem_bt == T_BYTE) { 979 vpminsb(dst, src1, src2, vlen_enc); 980 } else if (elem_bt == T_SHORT) { 981 vpminsw(dst, src1, src2, vlen_enc); 982 } else if (elem_bt == T_INT) { 983 vpminsd(dst, src1, src2, vlen_enc); 984 } else { 985 assert(elem_bt == T_LONG, "required"); 986 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 987 vpminsq(dst, src1, src2, vlen_enc); 988 } else { 989 assert_different_registers(dst, src1, src2); 990 vpcmpgtq(dst, src1, src2, vlen_enc); 991 vblendvpd(dst, src1, src2, dst, vlen_enc); 992 } 993 } 994 } else { // opcode == Op_MaxV 995 if (elem_bt == T_BYTE) { 996 vpmaxsb(dst, src1, src2, vlen_enc); 997 } else if (elem_bt == T_SHORT) { 998 vpmaxsw(dst, src1, src2, vlen_enc); 999 } else if (elem_bt == T_INT) { 1000 vpmaxsd(dst, src1, src2, vlen_enc); 1001 } else { 1002 assert(elem_bt == T_LONG, "required"); 1003 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1004 vpmaxsq(dst, src1, src2, vlen_enc); 1005 } else { 1006 assert_different_registers(dst, src1, src2); 1007 vpcmpgtq(dst, src1, src2, vlen_enc); 1008 vblendvpd(dst, src2, src1, dst, vlen_enc); 1009 } 1010 } 1011 } 1012 } 1013 1014 // Float/Double min max 1015 1016 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1017 XMMRegister dst, XMMRegister a, XMMRegister b, 1018 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1019 int vlen_enc) { 1020 assert(UseAVX > 0, "required"); 1021 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1022 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1023 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1024 assert_different_registers(a, tmp, atmp, btmp); 1025 assert_different_registers(b, tmp, atmp, btmp); 1026 1027 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1028 bool is_double_word = is_double_word_type(elem_bt); 1029 1030 /* Note on 'non-obvious' assembly sequence: 1031 * 1032 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1033 * and Java on how they handle floats: 1034 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1035 * b. 
NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1036 * 1037 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1038 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1039 * (only useful when signs differ, noop otherwise) 1040 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1041 1042 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1043 * btmp = (b < +0.0) ? a : b 1044 * atmp = (b < +0.0) ? b : a 1045 * Tmp = Max_Float(atmp , btmp) 1046 * Res = (atmp == NaN) ? atmp : Tmp 1047 */ 1048 1049 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1050 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1051 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1052 XMMRegister mask; 1053 1054 if (!is_double_word && is_min) { 1055 mask = a; 1056 vblend = &MacroAssembler::vblendvps; 1057 vmaxmin = &MacroAssembler::vminps; 1058 vcmp = &MacroAssembler::vcmpps; 1059 } else if (!is_double_word && !is_min) { 1060 mask = b; 1061 vblend = &MacroAssembler::vblendvps; 1062 vmaxmin = &MacroAssembler::vmaxps; 1063 vcmp = &MacroAssembler::vcmpps; 1064 } else if (is_double_word && is_min) { 1065 mask = a; 1066 vblend = &MacroAssembler::vblendvpd; 1067 vmaxmin = &MacroAssembler::vminpd; 1068 vcmp = &MacroAssembler::vcmppd; 1069 } else { 1070 assert(is_double_word && !is_min, "sanity"); 1071 mask = b; 1072 vblend = &MacroAssembler::vblendvpd; 1073 vmaxmin = &MacroAssembler::vmaxpd; 1074 vcmp = &MacroAssembler::vcmppd; 1075 } 1076 1077 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1078 XMMRegister maxmin, scratch; 1079 if (dst == btmp) { 1080 maxmin = btmp; 1081 scratch = tmp; 1082 } else { 1083 maxmin = tmp; 1084 scratch = btmp; 1085 } 1086 1087 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1088 if (precompute_mask && !is_double_word) { 1089 vpsrad(tmp, mask, 32, vlen_enc); 1090 mask = tmp; 1091 } else if (precompute_mask && is_double_word) { 1092 vpxor(tmp, tmp, tmp, vlen_enc); 1093 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1094 mask = tmp; 1095 } 1096 1097 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1098 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1099 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1100 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1101 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1102 } 1103 1104 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1105 XMMRegister dst, XMMRegister a, XMMRegister b, 1106 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1107 int vlen_enc) { 1108 assert(UseAVX > 2, "required"); 1109 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1110 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1111 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1112 assert_different_registers(dst, a, atmp, btmp); 1113 assert_different_registers(dst, b, atmp, btmp); 1114 1115 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1116 bool is_double_word = is_double_word_type(elem_bt); 1117 bool merge = true; 1118 1119 if (!is_double_word && is_min) { 1120 evpmovd2m(ktmp, a, vlen_enc); 1121 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1122 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1123 
vminps(dst, atmp, btmp, vlen_enc); 1124 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1125 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1126 } else if (!is_double_word && !is_min) { 1127 evpmovd2m(ktmp, b, vlen_enc); 1128 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1129 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1130 vmaxps(dst, atmp, btmp, vlen_enc); 1131 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1132 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1133 } else if (is_double_word && is_min) { 1134 evpmovq2m(ktmp, a, vlen_enc); 1135 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1136 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1137 vminpd(dst, atmp, btmp, vlen_enc); 1138 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1139 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1140 } else { 1141 assert(is_double_word && !is_min, "sanity"); 1142 evpmovq2m(ktmp, b, vlen_enc); 1143 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1144 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1145 vmaxpd(dst, atmp, btmp, vlen_enc); 1146 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1147 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1148 } 1149 } 1150 1151 // Float/Double signum 1152 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1153 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1154 1155 Label DONE_LABEL; 1156 1157 if (opcode == Op_SignumF) { 1158 assert(UseSSE > 0, "required"); 1159 ucomiss(dst, zero); 1160 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1161 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1162 movflt(dst, one); 1163 jcc(Assembler::above, DONE_LABEL); 1164 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1165 } else if (opcode == Op_SignumD) { 1166 assert(UseSSE > 1, "required"); 1167 ucomisd(dst, zero); 1168 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1169 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1170 movdbl(dst, one); 1171 jcc(Assembler::above, DONE_LABEL); 1172 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1173 } 1174 1175 bind(DONE_LABEL); 1176 } 1177 1178 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1179 if (sign) { 1180 pmovsxbw(dst, src); 1181 } else { 1182 pmovzxbw(dst, src); 1183 } 1184 } 1185 1186 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1187 if (sign) { 1188 vpmovsxbw(dst, src, vector_len); 1189 } else { 1190 vpmovzxbw(dst, src, vector_len); 1191 } 1192 } 1193 1194 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1195 if (sign) { 1196 vpmovsxbd(dst, src, vector_len); 1197 } else { 1198 vpmovzxbd(dst, src, vector_len); 1199 } 1200 } 1201 1202 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1203 if (sign) { 1204 vpmovsxwd(dst, src, vector_len); 1205 } else { 1206 vpmovzxwd(dst, src, vector_len); 1207 } 1208 } 1209 1210 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1211 int shift, int vector_len) { 1212 if (opcode == Op_RotateLeftV) { 1213 if (etype == T_INT) { 1214 evprold(dst, src, shift, vector_len); 1215 } else { 1216 
assert(etype == T_LONG, "expected type T_LONG"); 1217 evprolq(dst, src, shift, vector_len); 1218 } 1219 } else { 1220 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1221 if (etype == T_INT) { 1222 evprord(dst, src, shift, vector_len); 1223 } else { 1224 assert(etype == T_LONG, "expected type T_LONG"); 1225 evprorq(dst, src, shift, vector_len); 1226 } 1227 } 1228 } 1229 1230 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1231 XMMRegister shift, int vector_len) { 1232 if (opcode == Op_RotateLeftV) { 1233 if (etype == T_INT) { 1234 evprolvd(dst, src, shift, vector_len); 1235 } else { 1236 assert(etype == T_LONG, "expected type T_LONG"); 1237 evprolvq(dst, src, shift, vector_len); 1238 } 1239 } else { 1240 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1241 if (etype == T_INT) { 1242 evprorvd(dst, src, shift, vector_len); 1243 } else { 1244 assert(etype == T_LONG, "expected type T_LONG"); 1245 evprorvq(dst, src, shift, vector_len); 1246 } 1247 } 1248 } 1249 1250 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1251 if (opcode == Op_RShiftVI) { 1252 psrad(dst, shift); 1253 } else if (opcode == Op_LShiftVI) { 1254 pslld(dst, shift); 1255 } else { 1256 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1257 psrld(dst, shift); 1258 } 1259 } 1260 1261 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1262 switch (opcode) { 1263 case Op_RShiftVI: psrad(dst, shift); break; 1264 case Op_LShiftVI: pslld(dst, shift); break; 1265 case Op_URShiftVI: psrld(dst, shift); break; 1266 1267 default: assert(false, "%s", NodeClassNames[opcode]); 1268 } 1269 } 1270 1271 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1272 if (opcode == Op_RShiftVI) { 1273 vpsrad(dst, nds, shift, vector_len); 1274 } else if (opcode == Op_LShiftVI) { 1275 vpslld(dst, nds, shift, vector_len); 1276 } else { 1277 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1278 vpsrld(dst, nds, shift, vector_len); 1279 } 1280 } 1281 1282 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1283 switch (opcode) { 1284 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1285 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1286 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1287 1288 default: assert(false, "%s", NodeClassNames[opcode]); 1289 } 1290 } 1291 1292 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1293 switch (opcode) { 1294 case Op_RShiftVB: // fall-through 1295 case Op_RShiftVS: psraw(dst, shift); break; 1296 1297 case Op_LShiftVB: // fall-through 1298 case Op_LShiftVS: psllw(dst, shift); break; 1299 1300 case Op_URShiftVS: // fall-through 1301 case Op_URShiftVB: psrlw(dst, shift); break; 1302 1303 default: assert(false, "%s", NodeClassNames[opcode]); 1304 } 1305 } 1306 1307 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1308 switch (opcode) { 1309 case Op_RShiftVB: // fall-through 1310 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1311 1312 case Op_LShiftVB: // fall-through 1313 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1314 1315 case Op_URShiftVS: // fall-through 1316 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1317 1318 default: assert(false, 
"%s", NodeClassNames[opcode]); 1319 } 1320 } 1321 1322 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1323 switch (opcode) { 1324 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1325 case Op_LShiftVL: psllq(dst, shift); break; 1326 case Op_URShiftVL: psrlq(dst, shift); break; 1327 1328 default: assert(false, "%s", NodeClassNames[opcode]); 1329 } 1330 } 1331 1332 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1333 if (opcode == Op_RShiftVL) { 1334 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1335 } else if (opcode == Op_LShiftVL) { 1336 psllq(dst, shift); 1337 } else { 1338 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1339 psrlq(dst, shift); 1340 } 1341 } 1342 1343 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1344 switch (opcode) { 1345 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1346 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1347 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1348 1349 default: assert(false, "%s", NodeClassNames[opcode]); 1350 } 1351 } 1352 1353 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1354 if (opcode == Op_RShiftVL) { 1355 evpsraq(dst, nds, shift, vector_len); 1356 } else if (opcode == Op_LShiftVL) { 1357 vpsllq(dst, nds, shift, vector_len); 1358 } else { 1359 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1360 vpsrlq(dst, nds, shift, vector_len); 1361 } 1362 } 1363 1364 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1365 switch (opcode) { 1366 case Op_RShiftVB: // fall-through 1367 case Op_RShiftVS: // fall-through 1368 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1369 1370 case Op_LShiftVB: // fall-through 1371 case Op_LShiftVS: // fall-through 1372 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1373 1374 case Op_URShiftVB: // fall-through 1375 case Op_URShiftVS: // fall-through 1376 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1377 1378 default: assert(false, "%s", NodeClassNames[opcode]); 1379 } 1380 } 1381 1382 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1383 switch (opcode) { 1384 case Op_RShiftVB: // fall-through 1385 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1386 1387 case Op_LShiftVB: // fall-through 1388 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1389 1390 case Op_URShiftVB: // fall-through 1391 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1392 1393 default: assert(false, "%s", NodeClassNames[opcode]); 1394 } 1395 } 1396 1397 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1398 assert(UseAVX >= 2, "required"); 1399 switch (opcode) { 1400 case Op_RShiftVL: { 1401 if (UseAVX > 2) { 1402 assert(tmp == xnoreg, "not used"); 1403 if (!VM_Version::supports_avx512vl()) { 1404 vlen_enc = Assembler::AVX_512bit; 1405 } 1406 evpsravq(dst, src, shift, vlen_enc); 1407 } else { 1408 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1409 vpsrlvq(dst, src, shift, vlen_enc); 1410 vpsrlvq(tmp, tmp, shift, vlen_enc); 1411 vpxor(dst, dst, tmp, vlen_enc); 1412 vpsubq(dst, dst, 
tmp, vlen_enc); 1413 } 1414 break; 1415 } 1416 case Op_LShiftVL: { 1417 assert(tmp == xnoreg, "not used"); 1418 vpsllvq(dst, src, shift, vlen_enc); 1419 break; 1420 } 1421 case Op_URShiftVL: { 1422 assert(tmp == xnoreg, "not used"); 1423 vpsrlvq(dst, src, shift, vlen_enc); 1424 break; 1425 } 1426 default: assert(false, "%s", NodeClassNames[opcode]); 1427 } 1428 } 1429 1430 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1431 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1432 assert(opcode == Op_LShiftVB || 1433 opcode == Op_RShiftVB || 1434 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1435 bool sign = (opcode != Op_URShiftVB); 1436 assert(vector_len == 0, "required"); 1437 vextendbd(sign, dst, src, 1); 1438 vpmovzxbd(vtmp, shift, 1); 1439 varshiftd(opcode, dst, dst, vtmp, 1); 1440 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1441 vextracti128_high(vtmp, dst); 1442 vpackusdw(dst, dst, vtmp, 0); 1443 } 1444 1445 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1446 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1447 assert(opcode == Op_LShiftVB || 1448 opcode == Op_RShiftVB || 1449 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1450 bool sign = (opcode != Op_URShiftVB); 1451 int ext_vector_len = vector_len + 1; 1452 vextendbw(sign, dst, src, ext_vector_len); 1453 vpmovzxbw(vtmp, shift, ext_vector_len); 1454 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1455 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1456 if (vector_len == 0) { 1457 vextracti128_high(vtmp, dst); 1458 vpackuswb(dst, dst, vtmp, vector_len); 1459 } else { 1460 vextracti64x4_high(vtmp, dst); 1461 vpackuswb(dst, dst, vtmp, vector_len); 1462 vpermq(dst, dst, 0xD8, vector_len); 1463 } 1464 } 1465 1466 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1467 switch(typ) { 1468 case T_BYTE: 1469 pinsrb(dst, val, idx); 1470 break; 1471 case T_SHORT: 1472 pinsrw(dst, val, idx); 1473 break; 1474 case T_INT: 1475 pinsrd(dst, val, idx); 1476 break; 1477 case T_LONG: 1478 pinsrq(dst, val, idx); 1479 break; 1480 default: 1481 assert(false,"Should not reach here."); 1482 break; 1483 } 1484 } 1485 1486 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1487 switch(typ) { 1488 case T_BYTE: 1489 vpinsrb(dst, src, val, idx); 1490 break; 1491 case T_SHORT: 1492 vpinsrw(dst, src, val, idx); 1493 break; 1494 case T_INT: 1495 vpinsrd(dst, src, val, idx); 1496 break; 1497 case T_LONG: 1498 vpinsrq(dst, src, val, idx); 1499 break; 1500 default: 1501 assert(false,"Should not reach here."); 1502 break; 1503 } 1504 } 1505 1506 #ifdef _LP64 1507 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1508 XMMRegister dst, Register base, 1509 Register idx_base, 1510 Register offset, Register mask, 1511 Register mask_idx, Register rtmp, 1512 int vlen_enc) { 1513 vpxor(dst, dst, dst, vlen_enc); 1514 if (elem_bt == T_SHORT) { 1515 for (int i = 0; i < 4; i++) { 1516 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1517 Label skip_load; 1518 btq(mask, mask_idx); 1519 jccb(Assembler::carryClear, skip_load); 1520 movl(rtmp, Address(idx_base, i * 4)); 1521 if (offset != noreg) { 1522 addl(rtmp, offset); 1523 } 1524 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1525 bind(skip_load); 1526 incq(mask_idx); 1527 } 1528 } else { 1529 assert(elem_bt == T_BYTE, ""); 1530 for (int i = 0; i < 8; i++) { 1531 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1532 Label skip_load; 1533 btq(mask, mask_idx); 1534 jccb(Assembler::carryClear, skip_load); 1535 movl(rtmp, Address(idx_base, i * 4)); 1536 if (offset != noreg) { 1537 addl(rtmp, offset); 1538 } 1539 pinsrb(dst, Address(base, rtmp), i); 1540 bind(skip_load); 1541 incq(mask_idx); 1542 } 1543 } 1544 } 1545 #endif // _LP64 1546 1547 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1548 Register base, Register idx_base, 1549 Register offset, Register rtmp, 1550 int vlen_enc) { 1551 vpxor(dst, dst, dst, vlen_enc); 1552 if (elem_bt == T_SHORT) { 1553 for (int i = 0; i < 4; i++) { 1554 // dst[i] = src[offset + idx_base[i]] 1555 movl(rtmp, Address(idx_base, i * 4)); 1556 if (offset != noreg) { 1557 addl(rtmp, offset); 1558 } 1559 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1560 } 1561 } else { 1562 assert(elem_bt == T_BYTE, ""); 1563 for (int i = 0; i < 8; i++) { 1564 // dst[i] = src[offset + idx_base[i]] 1565 movl(rtmp, Address(idx_base, i * 4)); 1566 if (offset != noreg) { 1567 addl(rtmp, offset); 1568 } 1569 pinsrb(dst, Address(base, rtmp), i); 1570 } 1571 } 1572 } 1573 1574 /* 1575 * Gather using hybrid algorithm, first partially unroll scalar loop 1576 * to accumulate values from gather indices into a quad-word(64bit) slice. 1577 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1578 * permutation to place the slice into appropriate vector lane 1579 * locations in destination vector. Following pseudo code describes the 1580 * algorithm in detail: 1581 * 1582 * DST_VEC = ZERO_VEC 1583 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1584 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1585 * FOREACH_ITER: 1586 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1587 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1588 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1589 * PERM_INDEX = PERM_INDEX - TWO_VEC 1590 * 1591 * With each iteration, doubleword permute indices (0,1) corresponding 1592 * to gathered quadword gets right shifted by two lane positions. 
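 *
 * For example, a 256-bit gather of 16 short elements takes four trips
 * through the loop: iteration 0 leaves the gathered 64-bit slice in
 * doubleword lanes {0,1} (PERM_INDEX is still the identity permutation),
 * iteration 1 places it in lanes {2,3}, then {4,5} and {6,7}, because the
 * two permute indices selecting the slice move right by two lanes every
 * time PERM_INDEX is decremented by TWO_VEC.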
1593 * 1594 */ 1595 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1596 Register base, Register idx_base, 1597 Register offset, Register mask, 1598 XMMRegister xtmp1, XMMRegister xtmp2, 1599 XMMRegister temp_dst, Register rtmp, 1600 Register mask_idx, Register length, 1601 int vector_len, int vlen_enc) { 1602 Label GATHER8_LOOP; 1603 assert(is_subword_type(elem_ty), ""); 1604 movl(length, vector_len); 1605 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1606 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1607 vallones(xtmp2, vlen_enc); 1608 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1609 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1610 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1611 1612 bind(GATHER8_LOOP); 1613 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1614 if (mask == noreg) { 1615 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1616 } else { 1617 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1618 } 1619 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1620 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1621 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1622 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1623 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1624 vpor(dst, dst, temp_dst, vlen_enc); 1625 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1626 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1627 jcc(Assembler::notEqual, GATHER8_LOOP); 1628 } 1629 1630 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1631 switch(typ) { 1632 case T_INT: 1633 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1634 break; 1635 case T_FLOAT: 1636 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1637 break; 1638 case T_LONG: 1639 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1640 break; 1641 case T_DOUBLE: 1642 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1643 break; 1644 default: 1645 assert(false,"Should not reach here."); 1646 break; 1647 } 1648 } 1649 1650 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1651 switch(typ) { 1652 case T_INT: 1653 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1654 break; 1655 case T_FLOAT: 1656 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1657 break; 1658 case T_LONG: 1659 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1660 break; 1661 case T_DOUBLE: 1662 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1663 break; 1664 default: 1665 assert(false,"Should not reach here."); 1666 break; 1667 } 1668 } 1669 1670 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1671 switch(typ) { 1672 case T_INT: 1673 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1674 break; 1675 case T_FLOAT: 1676 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1677 break; 1678 case T_LONG: 1679 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1680 break; 1681 case 
T_DOUBLE: 1682 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1683 break; 1684 default: 1685 assert(false,"Should not reach here."); 1686 break; 1687 } 1688 } 1689 1690 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1691 if (vlen_in_bytes <= 16) { 1692 pxor (dst, dst); 1693 psubb(dst, src); 1694 switch (elem_bt) { 1695 case T_BYTE: /* nothing to do */ break; 1696 case T_SHORT: pmovsxbw(dst, dst); break; 1697 case T_INT: pmovsxbd(dst, dst); break; 1698 case T_FLOAT: pmovsxbd(dst, dst); break; 1699 case T_LONG: pmovsxbq(dst, dst); break; 1700 case T_DOUBLE: pmovsxbq(dst, dst); break; 1701 1702 default: assert(false, "%s", type2name(elem_bt)); 1703 } 1704 } else { 1705 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1706 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1707 1708 vpxor (dst, dst, dst, vlen_enc); 1709 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1710 1711 switch (elem_bt) { 1712 case T_BYTE: /* nothing to do */ break; 1713 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1714 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1715 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1716 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1717 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1718 1719 default: assert(false, "%s", type2name(elem_bt)); 1720 } 1721 } 1722 } 1723 1724 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1725 if (novlbwdq) { 1726 vpmovsxbd(xtmp, src, vlen_enc); 1727 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1728 Assembler::eq, true, vlen_enc, noreg); 1729 } else { 1730 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1731 vpsubb(xtmp, xtmp, src, vlen_enc); 1732 evpmovb2m(dst, xtmp, vlen_enc); 1733 } 1734 } 1735 1736 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1737 switch (vlen_in_bytes) { 1738 case 4: movdl(dst, src); break; 1739 case 8: movq(dst, src); break; 1740 case 16: movdqu(dst, src); break; 1741 case 32: vmovdqu(dst, src); break; 1742 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1743 default: ShouldNotReachHere(); 1744 } 1745 } 1746 1747 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1748 assert(rscratch != noreg || always_reachable(src), "missing"); 1749 1750 if (reachable(src)) { 1751 load_vector(dst, as_Address(src), vlen_in_bytes); 1752 } else { 1753 lea(rscratch, src); 1754 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1755 } 1756 } 1757 1758 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1759 int vlen_enc = vector_length_encoding(vlen); 1760 if (VM_Version::supports_avx()) { 1761 if (bt == T_LONG) { 1762 if (VM_Version::supports_avx2()) { 1763 vpbroadcastq(dst, src, vlen_enc); 1764 } else { 1765 vmovddup(dst, src, vlen_enc); 1766 } 1767 } else if (bt == T_DOUBLE) { 1768 if (vlen_enc != Assembler::AVX_128bit) { 1769 vbroadcastsd(dst, src, vlen_enc, noreg); 1770 } else { 1771 vmovddup(dst, src, vlen_enc); 1772 } 1773 } else { 1774 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1775 vpbroadcastd(dst, src, vlen_enc); 1776 } else { 1777 vbroadcastss(dst, src, vlen_enc); 1778 } 1779 } 1780 } else if (VM_Version::supports_sse3()) { 1781 movddup(dst, src); 1782 } else { 1783 movq(dst, 
src); 1784 if (vlen == 16) { 1785 punpcklqdq(dst, dst); 1786 } 1787 } 1788 } 1789 1790 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1791 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1792 int offset = exact_log2(type2aelembytes(bt)) << 6; 1793 if (is_floating_point_type(bt)) { 1794 offset += 128; 1795 } 1796 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1797 load_vector(dst, addr, vlen_in_bytes); 1798 } 1799 1800 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1801 1802 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1803 int vector_len = Assembler::AVX_128bit; 1804 1805 switch (opcode) { 1806 case Op_AndReductionV: pand(dst, src); break; 1807 case Op_OrReductionV: por (dst, src); break; 1808 case Op_XorReductionV: pxor(dst, src); break; 1809 case Op_MinReductionV: 1810 switch (typ) { 1811 case T_BYTE: pminsb(dst, src); break; 1812 case T_SHORT: pminsw(dst, src); break; 1813 case T_INT: pminsd(dst, src); break; 1814 case T_LONG: assert(UseAVX > 2, "required"); 1815 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1816 default: assert(false, "wrong type"); 1817 } 1818 break; 1819 case Op_MaxReductionV: 1820 switch (typ) { 1821 case T_BYTE: pmaxsb(dst, src); break; 1822 case T_SHORT: pmaxsw(dst, src); break; 1823 case T_INT: pmaxsd(dst, src); break; 1824 case T_LONG: assert(UseAVX > 2, "required"); 1825 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1826 default: assert(false, "wrong type"); 1827 } 1828 break; 1829 case Op_AddReductionVF: addss(dst, src); break; 1830 case Op_AddReductionVD: addsd(dst, src); break; 1831 case Op_AddReductionVI: 1832 switch (typ) { 1833 case T_BYTE: paddb(dst, src); break; 1834 case T_SHORT: paddw(dst, src); break; 1835 case T_INT: paddd(dst, src); break; 1836 default: assert(false, "wrong type"); 1837 } 1838 break; 1839 case Op_AddReductionVL: paddq(dst, src); break; 1840 case Op_MulReductionVF: mulss(dst, src); break; 1841 case Op_MulReductionVD: mulsd(dst, src); break; 1842 case Op_MulReductionVI: 1843 switch (typ) { 1844 case T_SHORT: pmullw(dst, src); break; 1845 case T_INT: pmulld(dst, src); break; 1846 default: assert(false, "wrong type"); 1847 } 1848 break; 1849 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1850 evpmullq(dst, dst, src, vector_len); break; 1851 default: assert(false, "wrong opcode"); 1852 } 1853 } 1854 1855 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1856 switch (opcode) { 1857 case Op_AddReductionVF: addps(dst, src); break; 1858 case Op_AddReductionVD: addpd(dst, src); break; 1859 case Op_MulReductionVF: mulps(dst, src); break; 1860 case Op_MulReductionVD: mulpd(dst, src); break; 1861 default: assert(false, "%s", NodeClassNames[opcode]); 1862 } 1863 } 1864 1865 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1866 int vector_len = Assembler::AVX_256bit; 1867 1868 switch (opcode) { 1869 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1870 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1871 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1872 case Op_MinReductionV: 1873 switch (typ) { 1874 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1875 case T_SHORT: vpminsw(dst, src1, src2, vector_len); 
break; 1876 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1877 case T_LONG: assert(UseAVX > 2, "required"); 1878 vpminsq(dst, src1, src2, vector_len); break; 1879 default: assert(false, "wrong type"); 1880 } 1881 break; 1882 case Op_MaxReductionV: 1883 switch (typ) { 1884 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1885 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1886 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1887 case T_LONG: assert(UseAVX > 2, "required"); 1888 vpmaxsq(dst, src1, src2, vector_len); break; 1889 default: assert(false, "wrong type"); 1890 } 1891 break; 1892 case Op_AddReductionVI: 1893 switch (typ) { 1894 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1895 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1896 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1897 default: assert(false, "wrong type"); 1898 } 1899 break; 1900 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1901 case Op_MulReductionVI: 1902 switch (typ) { 1903 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1904 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1905 default: assert(false, "wrong type"); 1906 } 1907 break; 1908 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1909 default: assert(false, "wrong opcode"); 1910 } 1911 } 1912 1913 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1914 int vector_len = Assembler::AVX_256bit; 1915 1916 switch (opcode) { 1917 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1918 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1919 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1920 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1921 default: assert(false, "%s", NodeClassNames[opcode]); 1922 } 1923 } 1924 1925 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1926 XMMRegister dst, XMMRegister src, 1927 XMMRegister vtmp1, XMMRegister vtmp2) { 1928 switch (opcode) { 1929 case Op_AddReductionVF: 1930 case Op_MulReductionVF: 1931 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1932 break; 1933 1934 case Op_AddReductionVD: 1935 case Op_MulReductionVD: 1936 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1937 break; 1938 1939 default: assert(false, "wrong opcode"); 1940 } 1941 } 1942 1943 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1944 XMMRegister dst, XMMRegister src, 1945 XMMRegister vtmp1, XMMRegister vtmp2) { 1946 switch (opcode) { 1947 case Op_AddReductionVF: 1948 case Op_MulReductionVF: 1949 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1950 break; 1951 1952 case Op_AddReductionVD: 1953 case Op_MulReductionVD: 1954 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1955 break; 1956 1957 default: assert(false, "%s", NodeClassNames[opcode]); 1958 } 1959 } 1960 1961 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1962 Register dst, Register src1, XMMRegister src2, 1963 XMMRegister vtmp1, XMMRegister vtmp2) { 1964 switch (vlen) { 1965 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1966 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1967 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1968 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1969 1970 default: assert(false, "wrong vector length"); 1971 } 1972 } 1973 1974 void C2_MacroAssembler::mulreduceB(int 
opcode, int vlen, 1975 Register dst, Register src1, XMMRegister src2, 1976 XMMRegister vtmp1, XMMRegister vtmp2) { 1977 switch (vlen) { 1978 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1979 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1980 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1981 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1982 1983 default: assert(false, "wrong vector length"); 1984 } 1985 } 1986 1987 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1988 Register dst, Register src1, XMMRegister src2, 1989 XMMRegister vtmp1, XMMRegister vtmp2) { 1990 switch (vlen) { 1991 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1992 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1993 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1994 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1995 1996 default: assert(false, "wrong vector length"); 1997 } 1998 } 1999 2000 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2001 Register dst, Register src1, XMMRegister src2, 2002 XMMRegister vtmp1, XMMRegister vtmp2) { 2003 switch (vlen) { 2004 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2005 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2006 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2007 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2008 2009 default: assert(false, "wrong vector length"); 2010 } 2011 } 2012 2013 #ifdef _LP64 2014 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2015 Register dst, Register src1, XMMRegister src2, 2016 XMMRegister vtmp1, XMMRegister vtmp2) { 2017 switch (vlen) { 2018 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2019 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2020 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2021 2022 default: assert(false, "wrong vector length"); 2023 } 2024 } 2025 #endif // _LP64 2026 2027 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2028 switch (vlen) { 2029 case 2: 2030 assert(vtmp2 == xnoreg, ""); 2031 reduce2F(opcode, dst, src, vtmp1); 2032 break; 2033 case 4: 2034 assert(vtmp2 == xnoreg, ""); 2035 reduce4F(opcode, dst, src, vtmp1); 2036 break; 2037 case 8: 2038 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2039 break; 2040 case 16: 2041 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2042 break; 2043 default: assert(false, "wrong vector length"); 2044 } 2045 } 2046 2047 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2048 switch (vlen) { 2049 case 2: 2050 assert(vtmp2 == xnoreg, ""); 2051 reduce2D(opcode, dst, src, vtmp1); 2052 break; 2053 case 4: 2054 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2055 break; 2056 case 8: 2057 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2058 break; 2059 default: assert(false, "wrong vector length"); 2060 } 2061 } 2062 2063 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2064 switch (vlen) { 2065 case 2: 2066 assert(vtmp1 == xnoreg, ""); 2067 assert(vtmp2 == xnoreg, ""); 2068 unorderedReduce2F(opcode, dst, src); 2069 break; 2070 case 4: 2071 assert(vtmp2 == xnoreg, ""); 2072 unorderedReduce4F(opcode, dst, src, vtmp1); 2073 break; 2074 case 8: 2075 
unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2076 break; 2077 case 16: 2078 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2079 break; 2080 default: assert(false, "wrong vector length"); 2081 } 2082 } 2083 2084 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2085 switch (vlen) { 2086 case 2: 2087 assert(vtmp1 == xnoreg, ""); 2088 assert(vtmp2 == xnoreg, ""); 2089 unorderedReduce2D(opcode, dst, src); 2090 break; 2091 case 4: 2092 assert(vtmp2 == xnoreg, ""); 2093 unorderedReduce4D(opcode, dst, src, vtmp1); 2094 break; 2095 case 8: 2096 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2097 break; 2098 default: assert(false, "wrong vector length"); 2099 } 2100 } 2101 2102 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2103 if (opcode == Op_AddReductionVI) { 2104 if (vtmp1 != src2) { 2105 movdqu(vtmp1, src2); 2106 } 2107 phaddd(vtmp1, vtmp1); 2108 } else { 2109 pshufd(vtmp1, src2, 0x1); 2110 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2111 } 2112 movdl(vtmp2, src1); 2113 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2114 movdl(dst, vtmp1); 2115 } 2116 2117 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2118 if (opcode == Op_AddReductionVI) { 2119 if (vtmp1 != src2) { 2120 movdqu(vtmp1, src2); 2121 } 2122 phaddd(vtmp1, src2); 2123 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2124 } else { 2125 pshufd(vtmp2, src2, 0xE); 2126 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2127 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2128 } 2129 } 2130 2131 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 if (opcode == Op_AddReductionVI) { 2133 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2134 vextracti128_high(vtmp2, vtmp1); 2135 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2136 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2137 } else { 2138 vextracti128_high(vtmp1, src2); 2139 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2140 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2141 } 2142 } 2143 2144 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2145 vextracti64x4_high(vtmp2, src2); 2146 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2147 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2148 } 2149 2150 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2151 pshufd(vtmp2, src2, 0x1); 2152 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2153 movdqu(vtmp1, vtmp2); 2154 psrldq(vtmp1, 2); 2155 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2156 movdqu(vtmp2, vtmp1); 2157 psrldq(vtmp2, 1); 2158 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2159 movdl(vtmp2, src1); 2160 pmovsxbd(vtmp1, vtmp1); 2161 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2162 pextrb(dst, vtmp1, 0x0); 2163 movsbl(dst, dst); 2164 } 2165 2166 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2167 pshufd(vtmp1, src2, 0xE); 2168 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2169 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2170 } 2171 2172 void 
C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2173 vextracti128_high(vtmp2, src2); 2174 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2175 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2176 } 2177 2178 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2179 vextracti64x4_high(vtmp1, src2); 2180 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2181 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2182 } 2183 2184 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2185 pmovsxbw(vtmp2, src2); 2186 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2187 } 2188 2189 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2190 if (UseAVX > 1) { 2191 int vector_len = Assembler::AVX_256bit; 2192 vpmovsxbw(vtmp1, src2, vector_len); 2193 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2194 } else { 2195 pmovsxbw(vtmp2, src2); 2196 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2197 pshufd(vtmp2, src2, 0x1); 2198 pmovsxbw(vtmp2, src2); 2199 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2200 } 2201 } 2202 2203 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2204 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2205 int vector_len = Assembler::AVX_512bit; 2206 vpmovsxbw(vtmp1, src2, vector_len); 2207 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2208 } else { 2209 assert(UseAVX >= 2,"Should not reach here."); 2210 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2211 vextracti128_high(vtmp2, src2); 2212 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2213 } 2214 } 2215 2216 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2217 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2218 vextracti64x4_high(vtmp2, src2); 2219 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2220 } 2221 2222 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2223 if (opcode == Op_AddReductionVI) { 2224 if (vtmp1 != src2) { 2225 movdqu(vtmp1, src2); 2226 } 2227 phaddw(vtmp1, vtmp1); 2228 phaddw(vtmp1, vtmp1); 2229 } else { 2230 pshufd(vtmp2, src2, 0x1); 2231 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2232 movdqu(vtmp1, vtmp2); 2233 psrldq(vtmp1, 2); 2234 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2235 } 2236 movdl(vtmp2, src1); 2237 pmovsxwd(vtmp1, vtmp1); 2238 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2239 pextrw(dst, vtmp1, 0x0); 2240 movswl(dst, dst); 2241 } 2242 2243 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 if (opcode == Op_AddReductionVI) { 2245 if (vtmp1 != src2) { 2246 movdqu(vtmp1, src2); 2247 } 2248 phaddw(vtmp1, src2); 2249 } else { 2250 pshufd(vtmp1, src2, 0xE); 2251 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2252 } 2253 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2254 } 2255 2256 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2257 if (opcode == Op_AddReductionVI) { 2258 
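    // For the add reduction, vphaddw folds the 16 shorts into pairwise sums
    // (four per 128-bit lane, duplicated within each lane) and vpermq with
    // 0xD8 then moves the two distinct quadwords of sums into the lower
    // 128 bits, so reduce8S below can finish the reduction.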
int vector_len = Assembler::AVX_256bit; 2259 vphaddw(vtmp2, src2, src2, vector_len); 2260 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2261 } else { 2262 vextracti128_high(vtmp2, src2); 2263 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2264 } 2265 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2266 } 2267 2268 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2269 int vector_len = Assembler::AVX_256bit; 2270 vextracti64x4_high(vtmp1, src2); 2271 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2272 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2273 } 2274 2275 #ifdef _LP64 2276 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2277 pshufd(vtmp2, src2, 0xE); 2278 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2279 movdq(vtmp1, src1); 2280 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2281 movdq(dst, vtmp1); 2282 } 2283 2284 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2285 vextracti128_high(vtmp1, src2); 2286 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2287 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2288 } 2289 2290 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2291 vextracti64x4_high(vtmp2, src2); 2292 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2293 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2294 } 2295 2296 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2297 mov64(temp, -1L); 2298 bzhiq(temp, temp, len); 2299 kmovql(dst, temp); 2300 } 2301 #endif // _LP64 2302 2303 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2304 reduce_operation_128(T_FLOAT, opcode, dst, src); 2305 pshufd(vtmp, src, 0x1); 2306 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2307 } 2308 2309 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2310 reduce2F(opcode, dst, src, vtmp); 2311 pshufd(vtmp, src, 0x2); 2312 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2313 pshufd(vtmp, src, 0x3); 2314 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2315 } 2316 2317 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2318 reduce4F(opcode, dst, src, vtmp2); 2319 vextractf128_high(vtmp2, src); 2320 reduce4F(opcode, dst, vtmp2, vtmp1); 2321 } 2322 2323 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2324 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2325 vextracti64x4_high(vtmp1, src); 2326 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2327 } 2328 2329 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2330 pshufd(dst, src, 0x1); 2331 reduce_operation_128(T_FLOAT, opcode, dst, src); 2332 } 2333 2334 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2335 pshufd(vtmp, src, 0xE); 2336 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2337 unorderedReduce2F(opcode, dst, vtmp); 2338 } 2339 2340 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2341 vextractf128_high(vtmp1, src); 2342 
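  // vtmp1 now holds the upper four floats of src; fold them into the lower
  // four element-wise, then finish with the 128-bit (4-element) reduction.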
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2343 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2344 } 2345 2346 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2347 vextractf64x4_high(vtmp2, src); 2348 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2349 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2350 } 2351 2352 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2353 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2354 pshufd(vtmp, src, 0xE); 2355 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2356 } 2357 2358 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2359 reduce2D(opcode, dst, src, vtmp2); 2360 vextractf128_high(vtmp2, src); 2361 reduce2D(opcode, dst, vtmp2, vtmp1); 2362 } 2363 2364 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2365 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2366 vextracti64x4_high(vtmp1, src); 2367 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2368 } 2369 2370 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2371 pshufd(dst, src, 0xE); 2372 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2373 } 2374 2375 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2376 vextractf128_high(vtmp, src); 2377 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2378 unorderedReduce2D(opcode, dst, vtmp); 2379 } 2380 2381 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2382 vextractf64x4_high(vtmp2, src); 2383 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2384 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2385 } 2386 2387 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2388 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2389 } 2390 2391 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2392 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2393 } 2394 2395 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2396 int vec_enc) { 2397 switch(elem_bt) { 2398 case T_INT: 2399 case T_FLOAT: 2400 vmaskmovps(dst, src, mask, vec_enc); 2401 break; 2402 case T_LONG: 2403 case T_DOUBLE: 2404 vmaskmovpd(dst, src, mask, vec_enc); 2405 break; 2406 default: 2407 fatal("Unsupported type %s", type2name(elem_bt)); 2408 break; 2409 } 2410 } 2411 2412 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2413 int vec_enc) { 2414 switch(elem_bt) { 2415 case T_INT: 2416 case T_FLOAT: 2417 vmaskmovps(dst, src, mask, vec_enc); 2418 break; 2419 case T_LONG: 2420 case T_DOUBLE: 2421 vmaskmovpd(dst, src, mask, vec_enc); 2422 break; 2423 default: 2424 fatal("Unsupported type %s", type2name(elem_bt)); 2425 break; 2426 } 2427 } 2428 2429 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2430 XMMRegister dst, XMMRegister src, 2431 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2432 XMMRegister xmm_0, XMMRegister xmm_1) { 2433 const int permconst[] = {1, 14}; 2434 
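  // Shuffle immediates for the last two in-lane steps: 14 (0x0E) brings
  // elements {2,3} down to positions {0,1}, and 1 brings element 1 down to
  // position 0, halving the number of live elements at each step.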
XMMRegister wsrc = src; 2435 XMMRegister wdst = xmm_0; 2436 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2437 2438 int vlen_enc = Assembler::AVX_128bit; 2439 if (vlen == 16) { 2440 vlen_enc = Assembler::AVX_256bit; 2441 } 2442 2443 for (int i = log2(vlen) - 1; i >=0; i--) { 2444 if (i == 0 && !is_dst_valid) { 2445 wdst = dst; 2446 } 2447 if (i == 3) { 2448 vextracti64x4_high(wtmp, wsrc); 2449 } else if (i == 2) { 2450 vextracti128_high(wtmp, wsrc); 2451 } else { // i = [0,1] 2452 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2453 } 2454 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2455 wsrc = wdst; 2456 vlen_enc = Assembler::AVX_128bit; 2457 } 2458 if (is_dst_valid) { 2459 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2460 } 2461 } 2462 2463 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2464 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2465 XMMRegister xmm_0, XMMRegister xmm_1) { 2466 XMMRegister wsrc = src; 2467 XMMRegister wdst = xmm_0; 2468 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2469 int vlen_enc = Assembler::AVX_128bit; 2470 if (vlen == 8) { 2471 vlen_enc = Assembler::AVX_256bit; 2472 } 2473 for (int i = log2(vlen) - 1; i >=0; i--) { 2474 if (i == 0 && !is_dst_valid) { 2475 wdst = dst; 2476 } 2477 if (i == 1) { 2478 vextracti128_high(wtmp, wsrc); 2479 } else if (i == 2) { 2480 vextracti64x4_high(wtmp, wsrc); 2481 } else { 2482 assert(i == 0, "%d", i); 2483 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2484 } 2485 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2486 wsrc = wdst; 2487 vlen_enc = Assembler::AVX_128bit; 2488 } 2489 if (is_dst_valid) { 2490 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2491 } 2492 } 2493 2494 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2495 switch (bt) { 2496 case T_BYTE: pextrb(dst, src, idx); break; 2497 case T_SHORT: pextrw(dst, src, idx); break; 2498 case T_INT: pextrd(dst, src, idx); break; 2499 case T_LONG: pextrq(dst, src, idx); break; 2500 2501 default: 2502 assert(false,"Should not reach here."); 2503 break; 2504 } 2505 } 2506 2507 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2508 int esize = type2aelembytes(typ); 2509 int elem_per_lane = 16/esize; 2510 int lane = elemindex / elem_per_lane; 2511 int eindex = elemindex % elem_per_lane; 2512 2513 if (lane >= 2) { 2514 assert(UseAVX > 2, "required"); 2515 vextractf32x4(dst, src, lane & 3); 2516 return dst; 2517 } else if (lane > 0) { 2518 assert(UseAVX > 0, "required"); 2519 vextractf128(dst, src, lane); 2520 return dst; 2521 } else { 2522 return src; 2523 } 2524 } 2525 2526 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2527 if (typ == T_BYTE) { 2528 movsbl(dst, dst); 2529 } else if (typ == T_SHORT) { 2530 movswl(dst, dst); 2531 } 2532 } 2533 2534 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2535 int esize = type2aelembytes(typ); 2536 int elem_per_lane = 16/esize; 2537 int eindex = elemindex % elem_per_lane; 2538 assert(is_integral_type(typ),"required"); 2539 2540 if (eindex == 0) { 2541 if (typ == T_LONG) { 2542 movq(dst, src); 2543 } else { 2544 movdl(dst, src); 2545 movsxl(typ, dst); 2546 } 2547 } else { 2548 extract(typ, dst, src, eindex); 2549 movsxl(typ, dst); 2550 } 2551 } 2552 2553 void 
C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2554 int esize = type2aelembytes(typ); 2555 int elem_per_lane = 16/esize; 2556 int eindex = elemindex % elem_per_lane; 2557 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2558 2559 if (eindex == 0) { 2560 movq(dst, src); 2561 } else { 2562 if (typ == T_FLOAT) { 2563 if (UseAVX == 0) { 2564 movdqu(dst, src); 2565 shufps(dst, dst, eindex); 2566 } else { 2567 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2568 } 2569 } else { 2570 if (UseAVX == 0) { 2571 movdqu(dst, src); 2572 psrldq(dst, eindex*esize); 2573 } else { 2574 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2575 } 2576 movq(dst, dst); 2577 } 2578 } 2579 // Zero upper bits 2580 if (typ == T_FLOAT) { 2581 if (UseAVX == 0) { 2582 assert(vtmp != xnoreg, "required."); 2583 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2584 pand(dst, vtmp); 2585 } else { 2586 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2587 } 2588 } 2589 } 2590 2591 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2592 switch(typ) { 2593 case T_BYTE: 2594 case T_BOOLEAN: 2595 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2596 break; 2597 case T_SHORT: 2598 case T_CHAR: 2599 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2600 break; 2601 case T_INT: 2602 case T_FLOAT: 2603 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2604 break; 2605 case T_LONG: 2606 case T_DOUBLE: 2607 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2608 break; 2609 default: 2610 assert(false,"Should not reach here."); 2611 break; 2612 } 2613 } 2614 2615 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2616 assert(rscratch != noreg || always_reachable(src2), "missing"); 2617 2618 switch(typ) { 2619 case T_BOOLEAN: 2620 case T_BYTE: 2621 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2622 break; 2623 case T_CHAR: 2624 case T_SHORT: 2625 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2626 break; 2627 case T_INT: 2628 case T_FLOAT: 2629 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2630 break; 2631 case T_LONG: 2632 case T_DOUBLE: 2633 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2634 break; 2635 default: 2636 assert(false,"Should not reach here."); 2637 break; 2638 } 2639 } 2640 2641 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2642 switch(typ) { 2643 case T_BYTE: 2644 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2645 break; 2646 case T_SHORT: 2647 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2648 break; 2649 case T_INT: 2650 case T_FLOAT: 2651 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2652 break; 2653 case T_LONG: 2654 case T_DOUBLE: 2655 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2656 break; 2657 default: 2658 assert(false,"Should not reach here."); 2659 break; 2660 } 2661 } 2662 2663 void 
C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2664 assert(vlen_in_bytes <= 32, ""); 2665 int esize = type2aelembytes(bt); 2666 if (vlen_in_bytes == 32) { 2667 assert(vtmp == xnoreg, "required."); 2668 if (esize >= 4) { 2669 vtestps(src1, src2, AVX_256bit); 2670 } else { 2671 vptest(src1, src2, AVX_256bit); 2672 } 2673 return; 2674 } 2675 if (vlen_in_bytes < 16) { 2676 // Duplicate the lower part to fill the whole register, 2677 // Don't need to do so for src2 2678 assert(vtmp != xnoreg, "required"); 2679 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2680 pshufd(vtmp, src1, shuffle_imm); 2681 } else { 2682 assert(vtmp == xnoreg, "required"); 2683 vtmp = src1; 2684 } 2685 if (esize >= 4 && VM_Version::supports_avx()) { 2686 vtestps(vtmp, src2, AVX_128bit); 2687 } else { 2688 ptest(vtmp, src2); 2689 } 2690 } 2691 2692 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2693 assert(UseAVX >= 2, "required"); 2694 #ifdef ASSERT 2695 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2696 bool is_bw_supported = VM_Version::supports_avx512bw(); 2697 if (is_bw && !is_bw_supported) { 2698 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2699 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2700 "XMM register should be 0-15"); 2701 } 2702 #endif // ASSERT 2703 switch (elem_bt) { 2704 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2705 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2706 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2707 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2708 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2709 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2710 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2711 } 2712 } 2713 2714 #ifdef _LP64 2715 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2716 assert(UseAVX >= 2, "required"); 2717 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2718 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2719 if ((UseAVX > 2) && 2720 (!is_bw || VM_Version::supports_avx512bw()) && 2721 (!is_vl || VM_Version::supports_avx512vl())) { 2722 switch (elem_bt) { 2723 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2724 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2725 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2726 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2727 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2728 } 2729 } else { 2730 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2731 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2732 switch (elem_bt) { 2733 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2734 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2735 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2736 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2737 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2738 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2739 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2740 } 2741 } 2742 } 2743 #endif 2744 2745 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, 
XMMRegister src, int vlen_enc) { 2746 switch (to_elem_bt) { 2747 case T_SHORT: 2748 vpmovsxbw(dst, src, vlen_enc); 2749 break; 2750 case T_INT: 2751 vpmovsxbd(dst, src, vlen_enc); 2752 break; 2753 case T_FLOAT: 2754 vpmovsxbd(dst, src, vlen_enc); 2755 vcvtdq2ps(dst, dst, vlen_enc); 2756 break; 2757 case T_LONG: 2758 vpmovsxbq(dst, src, vlen_enc); 2759 break; 2760 case T_DOUBLE: { 2761 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2762 vpmovsxbd(dst, src, mid_vlen_enc); 2763 vcvtdq2pd(dst, dst, vlen_enc); 2764 break; 2765 } 2766 default: 2767 fatal("Unsupported type %s", type2name(to_elem_bt)); 2768 break; 2769 } 2770 } 2771 2772 //------------------------------------------------------------------------------------------- 2773 2774 // IndexOf for constant substrings with size >= 8 chars 2775 // which don't need to be loaded through stack. 2776 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2777 Register cnt1, Register cnt2, 2778 int int_cnt2, Register result, 2779 XMMRegister vec, Register tmp, 2780 int ae) { 2781 ShortBranchVerifier sbv(this); 2782 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2783 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2784 2785 // This method uses the pcmpestri instruction with bound registers 2786 // inputs: 2787 // xmm - substring 2788 // rax - substring length (elements count) 2789 // mem - scanned string 2790 // rdx - string length (elements count) 2791 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2792 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2793 // outputs: 2794 // rcx - matched index in string 2795 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2796 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2797 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2798 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2799 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2800 2801 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2802 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2803 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2804 2805 // Note, inline_string_indexOf() generates checks: 2806 // if (substr.count > string.count) return -1; 2807 // if (substr.count == 0) return 0; 2808 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2809 2810 // Load substring. 2811 if (ae == StrIntrinsicNode::UL) { 2812 pmovzxbw(vec, Address(str2, 0)); 2813 } else { 2814 movdqu(vec, Address(str2, 0)); 2815 } 2816 movl(cnt2, int_cnt2); 2817 movptr(result, str1); // string addr 2818 2819 if (int_cnt2 > stride) { 2820 jmpb(SCAN_TO_SUBSTR); 2821 2822 // Reload substr for rescan, this code 2823 // is executed only for large substrings (> 8 chars) 2824 bind(RELOAD_SUBSTR); 2825 if (ae == StrIntrinsicNode::UL) { 2826 pmovzxbw(vec, Address(str2, 0)); 2827 } else { 2828 movdqu(vec, Address(str2, 0)); 2829 } 2830 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2831 2832 bind(RELOAD_STR); 2833 // We came here after the beginning of the substring was 2834 // matched but the rest of it was not so we need to search 2835 // again. Start from the next element after the previous match. 2836 2837 // cnt2 is number of substring reminding elements and 2838 // cnt1 is number of string reminding elements when cmp failed. 
2839 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2840 subl(cnt1, cnt2); 2841 addl(cnt1, int_cnt2); 2842 movl(cnt2, int_cnt2); // Now restore cnt2 2843 2844 decrementl(cnt1); // Shift to next element 2845 cmpl(cnt1, cnt2); 2846 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2847 2848 addptr(result, (1<<scale1)); 2849 2850 } // (int_cnt2 > 8) 2851 2852 // Scan string for start of substr in 16-byte vectors 2853 bind(SCAN_TO_SUBSTR); 2854 pcmpestri(vec, Address(result, 0), mode); 2855 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2856 subl(cnt1, stride); 2857 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2858 cmpl(cnt1, cnt2); 2859 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2860 addptr(result, 16); 2861 jmpb(SCAN_TO_SUBSTR); 2862 2863 // Found a potential substr 2864 bind(FOUND_CANDIDATE); 2865 // Matched whole vector if first element matched (tmp(rcx) == 0). 2866 if (int_cnt2 == stride) { 2867 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2868 } else { // int_cnt2 > 8 2869 jccb(Assembler::overflow, FOUND_SUBSTR); 2870 } 2871 // After pcmpestri tmp(rcx) contains matched element index 2872 // Compute start addr of substr 2873 lea(result, Address(result, tmp, scale1)); 2874 2875 // Make sure string is still long enough 2876 subl(cnt1, tmp); 2877 cmpl(cnt1, cnt2); 2878 if (int_cnt2 == stride) { 2879 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2880 } else { // int_cnt2 > 8 2881 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2882 } 2883 // Left less than substring. 2884 2885 bind(RET_NOT_FOUND); 2886 movl(result, -1); 2887 jmp(EXIT); 2888 2889 if (int_cnt2 > stride) { 2890 // This code is optimized for the case when the whole substring 2891 // is matched if its head is matched. 2892 bind(MATCH_SUBSTR_HEAD); 2893 pcmpestri(vec, Address(result, 0), mode); 2894 // Reload only the string if it does not match 2895 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2896 2897 Label CONT_SCAN_SUBSTR; 2898 // Compare the rest of substring (> 8 chars). 2899 bind(FOUND_SUBSTR); 2900 // First 8 chars are already matched. 
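    // From here on cnt2 is a negative offset from the end of the substring:
    // the chunk being compared starts at element int_cnt2 + cnt2, and the
    // loop below adds 'stride' to cnt2 until it becomes non-negative.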
2901 negptr(cnt2); 2902 addptr(cnt2, stride); 2903 2904 bind(SCAN_SUBSTR); 2905 subl(cnt1, stride); 2906 cmpl(cnt2, -stride); // Do not read beyond substring 2907 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2908 // Back-up strings to avoid reading beyond substring: 2909 // cnt1 = cnt1 - cnt2 + 8 2910 addl(cnt1, cnt2); // cnt2 is negative 2911 addl(cnt1, stride); 2912 movl(cnt2, stride); negptr(cnt2); 2913 bind(CONT_SCAN_SUBSTR); 2914 if (int_cnt2 < (int)G) { 2915 int tail_off1 = int_cnt2<<scale1; 2916 int tail_off2 = int_cnt2<<scale2; 2917 if (ae == StrIntrinsicNode::UL) { 2918 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2919 } else { 2920 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2921 } 2922 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2923 } else { 2924 // calculate index in register to avoid integer overflow (int_cnt2*2) 2925 movl(tmp, int_cnt2); 2926 addptr(tmp, cnt2); 2927 if (ae == StrIntrinsicNode::UL) { 2928 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2929 } else { 2930 movdqu(vec, Address(str2, tmp, scale2, 0)); 2931 } 2932 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2933 } 2934 // Need to reload strings pointers if not matched whole vector 2935 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2936 addptr(cnt2, stride); 2937 jcc(Assembler::negative, SCAN_SUBSTR); 2938 // Fall through if found full substring 2939 2940 } // (int_cnt2 > 8) 2941 2942 bind(RET_FOUND); 2943 // Found result if we matched full small substring. 2944 // Compute substr offset 2945 subptr(result, str1); 2946 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2947 shrl(result, 1); // index 2948 } 2949 bind(EXIT); 2950 2951 } // string_indexofC8 2952 2953 // Small strings are loaded through stack if they cross page boundary. 2954 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2955 Register cnt1, Register cnt2, 2956 int int_cnt2, Register result, 2957 XMMRegister vec, Register tmp, 2958 int ae) { 2959 ShortBranchVerifier sbv(this); 2960 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2961 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2962 2963 // 2964 // int_cnt2 is length of small (< 8 chars) constant substring 2965 // or (-1) for non constant substring in which case its length 2966 // is in cnt2 register. 2967 // 2968 // Note, inline_string_indexOf() generates checks: 2969 // if (substr.count > string.count) return -1; 2970 // if (substr.count == 0) return 0; 2971 // 2972 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2973 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2974 // This method uses the pcmpestri instruction with bound registers 2975 // inputs: 2976 // xmm - substring 2977 // rax - substring length (elements count) 2978 // mem - scanned string 2979 // rdx - string length (elements count) 2980 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2981 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2982 // outputs: 2983 // rcx - matched index in string 2984 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2985 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2986 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2987 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2988 2989 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2990 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2991 FOUND_CANDIDATE; 2992 2993 { //======================================================== 2994 // We don't know where these strings are located 2995 // and we can't read beyond them. Load them through stack. 2996 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2997 2998 movptr(tmp, rsp); // save old SP 2999 3000 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3001 if (int_cnt2 == (1>>scale2)) { // One byte 3002 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3003 load_unsigned_byte(result, Address(str2, 0)); 3004 movdl(vec, result); // move 32 bits 3005 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3006 // Not enough header space in 32-bit VM: 12+3 = 15. 3007 movl(result, Address(str2, -1)); 3008 shrl(result, 8); 3009 movdl(vec, result); // move 32 bits 3010 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3011 load_unsigned_short(result, Address(str2, 0)); 3012 movdl(vec, result); // move 32 bits 3013 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3014 movdl(vec, Address(str2, 0)); // move 32 bits 3015 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3016 movq(vec, Address(str2, 0)); // move 64 bits 3017 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3018 // Array header size is 12 bytes in 32-bit VM 3019 // + 6 bytes for 3 chars == 18 bytes, 3020 // enough space to load vec and shift. 3021 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3022 if (ae == StrIntrinsicNode::UL) { 3023 int tail_off = int_cnt2-8; 3024 pmovzxbw(vec, Address(str2, tail_off)); 3025 psrldq(vec, -2*tail_off); 3026 } 3027 else { 3028 int tail_off = int_cnt2*(1<<scale2); 3029 movdqu(vec, Address(str2, tail_off-16)); 3030 psrldq(vec, 16-tail_off); 3031 } 3032 } 3033 } else { // not constant substring 3034 cmpl(cnt2, stride); 3035 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3036 3037 // We can read beyond string if srt+16 does not cross page boundary 3038 // since heaps are aligned and mapped by pages. 3039 assert(os::vm_page_size() < (int)G, "default page should be small"); 3040 movl(result, str2); // We need only low 32 bits 3041 andl(result, ((int)os::vm_page_size()-1)); 3042 cmpl(result, ((int)os::vm_page_size()-16)); 3043 jccb(Assembler::belowEqual, CHECK_STR); 3044 3045 // Move small strings to stack to allow load 16 bytes into vec. 3046 subptr(rsp, 16); 3047 int stk_offset = wordSize-(1<<scale2); 3048 push(cnt2); 3049 3050 bind(COPY_SUBSTR); 3051 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3052 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3053 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3054 } else if (ae == StrIntrinsicNode::UU) { 3055 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3056 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3057 } 3058 decrement(cnt2); 3059 jccb(Assembler::notZero, COPY_SUBSTR); 3060 3061 pop(cnt2); 3062 movptr(str2, rsp); // New substring address 3063 } // non constant 3064 3065 bind(CHECK_STR); 3066 cmpl(cnt1, stride); 3067 jccb(Assembler::aboveEqual, BIG_STRINGS); 3068 3069 // Check cross page boundary. 
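    // As was done for str2 above: a 16-byte load from str1 is safe as long
    // as it does not cross a page boundary, i.e. when
    // (addr & (page_size-1)) <= page_size-16; otherwise copy the string to
    // the stack first.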
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan; this code is executed only
    // for large substrings (> 8 chars).
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than the substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
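  // (The lea below re-reads the last full 16-byte vector so that it ends
  //  exactly at the end of the string: result += cnt1*elem_size - 16, and
  //  cnt1 is rebased to a full stride. Re-scanning the overlapped chars is
  //  harmless, as this loop only looks for a candidate start.)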
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Fewer chars left than the substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
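    // (Both pointers are pulled back below so that the next 16-byte load
    //  (8-byte for a UL substring) ends exactly at the current position;
    //  the counters are then rebased to a full stride.)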
3240 3241 if (ae == StrIntrinsicNode::UL) { 3242 lea(str2, Address(str2, cnt2, scale2, -8)); 3243 lea(str1, Address(str1, cnt2, scale1, -16)); 3244 } else { 3245 lea(str2, Address(str2, cnt2, scale2, -16)); 3246 lea(str1, Address(str1, cnt2, scale1, -16)); 3247 } 3248 subl(cnt1, cnt2); 3249 movl(cnt2, stride); 3250 addl(cnt1, stride); 3251 bind(CONT_SCAN_SUBSTR); 3252 if (ae == StrIntrinsicNode::UL) { 3253 pmovzxbw(vec, Address(str2, 0)); 3254 } else { 3255 movdqu(vec, Address(str2, 0)); 3256 } 3257 jmp(SCAN_SUBSTR); 3258 3259 bind(RET_FOUND_LONG); 3260 movptr(str1, Address(rsp, wordSize)); 3261 } // non constant 3262 3263 bind(RET_FOUND); 3264 // Compute substr offset 3265 subptr(result, str1); 3266 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3267 shrl(result, 1); // index 3268 } 3269 bind(CLEANUP); 3270 pop(rsp); // restore SP 3271 3272 } // string_indexof 3273 3274 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3275 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3276 ShortBranchVerifier sbv(this); 3277 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3278 3279 int stride = 8; 3280 3281 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3282 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3283 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3284 FOUND_SEQ_CHAR, DONE_LABEL; 3285 3286 movptr(result, str1); 3287 if (UseAVX >= 2) { 3288 cmpl(cnt1, stride); 3289 jcc(Assembler::less, SCAN_TO_CHAR); 3290 cmpl(cnt1, 2*stride); 3291 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3292 movdl(vec1, ch); 3293 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3294 vpxor(vec2, vec2); 3295 movl(tmp, cnt1); 3296 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3297 andl(cnt1,0x0000000F); //tail count (in chars) 3298 3299 bind(SCAN_TO_16_CHAR_LOOP); 3300 vmovdqu(vec3, Address(result, 0)); 3301 vpcmpeqw(vec3, vec3, vec1, 1); 3302 vptest(vec2, vec3); 3303 jcc(Assembler::carryClear, FOUND_CHAR); 3304 addptr(result, 32); 3305 subl(tmp, 2*stride); 3306 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3307 jmp(SCAN_TO_8_CHAR); 3308 bind(SCAN_TO_8_CHAR_INIT); 3309 movdl(vec1, ch); 3310 pshuflw(vec1, vec1, 0x00); 3311 pshufd(vec1, vec1, 0); 3312 pxor(vec2, vec2); 3313 } 3314 bind(SCAN_TO_8_CHAR); 3315 cmpl(cnt1, stride); 3316 jcc(Assembler::less, SCAN_TO_CHAR); 3317 if (UseAVX < 2) { 3318 movdl(vec1, ch); 3319 pshuflw(vec1, vec1, 0x00); 3320 pshufd(vec1, vec1, 0); 3321 pxor(vec2, vec2); 3322 } 3323 movl(tmp, cnt1); 3324 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3325 andl(cnt1,0x00000007); //tail count (in chars) 3326 3327 bind(SCAN_TO_8_CHAR_LOOP); 3328 movdqu(vec3, Address(result, 0)); 3329 pcmpeqw(vec3, vec1); 3330 ptest(vec2, vec3); 3331 jcc(Assembler::carryClear, FOUND_CHAR); 3332 addptr(result, 16); 3333 subl(tmp, stride); 3334 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3335 bind(SCAN_TO_CHAR); 3336 testl(cnt1, cnt1); 3337 jcc(Assembler::zero, RET_NOT_FOUND); 3338 bind(SCAN_TO_CHAR_LOOP); 3339 load_unsigned_short(tmp, Address(result, 0)); 3340 cmpl(ch, tmp); 3341 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3342 addptr(result, 2); 3343 subl(cnt1, 1); 3344 jccb(Assembler::zero, RET_NOT_FOUND); 3345 jmp(SCAN_TO_CHAR_LOOP); 3346 3347 bind(RET_NOT_FOUND); 3348 movl(result, -1); 3349 jmpb(DONE_LABEL); 3350 3351 bind(FOUND_CHAR); 3352 if (UseAVX >= 2) { 3353 vpmovmskb(tmp, vec3); 3354 } else { 3355 pmovmskb(tmp, vec3); 3356 } 3357 bsfl(ch, tmp); 3358 addptr(result, ch); 3359 3360 bind(FOUND_SEQ_CHAR); 3361 
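  // result points at the matching char; the epilogue below converts the
  // byte offset from the start of the string into a char index.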
subptr(result, str1); 3362 shrl(result, 1); 3363 3364 bind(DONE_LABEL); 3365 } // string_indexof_char 3366 3367 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3368 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3369 ShortBranchVerifier sbv(this); 3370 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3371 3372 int stride = 16; 3373 3374 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3375 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3376 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3377 FOUND_SEQ_CHAR, DONE_LABEL; 3378 3379 movptr(result, str1); 3380 if (UseAVX >= 2) { 3381 cmpl(cnt1, stride); 3382 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3383 cmpl(cnt1, stride*2); 3384 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3385 movdl(vec1, ch); 3386 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3387 vpxor(vec2, vec2); 3388 movl(tmp, cnt1); 3389 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3390 andl(cnt1,0x0000001F); //tail count (in chars) 3391 3392 bind(SCAN_TO_32_CHAR_LOOP); 3393 vmovdqu(vec3, Address(result, 0)); 3394 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3395 vptest(vec2, vec3); 3396 jcc(Assembler::carryClear, FOUND_CHAR); 3397 addptr(result, 32); 3398 subl(tmp, stride*2); 3399 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3400 jmp(SCAN_TO_16_CHAR); 3401 3402 bind(SCAN_TO_16_CHAR_INIT); 3403 movdl(vec1, ch); 3404 pxor(vec2, vec2); 3405 pshufb(vec1, vec2); 3406 } 3407 3408 bind(SCAN_TO_16_CHAR); 3409 cmpl(cnt1, stride); 3410 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3411 if (UseAVX < 2) { 3412 movdl(vec1, ch); 3413 pxor(vec2, vec2); 3414 pshufb(vec1, vec2); 3415 } 3416 movl(tmp, cnt1); 3417 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3418 andl(cnt1,0x0000000F); //tail count (in bytes) 3419 3420 bind(SCAN_TO_16_CHAR_LOOP); 3421 movdqu(vec3, Address(result, 0)); 3422 pcmpeqb(vec3, vec1); 3423 ptest(vec2, vec3); 3424 jcc(Assembler::carryClear, FOUND_CHAR); 3425 addptr(result, 16); 3426 subl(tmp, stride); 3427 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
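  // Scalar tail: at most 15 bytes remain here (cnt1 was masked above).
  // Roughly equivalent Java (illustrative only, not generated from this):
  //   for (int i = 0; i < tail; i++) { if (s[i] == ch) return base + i; }
  //   return -1;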
3428 3429 bind(SCAN_TO_CHAR_INIT); 3430 testl(cnt1, cnt1); 3431 jcc(Assembler::zero, RET_NOT_FOUND); 3432 bind(SCAN_TO_CHAR_LOOP); 3433 load_unsigned_byte(tmp, Address(result, 0)); 3434 cmpl(ch, tmp); 3435 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3436 addptr(result, 1); 3437 subl(cnt1, 1); 3438 jccb(Assembler::zero, RET_NOT_FOUND); 3439 jmp(SCAN_TO_CHAR_LOOP); 3440 3441 bind(RET_NOT_FOUND); 3442 movl(result, -1); 3443 jmpb(DONE_LABEL); 3444 3445 bind(FOUND_CHAR); 3446 if (UseAVX >= 2) { 3447 vpmovmskb(tmp, vec3); 3448 } else { 3449 pmovmskb(tmp, vec3); 3450 } 3451 bsfl(ch, tmp); 3452 addptr(result, ch); 3453 3454 bind(FOUND_SEQ_CHAR); 3455 subptr(result, str1); 3456 3457 bind(DONE_LABEL); 3458 } // stringL_indexof_char 3459 3460 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3461 switch (eltype) { 3462 case T_BOOLEAN: return sizeof(jboolean); 3463 case T_BYTE: return sizeof(jbyte); 3464 case T_SHORT: return sizeof(jshort); 3465 case T_CHAR: return sizeof(jchar); 3466 case T_INT: return sizeof(jint); 3467 default: 3468 ShouldNotReachHere(); 3469 return -1; 3470 } 3471 } 3472 3473 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3474 switch (eltype) { 3475 // T_BOOLEAN used as surrogate for unsigned byte 3476 case T_BOOLEAN: movzbl(dst, src); break; 3477 case T_BYTE: movsbl(dst, src); break; 3478 case T_SHORT: movswl(dst, src); break; 3479 case T_CHAR: movzwl(dst, src); break; 3480 case T_INT: movl(dst, src); break; 3481 default: 3482 ShouldNotReachHere(); 3483 } 3484 } 3485 3486 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3487 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3488 } 3489 3490 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3491 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3492 } 3493 3494 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3495 const int vlen = Assembler::AVX_256bit; 3496 switch (eltype) { 3497 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3498 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3499 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3500 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3501 case T_INT: 3502 // do nothing 3503 break; 3504 default: 3505 ShouldNotReachHere(); 3506 } 3507 } 3508 3509 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3510 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3511 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3512 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3513 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3514 BasicType eltype) { 3515 ShortBranchVerifier sbv(this); 3516 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3517 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3518 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3519 3520 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3521 SHORT_UNROLLED_LOOP_EXIT, 3522 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3523 UNROLLED_VECTOR_LOOP_BEGIN, 3524 END; 3525 switch (eltype) { 3526 case T_BOOLEAN: 
                   BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
    case T_CHAR:   BLOCK_COMMENT("arrays_hashcode(char) {");          break;
    case T_BYTE:   BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
    case T_SHORT:  BLOCK_COMMENT("arrays_hashcode(short) {");         break;
    case T_INT:    BLOCK_COMMENT("arrays_hashcode(int) {");           break;
    default:       BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // "Rename" the registers for readability of the code below.
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // Loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching.
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);
  imull(result, tmp3);
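  // 961 == 31 * 31: each iteration folds two elements,
  //   result = 31*31*result + 31*ary[i-1] + ary[i],
  // where 31*e is computed below as (e << 5) - e.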
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3623 movl(tmp3, tmp2); 3624 shll(tmp3, 5); 3625 subl(tmp3, tmp2); 3626 addl(result, tmp3); 3627 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3628 addl(result, tmp3); 3629 addl(index, 2); 3630 cmpl(index, cnt1); 3631 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3632 3633 // } 3634 // if (i >= cnt1) { 3635 bind(SHORT_UNROLLED_LOOP_EXIT); 3636 jccb(Assembler::greater, END); 3637 movl(tmp2, result); 3638 shll(result, 5); 3639 subl(result, tmp2); 3640 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3641 addl(result, tmp3); 3642 // } 3643 bind(END); 3644 3645 BLOCK_COMMENT("} // arrays_hashcode"); 3646 3647 } // arrays_hashcode 3648 3649 // helper function for string_compare 3650 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3651 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3652 Address::ScaleFactor scale2, Register index, int ae) { 3653 if (ae == StrIntrinsicNode::LL) { 3654 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3655 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3656 } else if (ae == StrIntrinsicNode::UU) { 3657 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3658 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3659 } else { 3660 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3661 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3662 } 3663 } 3664 3665 // Compare strings, used for char[] and byte[]. 3666 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3667 Register cnt1, Register cnt2, Register result, 3668 XMMRegister vec1, int ae, KRegister mask) { 3669 ShortBranchVerifier sbv(this); 3670 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3671 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3672 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3673 int stride2x2 = 0x40; 3674 Address::ScaleFactor scale = Address::no_scale; 3675 Address::ScaleFactor scale1 = Address::no_scale; 3676 Address::ScaleFactor scale2 = Address::no_scale; 3677 3678 if (ae != StrIntrinsicNode::LL) { 3679 stride2x2 = 0x20; 3680 } 3681 3682 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3683 shrl(cnt2, 1); 3684 } 3685 // Compute the minimum of the string lengths and the 3686 // difference of the string lengths (stack). 3687 // Do the conditional move stuff 3688 movl(result, cnt1); 3689 subl(cnt1, cnt2); 3690 push(cnt1); 3691 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3692 3693 // Is the minimum length zero? 
3694 testl(cnt2, cnt2); 3695 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3696 if (ae == StrIntrinsicNode::LL) { 3697 // Load first bytes 3698 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3699 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3700 } else if (ae == StrIntrinsicNode::UU) { 3701 // Load first characters 3702 load_unsigned_short(result, Address(str1, 0)); 3703 load_unsigned_short(cnt1, Address(str2, 0)); 3704 } else { 3705 load_unsigned_byte(result, Address(str1, 0)); 3706 load_unsigned_short(cnt1, Address(str2, 0)); 3707 } 3708 subl(result, cnt1); 3709 jcc(Assembler::notZero, POP_LABEL); 3710 3711 if (ae == StrIntrinsicNode::UU) { 3712 // Divide length by 2 to get number of chars 3713 shrl(cnt2, 1); 3714 } 3715 cmpl(cnt2, 1); 3716 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3717 3718 // Check if the strings start at the same location and setup scale and stride 3719 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3720 cmpptr(str1, str2); 3721 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3722 if (ae == StrIntrinsicNode::LL) { 3723 scale = Address::times_1; 3724 stride = 16; 3725 } else { 3726 scale = Address::times_2; 3727 stride = 8; 3728 } 3729 } else { 3730 scale1 = Address::times_1; 3731 scale2 = Address::times_2; 3732 // scale not used 3733 stride = 8; 3734 } 3735 3736 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3737 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3738 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3739 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3740 Label COMPARE_TAIL_LONG; 3741 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3742 3743 int pcmpmask = 0x19; 3744 if (ae == StrIntrinsicNode::LL) { 3745 pcmpmask &= ~0x01; 3746 } 3747 3748 // Setup to compare 16-chars (32-bytes) vectors, 3749 // start from first character again because it has aligned address. 3750 if (ae == StrIntrinsicNode::LL) { 3751 stride2 = 32; 3752 } else { 3753 stride2 = 16; 3754 } 3755 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3756 adr_stride = stride << scale; 3757 } else { 3758 adr_stride1 = 8; //stride << scale1; 3759 adr_stride2 = 16; //stride << scale2; 3760 } 3761 3762 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3763 // rax and rdx are used by pcmpestri as elements counters 3764 movl(result, cnt2); 3765 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3766 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3767 3768 // fast path : compare first 2 8-char vectors. 
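    // (Note on pcmpmask: 0x19 == 0b011001 selects unsigned words, the
    //  "equal each" aggregation and negated polarity, so CF is set when some
    //  element mismatches and rcx receives the first mismatch index; bit 0
    //  was cleared above for LL so unsigned bytes are compared instead.)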
3769 bind(COMPARE_16_CHARS); 3770 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3771 movdqu(vec1, Address(str1, 0)); 3772 } else { 3773 pmovzxbw(vec1, Address(str1, 0)); 3774 } 3775 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3776 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3777 3778 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3779 movdqu(vec1, Address(str1, adr_stride)); 3780 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3781 } else { 3782 pmovzxbw(vec1, Address(str1, adr_stride1)); 3783 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3784 } 3785 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3786 addl(cnt1, stride); 3787 3788 // Compare the characters at index in cnt1 3789 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3790 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3791 subl(result, cnt2); 3792 jmp(POP_LABEL); 3793 3794 // Setup the registers to start vector comparison loop 3795 bind(COMPARE_WIDE_VECTORS); 3796 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3797 lea(str1, Address(str1, result, scale)); 3798 lea(str2, Address(str2, result, scale)); 3799 } else { 3800 lea(str1, Address(str1, result, scale1)); 3801 lea(str2, Address(str2, result, scale2)); 3802 } 3803 subl(result, stride2); 3804 subl(cnt2, stride2); 3805 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3806 negptr(result); 3807 3808 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3809 bind(COMPARE_WIDE_VECTORS_LOOP); 3810 3811 #ifdef _LP64 3812 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3813 cmpl(cnt2, stride2x2); 3814 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3815 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3816 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3817 3818 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3819 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3820 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3821 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3822 } else { 3823 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3824 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3825 } 3826 kortestql(mask, mask); 3827 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3828 addptr(result, stride2x2); // update since we already compared at this addr 3829 subl(cnt2, stride2x2); // and sub the size too 3830 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3831 3832 vpxor(vec1, vec1); 3833 jmpb(COMPARE_WIDE_TAIL); 3834 }//if (VM_Version::supports_avx512vlbw()) 3835 #endif // _LP64 3836 3837 3838 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3839 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3840 vmovdqu(vec1, Address(str1, result, scale)); 3841 vpxor(vec1, Address(str2, result, scale)); 3842 } else { 3843 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3844 vpxor(vec1, Address(str2, result, scale2)); 3845 } 3846 vptest(vec1, vec1); 3847 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3848 addptr(result, stride2); 3849 subl(cnt2, stride2); 3850 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3851 // clean upper bits of YMM registers 
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, lengths between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
3908 movl(result, cnt2); 3909 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3910 if (ae == StrIntrinsicNode::LL) { 3911 pcmpmask &= ~0x01; 3912 } 3913 jcc(Assembler::zero, COMPARE_TAIL); 3914 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3915 lea(str1, Address(str1, result, scale)); 3916 lea(str2, Address(str2, result, scale)); 3917 } else { 3918 lea(str1, Address(str1, result, scale1)); 3919 lea(str2, Address(str2, result, scale2)); 3920 } 3921 negptr(result); 3922 3923 // pcmpestri 3924 // inputs: 3925 // vec1- substring 3926 // rax - negative string length (elements count) 3927 // mem - scanned string 3928 // rdx - string length (elements count) 3929 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3930 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3931 // outputs: 3932 // rcx - first mismatched element index 3933 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3934 3935 bind(COMPARE_WIDE_VECTORS); 3936 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3937 movdqu(vec1, Address(str1, result, scale)); 3938 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3939 } else { 3940 pmovzxbw(vec1, Address(str1, result, scale1)); 3941 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3942 } 3943 // After pcmpestri cnt1(rcx) contains mismatched element index 3944 3945 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3946 addptr(result, stride); 3947 subptr(cnt2, stride); 3948 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3949 3950 // compare wide vectors tail 3951 testptr(result, result); 3952 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3953 3954 movl(cnt2, stride); 3955 movl(result, stride); 3956 negptr(result); 3957 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3958 movdqu(vec1, Address(str1, result, scale)); 3959 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3960 } else { 3961 pmovzxbw(vec1, Address(str1, result, scale1)); 3962 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3963 } 3964 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3965 3966 // Mismatched characters in the vectors 3967 bind(VECTOR_NOT_EQUAL); 3968 addptr(cnt1, result); 3969 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3970 subl(result, cnt2); 3971 jmpb(POP_LABEL); 3972 3973 bind(COMPARE_TAIL); // limit is zero 3974 movl(cnt2, result); 3975 // Fallthru to tail compare 3976 } 3977 // Shift str2 and str1 to the end of the arrays, negate min 3978 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3979 lea(str1, Address(str1, cnt2, scale)); 3980 lea(str2, Address(str2, cnt2, scale)); 3981 } else { 3982 lea(str1, Address(str1, cnt2, scale1)); 3983 lea(str2, Address(str2, cnt2, scale2)); 3984 } 3985 decrementl(cnt2); // first character was compared already 3986 negptr(cnt2); 3987 3988 // Compare the rest of the elements 3989 bind(WHILE_HEAD_LABEL); 3990 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3991 subl(result, cnt1); 3992 jccb(Assembler::notZero, POP_LABEL); 3993 increment(cnt2); 3994 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3995 3996 // Strings are equal up to min length. Return the length difference. 
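  // (The length difference cnt1 - cnt2 was pushed on entry;
  //  LENGTH_DIFF_LABEL below pops it into result.)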
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for a non-ASCII character (negative byte value) in a byte array;
// return the index of the first such character, otherwise the length
// of the array segment searched.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);

    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register, thus we move
    // the data required to compose 64 1's into the instruction stream:
    // we emit a 64-byte wide series of the elements 0..63, which is later
    // used as compare targets together with the tail count in tmp1.
    // The result is a k register holding tmp1 consecutive 1s, counting
    // from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
    // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64-byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // Some byte has its high bit set; scan the tail to find exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
4200 // Set up to look at the last 32 bytes as if they were a tail 4201 lea(ary1, Address(ary1, len, Address::times_1)); 4202 addptr(result, len); 4203 // Ignore the very last byte: if all others are positive, 4204 // it must be negative, so we can skip right to the 2+1 byte 4205 // end comparison at this point 4206 orl(result, 31); 4207 movl(len, 31); 4208 // Fallthru to tail compare 4209 } else if (UseSSE42Intrinsics) { 4210 // With SSE4.2, use double quad vector compare 4211 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4212 4213 // Compare 16-byte vectors 4214 testl(len, 0xfffffff0); // vector count (in bytes) 4215 jcc(Assembler::zero, TAIL_START); 4216 4217 andl(len, 0xfffffff0); 4218 lea(ary1, Address(ary1, len, Address::times_1)); 4219 negptr(len); 4220 4221 movl(tmp1, 0x80808080); 4222 movdl(vec2, tmp1); 4223 pshufd(vec2, vec2, 0); 4224 4225 bind(COMPARE_WIDE_VECTORS); 4226 movdqu(vec1, Address(ary1, len, Address::times_1)); 4227 ptest(vec1, vec2); 4228 jccb(Assembler::notZero, BREAK_LOOP); 4229 addptr(len, 16); 4230 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4231 4232 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4233 jcc(Assembler::zero, DONE); 4234 4235 // Quick test using the already prepared vector mask 4236 movl(len, result); 4237 andl(len, 0x0000000f); // tail count (in bytes) 4238 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4239 ptest(vec1, vec2); 4240 jcc(Assembler::zero, DONE); 4241 jmpb(TAIL_START); 4242 4243 bind(BREAK_LOOP); 4244 // At least one byte in the last 16-byte vector is negative. 4245 // Set up and look at the last 16 bytes as if they were a tail 4246 lea(ary1, Address(ary1, len, Address::times_1)); 4247 addptr(result, len); 4248 // Ignore the very last byte: if all others are positive, 4249 // it must be negative, so we can skip right to the 2+1 byte 4250 // end comparison at this point 4251 orl(result, 15); 4252 movl(len, 15); 4253 // Fallthru to tail compare 4254 } 4255 } 4256 4257 bind(TAIL_START); 4258 // Compare 4-byte vectors 4259 andl(len, 0xfffffffc); // vector count (in bytes) 4260 jccb(Assembler::zero, COMPARE_CHAR); 4261 4262 lea(ary1, Address(ary1, len, Address::times_1)); 4263 negptr(len); 4264 4265 bind(COMPARE_VECTORS); 4266 movl(tmp1, Address(ary1, len, Address::times_1)); 4267 andl(tmp1, 0x80808080); 4268 jccb(Assembler::notZero, TAIL_ADJUST); 4269 addptr(len, 4); 4270 jccb(Assembler::notZero, COMPARE_VECTORS); 4271 4272 // Compare trailing char (final 2-3 bytes), if any 4273 bind(COMPARE_CHAR); 4274 4275 testl(result, 0x2); // tail char 4276 jccb(Assembler::zero, COMPARE_BYTE); 4277 load_unsigned_short(tmp1, Address(ary1, 0)); 4278 andl(tmp1, 0x00008080); 4279 jccb(Assembler::notZero, CHAR_ADJUST); 4280 lea(ary1, Address(ary1, 2)); 4281 4282 bind(COMPARE_BYTE); 4283 testl(result, 0x1); // tail byte 4284 jccb(Assembler::zero, DONE); 4285 load_unsigned_byte(tmp1, Address(ary1, 0)); 4286 testl(tmp1, 0x00000080); 4287 jccb(Assembler::zero, DONE); 4288 subptr(result, 1); 4289 jmpb(DONE); 4290 4291 bind(TAIL_ADJUST); 4292 // there are negative bits in the last 4 byte block. 4293 // Adjust result and check the next three bytes 4294 addptr(result, len); 4295 orl(result, 3); 4296 lea(ary1, Address(ary1, len, Address::times_1)); 4297 jmpb(COMPARE_CHAR); 4298 4299 bind(CHAR_ADJUST); 4300 // We are looking at a char + optional byte tail, and found that one 4301 // of the bytes in the char is negative. Adjust the result, check the 4302 // first byte and readjust if needed. 
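  // Example: tail bytes {0x41, 0x80} load as 0x8041 and mask to 0x8000;
  // the low-byte test below sees a positive first byte, so result is
  // advanced by one to count it.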
4303 andl(result, 0xfffffffc); 4304 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4305 jccb(Assembler::notZero, DONE); 4306 addptr(result, 1); 4307 4308 // That's it 4309 bind(DONE); 4310 if (UseAVX >= 2 && UseSSE >= 2) { 4311 // clean upper bits of YMM registers 4312 vpxor(vec1, vec1); 4313 vpxor(vec2, vec2); 4314 } 4315 } 4316 4317 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4318 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4319 Register limit, Register result, Register chr, 4320 XMMRegister vec1, XMMRegister vec2, bool is_char, 4321 KRegister mask, bool expand_ary2) { 4322 // for expand_ary2, limit is the (smaller) size of the second array. 4323 ShortBranchVerifier sbv(this); 4324 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4325 4326 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4327 "Expansion only implemented for AVX2"); 4328 4329 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4330 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4331 4332 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4333 int scaleIncr = expand_ary2 ? 8 : 16; 4334 4335 if (is_array_equ) { 4336 // Check the input args 4337 cmpoop(ary1, ary2); 4338 jcc(Assembler::equal, TRUE_LABEL); 4339 4340 // Need additional checks for arrays_equals. 4341 testptr(ary1, ary1); 4342 jcc(Assembler::zero, FALSE_LABEL); 4343 testptr(ary2, ary2); 4344 jcc(Assembler::zero, FALSE_LABEL); 4345 4346 // Check the lengths 4347 movl(limit, Address(ary1, length_offset)); 4348 cmpl(limit, Address(ary2, length_offset)); 4349 jcc(Assembler::notEqual, FALSE_LABEL); 4350 } 4351 4352 // count == 0 4353 testl(limit, limit); 4354 jcc(Assembler::zero, TRUE_LABEL); 4355 4356 if (is_array_equ) { 4357 // Load array address 4358 lea(ary1, Address(ary1, base_offset)); 4359 lea(ary2, Address(ary2, base_offset)); 4360 } 4361 4362 if (is_array_equ && is_char) { 4363 // arrays_equals when used for char[]. 
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
4412 // 4413 addptr(result, -64); // it is safe, bc we just came from this area 4414 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4415 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4416 kortestql(mask, mask); 4417 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4418 4419 jmp(TRUE_LABEL); 4420 4421 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4422 4423 }//if (VM_Version::supports_avx512vlbw()) 4424 #endif //_LP64 4425 bind(COMPARE_WIDE_VECTORS); 4426 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4427 if (expand_ary2) { 4428 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4429 } else { 4430 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4431 } 4432 vpxor(vec1, vec2); 4433 4434 vptest(vec1, vec1); 4435 jcc(Assembler::notZero, FALSE_LABEL); 4436 addptr(limit, scaleIncr * 2); 4437 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4438 4439 testl(result, result); 4440 jcc(Assembler::zero, TRUE_LABEL); 4441 4442 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4443 if (expand_ary2) { 4444 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4445 } else { 4446 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4447 } 4448 vpxor(vec1, vec2); 4449 4450 vptest(vec1, vec1); 4451 jcc(Assembler::notZero, FALSE_LABEL); 4452 jmp(TRUE_LABEL); 4453 4454 bind(COMPARE_TAIL_16); // limit is zero 4455 movl(limit, result); 4456 4457 // Compare 16-byte chunks 4458 andl(result, 0x0000000f); // tail count (in bytes) 4459 andl(limit, 0xfffffff0); // vector count (in bytes) 4460 jcc(Assembler::zero, COMPARE_TAIL); 4461 4462 lea(ary1, Address(ary1, limit, scaleFactor)); 4463 lea(ary2, Address(ary2, limit, Address::times_1)); 4464 negptr(limit); 4465 4466 bind(COMPARE_WIDE_VECTORS_16); 4467 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4468 if (expand_ary2) { 4469 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4470 } else { 4471 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4472 } 4473 pxor(vec1, vec2); 4474 4475 ptest(vec1, vec1); 4476 jcc(Assembler::notZero, FALSE_LABEL); 4477 addptr(limit, scaleIncr); 4478 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4479 4480 bind(COMPARE_TAIL); // limit is zero 4481 movl(limit, result); 4482 // Fallthru to tail compare 4483 } else if (UseSSE42Intrinsics) { 4484 // With SSE4.2, use double quad vector compare 4485 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4486 4487 // Compare 16-byte vectors 4488 andl(result, 0x0000000f); // tail count (in bytes) 4489 andl(limit, 0xfffffff0); // vector count (in bytes) 4490 jcc(Assembler::zero, COMPARE_TAIL); 4491 4492 lea(ary1, Address(ary1, limit, Address::times_1)); 4493 lea(ary2, Address(ary2, limit, Address::times_1)); 4494 negptr(limit); 4495 4496 bind(COMPARE_WIDE_VECTORS); 4497 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4498 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4499 pxor(vec1, vec2); 4500 4501 ptest(vec1, vec1); 4502 jcc(Assembler::notZero, FALSE_LABEL); 4503 addptr(limit, 16); 4504 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4505 4506 testl(result, result); 4507 jcc(Assembler::zero, TRUE_LABEL); 4508 4509 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4510 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4511 pxor(vec1, vec2); 4512 4513 ptest(vec1, vec1); 4514 jccb(Assembler::notZero, FALSE_LABEL); 4515 jmpb(TRUE_LABEL); 4516 4517 bind(COMPARE_TAIL); // limit is zero 4518 
movl(limit, result); 4519 // Fallthru to tail compare 4520 } 4521 4522 // Compare 4-byte vectors 4523 if (expand_ary2) { 4524 testl(result, result); 4525 jccb(Assembler::zero, TRUE_LABEL); 4526 } else { 4527 andl(limit, 0xfffffffc); // vector count (in bytes) 4528 jccb(Assembler::zero, COMPARE_CHAR); 4529 } 4530 4531 lea(ary1, Address(ary1, limit, scaleFactor)); 4532 lea(ary2, Address(ary2, limit, Address::times_1)); 4533 negptr(limit); 4534 4535 bind(COMPARE_VECTORS); 4536 if (expand_ary2) { 4537 // There are no "vector" operations for bytes to shorts 4538 movzbl(chr, Address(ary2, limit, Address::times_1)); 4539 cmpw(Address(ary1, limit, Address::times_2), chr); 4540 jccb(Assembler::notEqual, FALSE_LABEL); 4541 addptr(limit, 1); 4542 jcc(Assembler::notZero, COMPARE_VECTORS); 4543 jmp(TRUE_LABEL); 4544 } else { 4545 movl(chr, Address(ary1, limit, Address::times_1)); 4546 cmpl(chr, Address(ary2, limit, Address::times_1)); 4547 jccb(Assembler::notEqual, FALSE_LABEL); 4548 addptr(limit, 4); 4549 jcc(Assembler::notZero, COMPARE_VECTORS); 4550 } 4551 4552 // Compare trailing char (final 2 bytes), if any 4553 bind(COMPARE_CHAR); 4554 testl(result, 0x2); // tail char 4555 jccb(Assembler::zero, COMPARE_BYTE); 4556 load_unsigned_short(chr, Address(ary1, 0)); 4557 load_unsigned_short(limit, Address(ary2, 0)); 4558 cmpl(chr, limit); 4559 jccb(Assembler::notEqual, FALSE_LABEL); 4560 4561 if (is_array_equ && is_char) { 4562 bind(COMPARE_BYTE); 4563 } else { 4564 lea(ary1, Address(ary1, 2)); 4565 lea(ary2, Address(ary2, 2)); 4566 4567 bind(COMPARE_BYTE); 4568 testl(result, 0x1); // tail byte 4569 jccb(Assembler::zero, TRUE_LABEL); 4570 load_unsigned_byte(chr, Address(ary1, 0)); 4571 load_unsigned_byte(limit, Address(ary2, 0)); 4572 cmpl(chr, limit); 4573 jccb(Assembler::notEqual, FALSE_LABEL); 4574 } 4575 bind(TRUE_LABEL); 4576 movl(result, 1); // return true 4577 jmpb(DONE); 4578 4579 bind(FALSE_LABEL); 4580 xorl(result, result); // return false 4581 4582 // That's it 4583 bind(DONE); 4584 if (UseAVX >= 2) { 4585 // clean upper bits of YMM registers 4586 vpxor(vec1, vec1); 4587 vpxor(vec2, vec2); 4588 } 4589 } 4590 4591 #ifdef _LP64 4592 4593 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4594 #define __ masm. 
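  // The fast path in convertF2I() produced the sentinel (0x80000000 or
  // 0x8000000000000000), which cvttss2si/cvttsd2si also returns for NaN and
  // out-of-range inputs; this stub redoes the conversion in the runtime
  // fixup routine, passing the original FP value on the stack.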
4595 Register dst = stub.data<0>(); 4596 XMMRegister src = stub.data<1>(); 4597 address target = stub.data<2>(); 4598 __ bind(stub.entry()); 4599 __ subptr(rsp, 8); 4600 __ movdbl(Address(rsp), src); 4601 __ call(RuntimeAddress(target)); 4602 __ pop(dst); 4603 __ jmp(stub.continuation()); 4604 #undef __ 4605 } 4606 4607 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4608 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4609 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4610 4611 address slowpath_target; 4612 if (dst_bt == T_INT) { 4613 if (src_bt == T_FLOAT) { 4614 cvttss2sil(dst, src); 4615 cmpl(dst, 0x80000000); 4616 slowpath_target = StubRoutines::x86::f2i_fixup(); 4617 } else { 4618 cvttsd2sil(dst, src); 4619 cmpl(dst, 0x80000000); 4620 slowpath_target = StubRoutines::x86::d2i_fixup(); 4621 } 4622 } else { 4623 if (src_bt == T_FLOAT) { 4624 cvttss2siq(dst, src); 4625 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4626 slowpath_target = StubRoutines::x86::f2l_fixup(); 4627 } else { 4628 cvttsd2siq(dst, src); 4629 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4630 slowpath_target = StubRoutines::x86::d2l_fixup(); 4631 } 4632 } 4633 4634 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4635 jcc(Assembler::equal, stub->entry()); 4636 bind(stub->continuation()); 4637 } 4638 4639 #endif // _LP64 4640 4641 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4642 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4643 switch(ideal_opc) { 4644 case Op_LShiftVS: 4645 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4646 case Op_LShiftVI: 4647 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4648 case Op_LShiftVL: 4649 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4650 case Op_RShiftVS: 4651 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4652 case Op_RShiftVI: 4653 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4654 case Op_RShiftVL: 4655 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4656 case Op_URShiftVS: 4657 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4658 case Op_URShiftVI: 4659 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4660 case Op_URShiftVL: 4661 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4662 case Op_RotateRightV: 4663 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4664 case Op_RotateLeftV: 4665 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4666 default: 4667 fatal("Unsupported masked operation"); break; 4668 } 4669 } 4670 4671 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4672 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4673 bool is_varshift) { 4674 switch (ideal_opc) { 4675 case Op_AddVB: 4676 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4677 case Op_AddVS: 4678 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4679 case Op_AddVI: 4680 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4681 case Op_AddVL: 4682 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4683 case Op_AddVF: 4684 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4685 case Op_AddVD: 4686 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4687 case Op_SubVB: 4688 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4689 case Op_SubVS: 4690 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4691 case Op_SubVI: 4692 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4693 case Op_SubVL: 4694 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4695 case Op_SubVF: 4696 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4697 case Op_SubVD: 4698 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4699 case Op_MulVS: 4700 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4701 case Op_MulVI: 4702 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4703 case Op_MulVL: 4704 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4705 case Op_MulVF: 4706 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4707 case Op_MulVD: 4708 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4709 case Op_DivVF: 4710 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4711 case Op_DivVD: 4712 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4713 case Op_SqrtVF: 4714 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4715 case Op_SqrtVD: 4716 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4717 case Op_AbsVB: 4718 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4719 case Op_AbsVS: 4720 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4721 case Op_AbsVI: 4722 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4723 case Op_AbsVL: 4724 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4725 case Op_FmaVF: 4726 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4727 case Op_FmaVD: 4728 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4729 case Op_VectorRearrange: 4730 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4731 case Op_LShiftVS: 4732 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4733 case Op_LShiftVI: 4734 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4735 case Op_LShiftVL: 4736 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4737 case Op_RShiftVS: 4738 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4739 case Op_RShiftVI: 4740 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4741 case Op_RShiftVL: 4742 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4743 case Op_URShiftVS: 4744 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4745 case Op_URShiftVI: 4746 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4747 case Op_URShiftVL: 4748 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4749 case Op_RotateLeftV: 4750 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_RotateRightV: 4752 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_MaxV: 4754 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_MinV: 4756 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_XorV: 4758 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4759 case Op_OrV: 4760 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_AndV: 4762 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4763 default: 4764 fatal("Unsupported masked operation"); break; 4765 } 4766 } 4767 4768 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4769 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4770 switch (ideal_opc) { 4771 case Op_AddVB: 
4772 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_AddVS: 4774 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_AddVI: 4776 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_AddVL: 4778 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_AddVF: 4780 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4781 case Op_AddVD: 4782 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_SubVB: 4784 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_SubVS: 4786 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_SubVI: 4788 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_SubVL: 4790 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_SubVF: 4792 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_SubVD: 4794 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_MulVS: 4796 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_MulVI: 4798 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_MulVL: 4800 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_MulVF: 4802 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_MulVD: 4804 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_DivVF: 4806 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_DivVD: 4808 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_FmaVF: 4810 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_FmaVD: 4812 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_MaxV: 4814 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_MinV: 4816 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_XorV: 4818 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_OrV: 4820 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_AndV: 4822 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4823 default: 4824 fatal("Unsupported masked operation"); break; 4825 } 4826 } 4827 4828 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4829 KRegister src1, KRegister src2) { 4830 BasicType etype = T_ILLEGAL; 4831 switch(mask_len) { 4832 case 2: 4833 case 4: 4834 case 8: etype = T_BYTE; break; 4835 case 16: etype = T_SHORT; break; 4836 case 32: etype = T_INT; break; 4837 case 64: etype = T_LONG; break; 4838 default: fatal("Unsupported type"); break; 4839 } 4840 assert(etype != T_ILLEGAL, ""); 4841 switch(ideal_opc) { 4842 case Op_AndVMask: 4843 kand(etype, dst, src1, src2); break; 4844 case Op_OrVMask: 4845 kor(etype, dst, src1, src2); break; 4846 case Op_XorVMask: 4847 kxor(etype, dst, src1, src2); break; 4848 default: 4849 fatal("Unsupported masked operation"); break; 4850 } 4851 } 4852 4853 /* 4854 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4855 * If src is NaN, the result is 0. 4856 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4857 * the result is equal to the value of Integer.MIN_VALUE. 4858 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4859 * the result is equal to the value of Integer.MAX_VALUE. 
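 *
 * For illustration, this matches the scalar Java (int) cast, e.g.
 *   (int) Float.NaN == 0,
 *   (int) Float.NEGATIVE_INFINITY == Integer.MIN_VALUE,
 *   (int) Float.POSITIVE_INFINITY == Integer.MAX_VALUE.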
4860 */ 4861 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4862 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4863 Register rscratch, AddressLiteral float_sign_flip, 4864 int vec_enc) { 4865 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4866 Label done; 4867 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4868 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4869 vptest(xtmp2, xtmp2, vec_enc); 4870 jccb(Assembler::equal, done); 4871 4872 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4873 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4874 4875 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4876 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4877 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4878 4879 // Recompute the mask for remaining special value. 4880 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4881 // Extract SRC values corresponding to TRUE mask lanes. 4882 vpand(xtmp4, xtmp2, src, vec_enc); 4883 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4884 // values are set. 4885 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4886 4887 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4888 bind(done); 4889 } 4890 4891 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4892 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4893 Register rscratch, AddressLiteral float_sign_flip, 4894 int vec_enc) { 4895 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4896 Label done; 4897 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4898 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4899 kortestwl(ktmp1, ktmp1); 4900 jccb(Assembler::equal, done); 4901 4902 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4903 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4904 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4905 4906 kxorwl(ktmp1, ktmp1, ktmp2); 4907 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4908 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4909 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4910 bind(done); 4911 } 4912 4913 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4914 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4915 Register rscratch, AddressLiteral double_sign_flip, 4916 int vec_enc) { 4917 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4918 4919 Label done; 4920 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4921 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4922 kortestwl(ktmp1, ktmp1); 4923 jccb(Assembler::equal, done); 4924 4925 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4926 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4927 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4928 4929 kxorwl(ktmp1, ktmp1, ktmp2); 4930 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4931 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4932 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4933 bind(done); 4934 } 4935 4936 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4937 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4938 Register rscratch, AddressLiteral float_sign_flip, 4939 int vec_enc) { 4940 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4941 Label done; 4942 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
                                                                      rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

/*
 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}


void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "%s", type2name(to_elem_bt));
  }
}

/*
 * Algorithm for vector D2L and F2I conversions:-
 * a) Perform vector D2L/F2I cast.
 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value.
 *    It signifies that the source value could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
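 *
 * A scalar sketch of steps b)-d) for F2I (illustrative only, not the emitted code):
 *   int r = cvttss2si(f);                        // step a: raw cast, 0x80000000 on any special input
 *   if (r == Integer.MIN_VALUE) {                // step b: fix-up only if a special value may be present
 *     if (f != f)         r = 0;                 // step c: NaN
 *     else if (f > 0.0f)  r = Integer.MAX_VALUE; // step d: positive overflow / +Inf
 *   }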
5073 */ 5074 5075 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5076 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5077 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5078 int to_elem_sz = type2aelembytes(to_elem_bt); 5079 assert(to_elem_sz <= 4, ""); 5080 vcvttps2dq(dst, src, vec_enc); 5081 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5082 if (to_elem_sz < 4) { 5083 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5084 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5085 } 5086 } 5087 5088 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5089 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5090 Register rscratch, int vec_enc) { 5091 int to_elem_sz = type2aelembytes(to_elem_bt); 5092 assert(to_elem_sz <= 4, ""); 5093 vcvttps2dq(dst, src, vec_enc); 5094 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5095 switch(to_elem_bt) { 5096 case T_INT: 5097 break; 5098 case T_SHORT: 5099 evpmovdw(dst, dst, vec_enc); 5100 break; 5101 case T_BYTE: 5102 evpmovdb(dst, dst, vec_enc); 5103 break; 5104 default: assert(false, "%s", type2name(to_elem_bt)); 5105 } 5106 } 5107 5108 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5109 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5110 Register rscratch, int vec_enc) { 5111 evcvttps2qq(dst, src, vec_enc); 5112 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5113 } 5114 5115 // Handling for downcasting from double to integer or sub-word types on AVX2. 5116 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5117 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5118 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5119 int to_elem_sz = type2aelembytes(to_elem_bt); 5120 assert(to_elem_sz < 8, ""); 5121 vcvttpd2dq(dst, src, vec_enc); 5122 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5123 float_sign_flip, vec_enc); 5124 if (to_elem_sz < 4) { 5125 // xtmp4 holds all zero lanes. 
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "%s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "%s", type2name(to_elem_bt));
    }
  }
}

#ifdef _LP64
void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
                                                 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf
  // and restore the original MXCSR.RC mode afterwards.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  mov64(tmp, julong_cast(0.5L));
  evpbroadcastq(xtmp1, tmp, vec_enc);
  vaddpd(xtmp1, src, xtmp1, vec_enc);
  evcvtpd2qq(dst, xtmp1, vec_enc);
  vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}

void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation with MXCSR.RC set to round towards -inf
  // and restore the original MXCSR.RC mode afterwards.
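  // For example, with round-down in effect: 2.5f + 0.5f = 3.0f -> 3, and -2.5f + 0.5f = -2.0f -> -2,
  // matching the half-up behaviour of Math.round.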
5194 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5195 5196 movl(tmp, jint_cast(0.5)); 5197 movq(xtmp1, tmp); 5198 vbroadcastss(xtmp1, xtmp1, vec_enc); 5199 vaddps(xtmp1, src , xtmp1, vec_enc); 5200 vcvtps2dq(dst, xtmp1, vec_enc); 5201 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5202 float_sign_flip, vec_enc); 5203 5204 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5205 } 5206 5207 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5208 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5209 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5210 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5211 // and re-instantiate original MXCSR.RC mode after that. 5212 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5213 5214 movl(tmp, jint_cast(0.5)); 5215 movq(xtmp1, tmp); 5216 vbroadcastss(xtmp1, xtmp1, vec_enc); 5217 vaddps(xtmp1, src , xtmp1, vec_enc); 5218 vcvtps2dq(dst, xtmp1, vec_enc); 5219 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5220 5221 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5222 } 5223 #endif // _LP64 5224 5225 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5226 BasicType from_elem_bt, BasicType to_elem_bt) { 5227 switch (from_elem_bt) { 5228 case T_BYTE: 5229 switch (to_elem_bt) { 5230 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5231 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5232 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5233 default: ShouldNotReachHere(); 5234 } 5235 break; 5236 case T_SHORT: 5237 switch (to_elem_bt) { 5238 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5239 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5240 default: ShouldNotReachHere(); 5241 } 5242 break; 5243 case T_INT: 5244 assert(to_elem_bt == T_LONG, ""); 5245 vpmovzxdq(dst, src, vlen_enc); 5246 break; 5247 default: 5248 ShouldNotReachHere(); 5249 } 5250 } 5251 5252 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5253 BasicType from_elem_bt, BasicType to_elem_bt) { 5254 switch (from_elem_bt) { 5255 case T_BYTE: 5256 switch (to_elem_bt) { 5257 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5258 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5259 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5260 default: ShouldNotReachHere(); 5261 } 5262 break; 5263 case T_SHORT: 5264 switch (to_elem_bt) { 5265 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5266 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5267 default: ShouldNotReachHere(); 5268 } 5269 break; 5270 case T_INT: 5271 assert(to_elem_bt == T_LONG, ""); 5272 vpmovsxdq(dst, src, vlen_enc); 5273 break; 5274 default: 5275 ShouldNotReachHere(); 5276 } 5277 } 5278 5279 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5280 BasicType dst_bt, BasicType src_bt, int vlen) { 5281 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5282 assert(vlen_enc != AVX_512bit, ""); 5283 5284 int dst_bt_size = type2aelembytes(dst_bt); 5285 int src_bt_size = type2aelembytes(src_bt); 5286 if (dst_bt_size > src_bt_size) { 5287 switch (dst_bt_size / src_bt_size) { 5288 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5289 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5290 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5291 default: ShouldNotReachHere(); 5292 } 5293 } else { 5294 assert(dst_bt_size < src_bt_size, ""); 5295 switch (src_bt_size / dst_bt_size) { 5296 case 2: { 5297 if (vlen_enc == AVX_128bit) { 5298 vpacksswb(dst, src, src, vlen_enc); 5299 } else { 5300 vpacksswb(dst, src, src, vlen_enc); 5301 vpermq(dst, dst, 0x08, vlen_enc); 5302 } 5303 break; 5304 } 5305 case 4: { 5306 if (vlen_enc == AVX_128bit) { 5307 vpackssdw(dst, src, src, vlen_enc); 5308 vpacksswb(dst, dst, dst, vlen_enc); 5309 } else { 5310 vpackssdw(dst, src, src, vlen_enc); 5311 vpermq(dst, dst, 0x08, vlen_enc); 5312 vpacksswb(dst, dst, dst, AVX_128bit); 5313 } 5314 break; 5315 } 5316 case 8: { 5317 if (vlen_enc == AVX_128bit) { 5318 vpshufd(dst, src, 0x08, vlen_enc); 5319 vpackssdw(dst, dst, dst, vlen_enc); 5320 vpacksswb(dst, dst, dst, vlen_enc); 5321 } else { 5322 vpshufd(dst, src, 0x08, vlen_enc); 5323 vpermq(dst, dst, 0x08, vlen_enc); 5324 vpackssdw(dst, dst, dst, AVX_128bit); 5325 vpacksswb(dst, dst, dst, AVX_128bit); 5326 } 5327 break; 5328 } 5329 default: ShouldNotReachHere(); 5330 } 5331 } 5332 } 5333 5334 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5335 bool merge, BasicType bt, int vlen_enc) { 5336 if (bt == T_INT) { 5337 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5338 } else { 5339 assert(bt == T_LONG, ""); 5340 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5341 } 5342 } 5343 5344 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5345 bool merge, BasicType bt, int vlen_enc) { 5346 if (bt == T_INT) { 5347 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5348 } else { 5349 assert(bt == T_LONG, ""); 5350 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5351 } 5352 } 5353 5354 #ifdef _LP64 5355 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5356 Register rtmp2, XMMRegister xtmp, int mask_len, 5357 int vec_enc) { 5358 int index = 0; 5359 int vindex = 0; 5360 mov64(rtmp1, 0x0101010101010101L); 5361 pdepq(rtmp1, src, rtmp1); 5362 if (mask_len > 8) { 5363 movq(rtmp2, src); 5364 vpxor(xtmp, xtmp, xtmp, vec_enc); 5365 movq(xtmp, rtmp1); 5366 } 5367 movq(dst, rtmp1); 5368 5369 mask_len -= 8; 5370 while (mask_len > 0) { 5371 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5372 index++; 5373 if ((index % 2) == 0) { 5374 pxor(xtmp, xtmp); 5375 } 5376 mov64(rtmp1, 0x0101010101010101L); 5377 shrq(rtmp2, 8); 5378 pdepq(rtmp1, rtmp2, rtmp1); 5379 pinsrq(xtmp, rtmp1, index % 2); 5380 vindex = index / 2; 5381 if (vindex) { 5382 // Write entire 16 byte vector when both 64 bit 5383 // lanes are update to save redundant instructions. 
5384 if (index % 2) { 5385 vinsertf128(dst, dst, xtmp, vindex); 5386 } 5387 } else { 5388 vmovdqu(dst, xtmp); 5389 } 5390 mask_len -= 8; 5391 } 5392 } 5393 5394 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5395 switch(opc) { 5396 case Op_VectorMaskTrueCount: 5397 popcntq(dst, tmp); 5398 break; 5399 case Op_VectorMaskLastTrue: 5400 if (VM_Version::supports_lzcnt()) { 5401 lzcntq(tmp, tmp); 5402 movl(dst, 63); 5403 subl(dst, tmp); 5404 } else { 5405 movl(dst, -1); 5406 bsrq(tmp, tmp); 5407 cmov32(Assembler::notZero, dst, tmp); 5408 } 5409 break; 5410 case Op_VectorMaskFirstTrue: 5411 if (VM_Version::supports_bmi1()) { 5412 if (masklen < 32) { 5413 orl(tmp, 1 << masklen); 5414 tzcntl(dst, tmp); 5415 } else if (masklen == 32) { 5416 tzcntl(dst, tmp); 5417 } else { 5418 assert(masklen == 64, ""); 5419 tzcntq(dst, tmp); 5420 } 5421 } else { 5422 if (masklen < 32) { 5423 orl(tmp, 1 << masklen); 5424 bsfl(dst, tmp); 5425 } else { 5426 assert(masklen == 32 || masklen == 64, ""); 5427 movl(dst, masklen); 5428 if (masklen == 32) { 5429 bsfl(tmp, tmp); 5430 } else { 5431 bsfq(tmp, tmp); 5432 } 5433 cmov32(Assembler::notZero, dst, tmp); 5434 } 5435 } 5436 break; 5437 case Op_VectorMaskToLong: 5438 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5439 break; 5440 default: assert(false, "Unhandled mask operation"); 5441 } 5442 } 5443 5444 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5445 int masklen, int masksize, int vec_enc) { 5446 assert(VM_Version::supports_popcnt(), ""); 5447 5448 if(VM_Version::supports_avx512bw()) { 5449 kmovql(tmp, mask); 5450 } else { 5451 assert(masklen <= 16, ""); 5452 kmovwl(tmp, mask); 5453 } 5454 5455 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5456 // operations needs to be clipped. 5457 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5458 andq(tmp, (1 << masklen) - 1); 5459 } 5460 5461 vector_mask_operation_helper(opc, dst, tmp, masklen); 5462 } 5463 5464 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5465 Register tmp, int masklen, BasicType bt, int vec_enc) { 5466 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5467 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5468 assert(VM_Version::supports_popcnt(), ""); 5469 5470 bool need_clip = false; 5471 switch(bt) { 5472 case T_BOOLEAN: 5473 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5474 vpxor(xtmp, xtmp, xtmp, vec_enc); 5475 vpsubb(xtmp, xtmp, mask, vec_enc); 5476 vpmovmskb(tmp, xtmp, vec_enc); 5477 need_clip = masklen < 16; 5478 break; 5479 case T_BYTE: 5480 vpmovmskb(tmp, mask, vec_enc); 5481 need_clip = masklen < 16; 5482 break; 5483 case T_SHORT: 5484 vpacksswb(xtmp, mask, mask, vec_enc); 5485 if (masklen >= 16) { 5486 vpermpd(xtmp, xtmp, 8, vec_enc); 5487 } 5488 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5489 need_clip = masklen < 16; 5490 break; 5491 case T_INT: 5492 case T_FLOAT: 5493 vmovmskps(tmp, mask, vec_enc); 5494 need_clip = masklen < 4; 5495 break; 5496 case T_LONG: 5497 case T_DOUBLE: 5498 vmovmskpd(tmp, mask, vec_enc); 5499 need_clip = masklen < 2; 5500 break; 5501 default: assert(false, "Unhandled type, %s", type2name(bt)); 5502 } 5503 5504 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5505 // operations needs to be clipped. 
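  // For example, with masklen == 4 only the low four bits of tmp are meaningful, so the
  // clip below reduces to tmp &= 0xF.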
5506 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5507 // need_clip implies masklen < 32 5508 andq(tmp, (1 << masklen) - 1); 5509 } 5510 5511 vector_mask_operation_helper(opc, dst, tmp, masklen); 5512 } 5513 5514 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5515 Register rtmp2, int mask_len) { 5516 kmov(rtmp1, src); 5517 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5518 mov64(rtmp2, -1L); 5519 pextq(rtmp2, rtmp2, rtmp1); 5520 kmov(dst, rtmp2); 5521 } 5522 5523 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5524 XMMRegister mask, Register rtmp, Register rscratch, 5525 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5526 int vec_enc) { 5527 assert(type2aelembytes(bt) >= 4, ""); 5528 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5529 address compress_perm_table = nullptr; 5530 address expand_perm_table = nullptr; 5531 if (type2aelembytes(bt) == 8) { 5532 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5533 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5534 vmovmskpd(rtmp, mask, vec_enc); 5535 } else { 5536 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5537 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5538 vmovmskps(rtmp, mask, vec_enc); 5539 } 5540 shlq(rtmp, 5); // for 32 byte permute row. 5541 if (opcode == Op_CompressV) { 5542 lea(rscratch, ExternalAddress(compress_perm_table)); 5543 } else { 5544 lea(rscratch, ExternalAddress(expand_perm_table)); 5545 } 5546 addptr(rtmp, rscratch); 5547 vmovdqu(permv, Address(rtmp)); 5548 vpermps(dst, permv, src, Assembler::AVX_256bit); 5549 vpxor(xtmp, xtmp, xtmp, vec_enc); 5550 // Blend the result with zero vector using permute mask, each column entry 5551 // in a permute table row contains either a valid permute index or a -1 (default) 5552 // value, this can potentially be used as a blending mask after 5553 // compressing/expanding the source vector lanes. 
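  // Illustrative example, assuming the 32-bit table layout described above: for an 8-lane vector
  // and mask 0b00000101 the compress row would be { 0, 2, -1, -1, -1, -1, -1, -1 }; vpermps then
  // gathers source lanes 0 and 2 into the low positions while the -1 entries (sign bit set)
  // select the zero vector in the blend below.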
5554 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5555 } 5556 5557 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5558 bool merge, BasicType bt, int vec_enc) { 5559 if (opcode == Op_CompressV) { 5560 switch(bt) { 5561 case T_BYTE: 5562 evpcompressb(dst, mask, src, merge, vec_enc); 5563 break; 5564 case T_CHAR: 5565 case T_SHORT: 5566 evpcompressw(dst, mask, src, merge, vec_enc); 5567 break; 5568 case T_INT: 5569 evpcompressd(dst, mask, src, merge, vec_enc); 5570 break; 5571 case T_FLOAT: 5572 evcompressps(dst, mask, src, merge, vec_enc); 5573 break; 5574 case T_LONG: 5575 evpcompressq(dst, mask, src, merge, vec_enc); 5576 break; 5577 case T_DOUBLE: 5578 evcompresspd(dst, mask, src, merge, vec_enc); 5579 break; 5580 default: 5581 fatal("Unsupported type %s", type2name(bt)); 5582 break; 5583 } 5584 } else { 5585 assert(opcode == Op_ExpandV, ""); 5586 switch(bt) { 5587 case T_BYTE: 5588 evpexpandb(dst, mask, src, merge, vec_enc); 5589 break; 5590 case T_CHAR: 5591 case T_SHORT: 5592 evpexpandw(dst, mask, src, merge, vec_enc); 5593 break; 5594 case T_INT: 5595 evpexpandd(dst, mask, src, merge, vec_enc); 5596 break; 5597 case T_FLOAT: 5598 evexpandps(dst, mask, src, merge, vec_enc); 5599 break; 5600 case T_LONG: 5601 evpexpandq(dst, mask, src, merge, vec_enc); 5602 break; 5603 case T_DOUBLE: 5604 evexpandpd(dst, mask, src, merge, vec_enc); 5605 break; 5606 default: 5607 fatal("Unsupported type %s", type2name(bt)); 5608 break; 5609 } 5610 } 5611 } 5612 #endif 5613 5614 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5615 KRegister ktmp1, int vec_enc) { 5616 if (opcode == Op_SignumVD) { 5617 vsubpd(dst, zero, one, vec_enc); 5618 // if src < 0 ? -1 : 1 5619 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5620 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5621 // if src == NaN, -0.0 or 0.0 return src. 5622 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5623 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5624 } else { 5625 assert(opcode == Op_SignumVF, ""); 5626 vsubps(dst, zero, one, vec_enc); 5627 // if src < 0 ? -1 : 1 5628 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5629 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5630 // if src == NaN, -0.0 or 0.0 return src. 5631 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5632 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5633 } 5634 } 5635 5636 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5637 XMMRegister xtmp1, int vec_enc) { 5638 if (opcode == Op_SignumVD) { 5639 vsubpd(dst, zero, one, vec_enc); 5640 // if src < 0 ? -1 : 1 5641 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5642 // if src == NaN, -0.0 or 0.0 return src. 5643 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5644 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5645 } else { 5646 assert(opcode == Op_SignumVF, ""); 5647 vsubps(dst, zero, one, vec_enc); 5648 // if src < 0 ? -1 : 1 5649 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5650 // if src == NaN, -0.0 or 0.0 return src. 
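  // (This mirrors Math.signum: signum(3.5f) == 1.0f, signum(-0.0f) == -0.0f, and signum(Float.NaN) is NaN.)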
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  }
}

//
// Following is the lookup table based popcount computation algorithm:
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
// a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of vector lane by 4 positions.
// c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
//    count of all the bytes of a quadword.
// f. Perform step e. for upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.
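//
// A scalar sketch of steps a-d for a single byte (illustrative only, not emitted code):
//   static const uint8_t LUT[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte(uint8_t b) { return LUT[b & 0x0F] + LUT[b >> 4]; }
// Steps e-h then widen these per-byte counts to the requested element size.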
5733 5734 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5735 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5736 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5737 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5738 vpsrlw(dst, src, 4, vec_enc); 5739 vpand(dst, dst, xtmp1, vec_enc); 5740 vpand(xtmp1, src, xtmp1, vec_enc); 5741 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5742 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5743 vpshufb(dst, xtmp2, dst, vec_enc); 5744 vpaddb(dst, dst, xtmp1, vec_enc); 5745 } 5746 5747 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5748 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5749 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5750 // Following code is as per steps e,f,g and h of above algorithm. 5751 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5752 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5753 vpsadbw(dst, dst, xtmp2, vec_enc); 5754 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5755 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5756 vpackuswb(dst, xtmp1, dst, vec_enc); 5757 } 5758 5759 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5760 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5761 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5762 // Add the popcount of upper and lower bytes of word. 5763 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5764 vpsrlw(dst, xtmp1, 8, vec_enc); 5765 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5766 vpaddw(dst, dst, xtmp1, vec_enc); 5767 } 5768 5769 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5770 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5771 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5772 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5773 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5774 } 5775 5776 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5777 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5778 switch(bt) { 5779 case T_LONG: 5780 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5781 break; 5782 case T_INT: 5783 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5784 break; 5785 case T_CHAR: 5786 case T_SHORT: 5787 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5788 break; 5789 case T_BYTE: 5790 case T_BOOLEAN: 5791 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5792 break; 5793 default: 5794 fatal("Unsupported type %s", type2name(bt)); 5795 break; 5796 } 5797 } 5798 5799 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5800 KRegister mask, bool merge, int vec_enc) { 5801 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5802 switch(bt) { 5803 case T_LONG: 5804 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5805 evpopcntq(dst, mask, src, merge, vec_enc); 5806 break; 5807 case T_INT: 5808 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5809 evpopcntd(dst, mask, src, merge, vec_enc); 5810 break; 5811 case T_CHAR: 5812 case T_SHORT: 5813 assert(VM_Version::supports_avx512_bitalg(), ""); 5814 evpopcntw(dst, mask, src, merge, vec_enc); 5815 break; 5816 case T_BYTE: 5817 case T_BOOLEAN: 5818 assert(VM_Version::supports_avx512_bitalg(), ""); 5819 evpopcntb(dst, mask, 
src, merge, vec_enc); 5820 break; 5821 default: 5822 fatal("Unsupported type %s", type2name(bt)); 5823 break; 5824 } 5825 } 5826 5827 #ifndef _LP64 5828 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5829 assert(VM_Version::supports_avx512bw(), ""); 5830 kmovdl(tmp, src); 5831 kunpckdql(dst, tmp, tmp); 5832 } 5833 #endif 5834 5835 // Bit reversal algorithm first reverses the bits of each byte followed by 5836 // a byte level reversal for multi-byte primitive types (short/int/long). 5837 // Algorithm performs a lookup table access to get reverse bit sequence 5838 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5839 // is obtained by swapping the reverse bit sequences of upper and lower 5840 // nibble of a byte. 5841 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5842 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5843 if (VM_Version::supports_avx512vlbw()) { 5844 5845 // Get the reverse bit sequence of lower nibble of each byte. 5846 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5847 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5848 evpandq(dst, xtmp2, src, vec_enc); 5849 vpshufb(dst, xtmp1, dst, vec_enc); 5850 vpsllq(dst, dst, 4, vec_enc); 5851 5852 // Get the reverse bit sequence of upper nibble of each byte. 5853 vpandn(xtmp2, xtmp2, src, vec_enc); 5854 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5855 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5856 5857 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5858 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5859 evporq(xtmp2, dst, xtmp2, vec_enc); 5860 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5861 5862 } else if(vec_enc == Assembler::AVX_512bit) { 5863 // Shift based bit reversal. 5864 assert(bt == T_LONG || bt == T_INT, ""); 5865 5866 // Swap lower and upper nibble of each byte. 5867 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5868 5869 // Swap two least and most significant bits of each nibble. 5870 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5871 5872 // Swap adjacent pair of bits. 5873 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5874 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5875 5876 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5877 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5878 } else { 5879 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5880 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5881 5882 // Get the reverse bit sequence of lower nibble of each byte. 5883 vpand(dst, xtmp2, src, vec_enc); 5884 vpshufb(dst, xtmp1, dst, vec_enc); 5885 vpsllq(dst, dst, 4, vec_enc); 5886 5887 // Get the reverse bit sequence of upper nibble of each byte. 5888 vpandn(xtmp2, xtmp2, src, vec_enc); 5889 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5890 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5891 5892 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5893 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
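  // Worked example for one byte: src = 0b10110001; the reversed low nibble 0001 -> 1000,
  // shifted left by 4, gives 0b10000000; the reversed high nibble 1011 -> 1101 stays in the
  // low half as 0b00001101; OR-ing the two yields 0b10001101, the bit-reversed byte.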
5894 vpor(xtmp2, dst, xtmp2, vec_enc); 5895 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5896 } 5897 } 5898 5899 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5900 XMMRegister xtmp, Register rscratch) { 5901 assert(VM_Version::supports_gfni(), ""); 5902 assert(rscratch != noreg || always_reachable(mask), "missing"); 5903 5904 // Galois field instruction based bit reversal based on following algorithm. 5905 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5906 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5907 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5908 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5909 } 5910 5911 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5912 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5913 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5914 evpandq(dst, xtmp1, src, vec_enc); 5915 vpsllq(dst, dst, nbits, vec_enc); 5916 vpandn(xtmp1, xtmp1, src, vec_enc); 5917 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5918 evporq(dst, dst, xtmp1, vec_enc); 5919 } 5920 5921 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5922 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5923 // Shift based bit reversal. 5924 assert(VM_Version::supports_evex(), ""); 5925 switch(bt) { 5926 case T_LONG: 5927 // Swap upper and lower double word of each quad word. 5928 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5929 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5930 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5931 break; 5932 case T_INT: 5933 // Swap upper and lower word of each double word. 5934 evprord(xtmp1, k0, src, 16, true, vec_enc); 5935 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5936 break; 5937 case T_CHAR: 5938 case T_SHORT: 5939 // Swap upper and lower byte of each word. 5940 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5941 break; 5942 case T_BYTE: 5943 evmovdquq(dst, k0, src, true, vec_enc); 5944 break; 5945 default: 5946 fatal("Unsupported type %s", type2name(bt)); 5947 break; 5948 } 5949 } 5950 5951 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5952 if (bt == T_BYTE) { 5953 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5954 evmovdquq(dst, k0, src, true, vec_enc); 5955 } else { 5956 vmovdqu(dst, src); 5957 } 5958 return; 5959 } 5960 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5961 // pre-computed shuffle indices. 
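  // For example, for T_INT the permutation is expected to map bytes {0,1,2,3} -> {3,2,1,0} within
  // every 4-byte element (assumed layout of the vector_reverse_byte_perm_mask_int stub).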
5962 switch(bt) { 5963 case T_LONG: 5964 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5965 break; 5966 case T_INT: 5967 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5968 break; 5969 case T_CHAR: 5970 case T_SHORT: 5971 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5972 break; 5973 default: 5974 fatal("Unsupported type %s", type2name(bt)); 5975 break; 5976 } 5977 vpshufb(dst, src, dst, vec_enc); 5978 } 5979 5980 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5981 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5982 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5983 assert(is_integral_type(bt), ""); 5984 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5985 assert(VM_Version::supports_avx512cd(), ""); 5986 switch(bt) { 5987 case T_LONG: 5988 evplzcntq(dst, ktmp, src, merge, vec_enc); 5989 break; 5990 case T_INT: 5991 evplzcntd(dst, ktmp, src, merge, vec_enc); 5992 break; 5993 case T_SHORT: 5994 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5995 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5996 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5997 vpunpckhwd(dst, xtmp1, src, vec_enc); 5998 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5999 vpackusdw(dst, xtmp2, dst, vec_enc); 6000 break; 6001 case T_BYTE: 6002 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6003 // accessing the lookup table. 6004 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6005 // accessing the lookup table. 6006 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6007 assert(VM_Version::supports_avx512bw(), ""); 6008 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6009 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6010 vpand(xtmp2, dst, src, vec_enc); 6011 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6012 vpsrlw(xtmp3, src, 4, vec_enc); 6013 vpand(xtmp3, dst, xtmp3, vec_enc); 6014 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6015 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6016 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6017 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6018 break; 6019 default: 6020 fatal("Unsupported type %s", type2name(bt)); 6021 break; 6022 } 6023 } 6024 6025 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6026 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6027 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6028 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6029 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6030 // accessing the lookup table. 6031 vpand(dst, xtmp2, src, vec_enc); 6032 vpshufb(dst, xtmp1, dst, vec_enc); 6033 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6034 // accessing the lookup table. 6035 vpsrlw(xtmp3, src, 4, vec_enc); 6036 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6037 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6038 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
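  // Worked example for one byte (assuming the LUT maps a zero nibble to a count of 4):
  // src = 0b00000110 -> high-nibble count T2 = 4, low-nibble count T1 = 1; the high nibble is zero,
  // so the counts are added, giving 5 leading zeros. For src = 0b01100000 only T2 = 1 is used.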
6039 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6040 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6041 vpaddb(dst, dst, xtmp2, vec_enc); 6042 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6043 } 6044 6045 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6046 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6047 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6048 // Add zero counts of lower byte and upper byte of a word if 6049 // upper byte holds a zero value. 6050 vpsrlw(xtmp3, src, 8, vec_enc); 6051 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6052 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6053 vpsllw(xtmp2, dst, 8, vec_enc); 6054 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6055 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6056 vpsrlw(dst, dst, 8, vec_enc); 6057 } 6058 6059 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6060 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6061 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6062 // hence biased exponent can be used to compute leading zero count as per 6063 // following formula:- 6064 // LZCNT = 32 - (biased_exp - 127) 6065 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6066 6067 // Broadcast 0xFF 6068 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6069 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6070 6071 // Extract biased exponent. 6072 vcvtdq2ps(dst, src, vec_enc); 6073 vpsrld(dst, dst, 23, vec_enc); 6074 vpand(dst, dst, xtmp1, vec_enc); 6075 6076 // Broadcast 127. 6077 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6078 // Exponent = biased_exp - 127 6079 vpsubd(dst, dst, xtmp1, vec_enc); 6080 6081 // Exponent = Exponent + 1 6082 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6083 vpaddd(dst, dst, xtmp3, vec_enc); 6084 6085 // Replace -ve exponent with zero, exponent is -ve when src 6086 // lane contains a zero value. 6087 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6088 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6089 6090 // Rematerialize broadcast 32. 6091 vpslld(xtmp1, xtmp3, 5, vec_enc); 6092 // Exponent is 32 if corresponding source lane contains max_int value. 6093 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6094 // LZCNT = 32 - exponent 6095 vpsubd(dst, xtmp1, dst, vec_enc); 6096 6097 // Replace LZCNT with a value 1 if corresponding source lane 6098 // contains max_int value. 6099 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6100 6101 // Replace biased_exp with 0 if source lane value is less than zero. 6102 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6103 vblendvps(dst, dst, xtmp2, src, vec_enc); 6104 } 6105 6106 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6107 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6108 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6109 // Add zero counts of lower word and upper word of a double word if 6110 // upper word holds a zero value. 6111 vpsrld(xtmp3, src, 16, vec_enc); 6112 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6113 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6114 vpslld(xtmp2, dst, 16, vec_enc); 6115 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6116 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6117 vpsrld(dst, dst, 16, vec_enc); 6118 // Add zero counts of lower doubleword and upper doubleword of a 6119 // quadword if upper doubleword holds a zero value. 
6120 vpsrlq(xtmp3, src, 32, vec_enc); 6121 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6122 vpsllq(xtmp2, dst, 32, vec_enc); 6123 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6124 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6125 vpsrlq(dst, dst, 32, vec_enc); 6126 } 6127 6128 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6129 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6130 Register rtmp, int vec_enc) { 6131 assert(is_integral_type(bt), "unexpected type"); 6132 assert(vec_enc < Assembler::AVX_512bit, ""); 6133 switch(bt) { 6134 case T_LONG: 6135 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6136 break; 6137 case T_INT: 6138 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6139 break; 6140 case T_SHORT: 6141 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6142 break; 6143 case T_BYTE: 6144 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6145 break; 6146 default: 6147 fatal("Unsupported type %s", type2name(bt)); 6148 break; 6149 } 6150 } 6151 6152 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6153 switch(bt) { 6154 case T_BYTE: 6155 vpsubb(dst, src1, src2, vec_enc); 6156 break; 6157 case T_SHORT: 6158 vpsubw(dst, src1, src2, vec_enc); 6159 break; 6160 case T_INT: 6161 vpsubd(dst, src1, src2, vec_enc); 6162 break; 6163 case T_LONG: 6164 vpsubq(dst, src1, src2, vec_enc); 6165 break; 6166 default: 6167 fatal("Unsupported type %s", type2name(bt)); 6168 break; 6169 } 6170 } 6171 6172 // Trailing zero count computation is based on leading zero count operation as per 6173 // following equation. All AVX3 targets support AVX512CD feature which offers 6174 // direct vector instruction to compute leading zero count. 
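// For example, with 8-bit lanes and x = 0b00010100: (x - 1) & ~x = 0b00000011, whose CLZ is 6,
// so the identity below yields CTZ = 8 - 6 = 2, matching the two trailing zero bits of x.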
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  //
// remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6270 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6271 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6272 movl(rdx, rax);
6273 subl(rax, divisor);
6274 if (VM_Version::supports_bmi1()) {
6275 andnl(rax, rax, rdx);
6276 } else {
6277 notl(rax);
6278 andl(rax, rdx);
6279 }
6280 movl(tmp, rax);
6281 shrl(rax, 31); // quotient
6282 sarl(tmp, 31);
6283 andl(tmp, divisor);
6284 subl(rdx, tmp); // remainder
6285 bind(done);
6286 }
6287
6288 #ifdef _LP64
6289 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6290 XMMRegister xtmp2, Register rtmp) {
6291 if (VM_Version::supports_gfni()) {
6292 // Galois field instruction based bit reversal, following the algorithm described at
6293 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6294 mov64(rtmp, 0x8040201008040201L);
6295 movq(xtmp1, src);
6296 movq(xtmp2, rtmp);
6297 gf2p8affineqb(xtmp1, xtmp2, 0);
6298 movq(dst, xtmp1);
6299 } else {
6300 // Swap even and odd numbered bits.
6301 movl(rtmp, src);
6302 andl(rtmp, 0x55555555);
6303 shll(rtmp, 1);
6304 movl(dst, src);
6305 andl(dst, 0xAAAAAAAA);
6306 shrl(dst, 1);
6307 orl(dst, rtmp);
6308
6309 // Swap LSB and MSB 2 bits of each nibble.
6310 movl(rtmp, dst);
6311 andl(rtmp, 0x33333333);
6312 shll(rtmp, 2);
6313 andl(dst, 0xCCCCCCCC);
6314 shrl(dst, 2);
6315 orl(dst, rtmp);
6316
6317 // Swap LSB and MSB 4 bits of each byte.
6318 movl(rtmp, dst);
6319 andl(rtmp, 0x0F0F0F0F);
6320 shll(rtmp, 4);
6321 andl(dst, 0xF0F0F0F0);
6322 shrl(dst, 4);
6323 orl(dst, rtmp);
6324 }
6325 bswapl(dst);
6326 }
6327
6328 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6329 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6330 if (VM_Version::supports_gfni()) {
6331 // Galois field instruction based bit reversal, following the algorithm described at
6332 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6333 mov64(rtmp1, 0x8040201008040201L);
6334 movq(xtmp1, src);
6335 movq(xtmp2, rtmp1);
6336 gf2p8affineqb(xtmp1, xtmp2, 0);
6337 movq(dst, xtmp1);
6338 } else {
6339 // Swap even and odd numbered bits.
6340 movq(rtmp1, src);
6341 mov64(rtmp2, 0x5555555555555555L);
6342 andq(rtmp1, rtmp2);
6343 shlq(rtmp1, 1);
6344 movq(dst, src);
6345 notq(rtmp2);
6346 andq(dst, rtmp2);
6347 shrq(dst, 1);
6348 orq(dst, rtmp1);
6349
6350 // Swap LSB and MSB 2 bits of each nibble.
6351 movq(rtmp1, dst);
6352 mov64(rtmp2, 0x3333333333333333L);
6353 andq(rtmp1, rtmp2);
6354 shlq(rtmp1, 2);
6355 notq(rtmp2);
6356 andq(dst, rtmp2);
6357 shrq(dst, 2);
6358 orq(dst, rtmp1);
6359
6360 // Swap LSB and MSB 4 bits of each byte.
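// Illustrative trace of the three swap stages in this non-GFNI path (comment only):
// the byte 0b10110001 becomes 0b01110010 after the even/odd bit swap, 0b11011000
// after the 2-bit group swap, and 0b10001101 after the nibble swap below, i.e. the
// original byte with its bits reversed; bswapq() then reverses the byte order.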
6361 movq(rtmp1, dst);
6362 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6363 andq(rtmp1, rtmp2);
6364 shlq(rtmp1, 4);
6365 notq(rtmp2);
6366 andq(dst, rtmp2);
6367 shrq(dst, 4);
6368 orq(dst, rtmp1);
6369 }
6370 bswapq(dst);
6371 }
6372
6373 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6374 Label done;
6375 Label neg_divisor_fastpath;
6376 cmpq(divisor, 0);
6377 jccb(Assembler::less, neg_divisor_fastpath);
6378 xorl(rdx, rdx);
6379 divq(divisor);
6380 jmpb(done);
6381 bind(neg_divisor_fastpath);
6382 // Fastpath for divisor < 0:
6383 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6384 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6385 movq(rdx, rax);
6386 subq(rdx, divisor);
6387 if (VM_Version::supports_bmi1()) {
6388 andnq(rax, rdx, rax);
6389 } else {
6390 notq(rdx);
6391 andq(rax, rdx);
6392 }
6393 shrq(rax, 63);
6394 bind(done);
6395 }
6396
6397 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6398 Label done;
6399 Label neg_divisor_fastpath;
6400 cmpq(divisor, 0);
6401 jccb(Assembler::less, neg_divisor_fastpath);
6402 xorq(rdx, rdx);
6403 divq(divisor);
6404 jmp(done);
6405 bind(neg_divisor_fastpath);
6406 // Fastpath when divisor < 0:
6407 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6408 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6409 movq(rdx, rax);
6410 subq(rax, divisor);
6411 if (VM_Version::supports_bmi1()) {
6412 andnq(rax, rax, rdx);
6413 } else {
6414 notq(rax);
6415 andq(rax, rdx);
6416 }
6417 sarq(rax, 63);
6418 andq(rax, divisor);
6419 subq(rdx, rax);
6420 bind(done);
6421 }
6422
6423 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6424 Label done;
6425 Label neg_divisor_fastpath;
6426 cmpq(divisor, 0);
6427 jccb(Assembler::less, neg_divisor_fastpath);
6428 xorq(rdx, rdx);
6429 divq(divisor);
6430 jmp(done);
6431 bind(neg_divisor_fastpath);
6432 // Fastpath for divisor < 0:
6433 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6434 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6435 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6436 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6437 movq(rdx, rax);
6438 subq(rax, divisor);
6439 if (VM_Version::supports_bmi1()) {
6440 andnq(rax, rax, rdx);
6441 } else {
6442 notq(rax);
6443 andq(rax, rdx);
6444 }
6445 movq(tmp, rax);
6446 shrq(rax, 63); // quotient
6447 sarq(tmp, 63);
6448 andq(tmp, divisor);
6449 subq(rdx, tmp); // remainder
6450 bind(done);
6451 }
6452 #endif
6453
6454 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6455 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6456 int vlen_enc) {
6457 assert(VM_Version::supports_avx512bw(), "");
6458 // Byte shuffles are in-lane operations and indices are determined using the
6459 // lower 4 bits of each shuffle lane, thus all shuffle indices are
6460 // normalized to the index range 0-15. This means that indices which differ
6461 // only by a multiple of 16 select the same relative position within a
6462 // 128 bit lane, e.g. shuffle indices 5, 21, 37 and 53 all select the
6463 // element at position 5 of their respective 128 bit lanes.
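// Illustration of the lane-selection scheme below (comment only): a shuffle index
// of 37 selects element 37 & 0xF = 5 within a lane, and because 32 <= 37 < 48 the
// masked compares route that byte to the broadcast of the third 128 bit lane of src.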
6464 movl(rtmp, 16);
6465 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6466
6467 // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6468 // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
6469 // original shuffle indices and move the shuffled lanes corresponding to the true
6470 // mask to the destination vector.
6471 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6472 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6473 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6474
6475 // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
6476 // broadcasting the second 128 bit lane.
6477 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6478 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6479 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6480 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6481 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6482
6483 // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
6484 // broadcasting the third 128 bit lane.
6485 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6486 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6487 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6488 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6489 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6490
6491 // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
6492 // broadcasting the fourth 128 bit lane.
6493 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6494 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6495 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6496 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6497 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6498 }
6499
6500 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6501 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6502 if (vlen_enc == AVX_128bit) {
6503 vpermilps(dst, src, shuffle, vlen_enc);
6504 } else if (bt == T_INT) {
6505 vpermd(dst, shuffle, src, vlen_enc);
6506 } else {
6507 assert(bt == T_FLOAT, "");
6508 vpermps(dst, shuffle, src, vlen_enc);
6509 }
6510 }
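// Note on vector_rearrange_int_float (comment only): it behaves like the scalar loop
// dst[i] = src[shuffle[i]] over 32-bit lanes, e.g. four floats {10, 20, 30, 40} rearranged
// with shuffle {3, 0, 2, 1} yield {40, 10, 30, 20}. For 128 bit vectors the in-lane
// vpermilps form suffices, while wider vectors need the cross-lane vpermd/vpermps forms.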