1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. These methods would accept arguments as 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 188 // indications in the icc.ZFlag. 
fast_lock and fast_unlock would simply 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 190 // In practice, however, the # of lock sites is bounded and is usually small. 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 192 // if the processor uses simple bimodal branch predictors keyed by EIP 193 // Since the helper routines would be called from multiple synchronization 194 // sites. 195 // 196 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 198 // to those specialized methods. That'd give us a mostly platform-independent 199 // implementation that the JITs could optimize and inline at their pleasure. 200 // Done correctly, the only time we'd need to cross to native could would be 201 // to park() or unpark() threads. We'd also need a few more unsafe operators 202 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 203 // (b) explicit barriers or fence operations. 204 // 205 // TODO: 206 // 207 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr). 208 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals. 209 // Given TLAB allocation, Self is usually manifested in a register, so passing it into 210 // the lock operators would typically be faster than reifying Self. 211 // 212 // * Ideally I'd define the primitives as: 213 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. 214 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED 215 // Unfortunately ADLC bugs prevent us from expressing the ideal form. 216 // Instead, we're stuck with a rather awkward and brittle register assignments below. 217 // Furthermore the register assignments are overconstrained, possibly resulting in 218 // sub-optimal code near the synchronization site. 219 // 220 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. 221 // Alternately, use a better sp-proximity test. 222 // 223 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 224 // Either one is sufficient to uniquely identify a thread. 225 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 226 // 227 // * Intrinsify notify() and notifyAll() for the common cases where the 228 // object is locked by the calling thread but the waitlist is empty. 229 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 230 // 231 // * use jccb and jmpb instead of jcc and jmp to improve code density. 232 // But beware of excessive branch density on AMD Opterons. 233 // 234 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 235 // or failure of the fast path. If the fast path fails then we pass 236 // control to the slow path, typically in C. In fast_lock and 237 // fast_unlock we often branch to DONE_LABEL, just to find that C2 238 // will emit a conditional branch immediately after the node. 239 // So we have branches to branches and lots of ICC.ZF games. 240 // Instead, it might be better to have C2 pass a "FailureLabel" 241 // into fast_lock and fast_unlock. In the case of success, control 242 // will drop through the node. ICC.ZF is undefined at exit. 
243 // In the case of failure, the node will branch directly to the 244 // FailureLabel 245 246 247 // obj: object to lock 248 // box: on-stack box address (displaced header location) - KILLED 249 // rax,: tmp -- KILLED 250 // scr: tmp -- KILLED 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 252 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 253 Metadata* method_data) { 254 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 255 // Ensure the register assignments are disjoint 256 assert(tmpReg == rax, ""); 257 assert(cx1Reg == noreg, ""); 258 assert(cx2Reg == noreg, ""); 259 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 260 261 // Possible cases that we'll encounter in fast_lock 262 // ------------------------------------------------ 263 // * Inflated 264 // -- unlocked 265 // -- Locked 266 // = by self 267 // = by other 268 // * neutral 269 // * stack-locked 270 // -- by self 271 // = sp-proximity test hits 272 // = sp-proximity test generates false-negative 273 // -- by other 274 // 275 276 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 277 278 if (DiagnoseSyncOnValueBasedClasses != 0) { 279 load_klass(tmpReg, objReg, scrReg); 280 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 281 jcc(Assembler::notZero, DONE_LABEL); 282 } 283 284 if (LockingMode == LM_MONITOR) { 285 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 286 testptr(objReg, objReg); 287 } else { 288 assert(LockingMode == LM_LEGACY, "must be"); 289 290 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 291 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 292 jcc(Assembler::notZero, IsInflated); 293 294 // Attempt stack-locking ... 295 orptr (tmpReg, markWord::unlocked_value); 296 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 297 lock(); 298 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 299 jcc(Assembler::equal, COUNT); // Success 300 301 // Recursive locking. 302 // The object is stack-locked: markword contains stack pointer to BasicLock. 303 // Locked by current thread if difference with current SP is less than one page. 304 subptr(tmpReg, rsp); 305 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 306 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) ); 307 movptr(Address(boxReg, 0), tmpReg); 308 } 309 jmp(DONE_LABEL); 310 311 bind(IsInflated); 312 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 313 314 #ifndef _LP64 315 // The object is inflated. 316 317 // boxReg refers to the on-stack BasicLock in the current frame. 318 // We'd like to write: 319 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices. 320 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers 321 // additional latency as we have another ST in the store buffer that must drain. 322 323 // avoid ST-before-CAS 324 // register juggle because we need tmpReg for cmpxchgptr below 325 movptr(scrReg, boxReg); 326 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] 327 328 // Optimistic form: consider XORL tmpReg,tmpReg 329 movptr(tmpReg, NULL_WORD); 330 331 // Appears unlocked - try to swing _owner from null to non-null. 
332 // Ideally, I'd manifest "Self" with get_thread and then attempt 333 // to CAS the register containing thread id into m->Owner. 334 // But we don't have enough registers, so instead we can either try to CAS 335 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 336 // we later store thread id into m->Owner. Transiently storing a stack address 337 // (rsp or the address of the box) into m->owner is harmless. 338 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 339 lock(); 340 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 341 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 342 // If we weren't able to swing _owner from null to the BasicLock 343 // then take the slow path. 344 jccb (Assembler::notZero, NO_COUNT); 345 // update _owner from BasicLock to thread 346 get_thread (scrReg); // beware: clobbers ICCs 347 movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset())); 348 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 349 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 350 351 // If the CAS fails we can either retry or pass control to the slow path. 352 // We use the latter tactic. 353 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 354 // If the CAS was successful ... 355 // Self has acquired the lock 356 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 357 // Intentional fall-through into DONE_LABEL ... 358 #else // _LP64 359 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 360 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 361 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 362 363 // It's inflated and we use scrReg for ObjectMonitor* in this section. 364 movq(scrReg, tmpReg); 365 xorq(tmpReg, tmpReg); 366 movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset())); 367 lock(); 368 cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 369 370 // Propagate ICC.ZF from CAS above into DONE_LABEL. 371 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 372 373 cmpptr(boxReg, rax); // Check if we are already the owner (recursive lock) 374 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 375 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 376 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 377 #endif // _LP64 378 bind(DONE_LABEL); 379 380 // ZFlag == 1 count in fast path 381 // ZFlag == 0 count in slow path 382 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 383 384 bind(COUNT); 385 // Count monitors in fast path 386 increment(Address(thread, JavaThread::held_monitor_count_offset())); 387 388 xorl(tmpReg, tmpReg); // Set ZF == 1 389 390 bind(NO_COUNT); 391 392 // At NO_COUNT the icc ZFlag is set as follows ... 393 // fast_unlock uses the same protocol. 394 // ZFlag == 1 -> Success 395 // ZFlag == 0 -> Failure - force control through the slow path 396 } 397 398 // obj: object to unlock 399 // box: box address (displaced header location), killed. Must be EAX. 400 // tmp: killed, cannot be obj nor box. 401 // 402 // Some commentary on balanced locking: 403 // 404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 
405 // Methods that don't have provably balanced locking are forced to run in the 406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 407 // The interpreter provides two properties: 408 // I1: At return-time the interpreter automatically and quietly unlocks any 409 // objects acquired the current activation (frame). Recall that the 410 // interpreter maintains an on-stack list of locks currently held by 411 // a frame. 412 // I2: If a method attempts to unlock an object that is not held by the 413 // the frame the interpreter throws IMSX. 414 // 415 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 416 // B() doesn't have provably balanced locking so it runs in the interpreter. 417 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 418 // is still locked by A(). 419 // 420 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 421 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 422 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 423 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 424 // Arguably given that the spec legislates the JNI case as undefined our implementation 425 // could reasonably *avoid* checking owner in fast_unlock(). 426 // In the interest of performance we elide m->Owner==Self check in unlock. 427 // A perfectly viable alternative is to elide the owner check except when 428 // Xcheck:jni is enabled. 429 430 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) { 431 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 432 assert(boxReg == rax, ""); 433 assert_different_registers(objReg, boxReg, tmpReg); 434 435 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 436 437 if (LockingMode == LM_LEGACY) { 438 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 439 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 440 } 441 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 442 if (LockingMode != LM_MONITOR) { 443 testptr(tmpReg, markWord::monitor_value); // Inflated? 444 jcc(Assembler::zero, Stacked); 445 } 446 447 // It's inflated. 448 449 // Despite our balanced locking property we still check that m->_owner == Self 450 // as java routines or native JNI code called by this thread might 451 // have released the lock. 452 // Refer to the comments in synchronizer.cpp for how we might encode extra 453 // state in _succ so we can avoid fetching EntryList|cxq. 454 // 455 // If there's no contention try a 1-0 exit. That is, exit without 456 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 457 // we detect and recover from the race that the 1-0 exit admits. 458 // 459 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 460 // before it STs null into _owner, releasing the lock. Updates 461 // to data protected by the critical section must be visible before 462 // we drop the lock (and thus before any other thread could acquire 463 // the lock and observe the fields protected by the lock). 464 // IA32's memory-model is SPO, so STs are ordered with respect to 465 // each other and there's no need for an explicit barrier (fence). 466 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 
467 Label LSuccess, LNotRecursive; 468 469 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 470 jccb(Assembler::equal, LNotRecursive); 471 472 // Recursive inflated unlock 473 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 474 jmpb(LSuccess); 475 476 bind(LNotRecursive); 477 478 // Set owner to null. 479 // Release to satisfy the JMM 480 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 481 // We need a full fence after clearing owner to avoid stranding. 482 // StoreLoad achieves this. 483 membar(StoreLoad); 484 485 // Check if the entry lists are empty. 486 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 487 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 488 jccb(Assembler::zero, LSuccess); // If so we are done. 489 490 // Check if there is a successor. 491 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 492 jccb(Assembler::notZero, LSuccess); // If so we are done. 493 494 // Save the monitor pointer in the current thread, so we can try to 495 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 496 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 497 #ifndef _LP64 498 get_thread(boxReg); 499 movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 500 #else // _LP64 501 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 502 #endif 503 504 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 505 jmpb (DONE_LABEL); 506 507 bind (LSuccess); 508 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 509 jmpb (DONE_LABEL); 510 511 if (LockingMode == LM_LEGACY) { 512 bind (Stacked); 513 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 514 lock(); 515 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 516 // Intentional fall-thru into DONE_LABEL 517 } 518 519 bind(DONE_LABEL); 520 521 // ZFlag == 1 count in fast path 522 // ZFlag == 0 count in slow path 523 jccb(Assembler::notZero, NO_COUNT); 524 525 bind(COUNT); 526 527 if (LockingMode == LM_LEGACY) { 528 // Count monitors in fast path 529 #ifndef _LP64 530 get_thread(tmpReg); 531 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 532 #else // _LP64 533 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 534 #endif 535 } 536 537 xorl(tmpReg, tmpReg); // Set ZF == 1 538 539 bind(NO_COUNT); 540 } 541 542 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 543 Register t, Register thread) { 544 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 545 assert(rax_reg == rax, "Used for CAS"); 546 assert_different_registers(obj, box, rax_reg, t, thread); 547 548 // Handle inflated monitor. 549 Label inflated; 550 // Finish fast lock successfully. ZF value is irrelevant. 551 Label locked; 552 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 553 Label slow_path; 554 555 if (UseObjectMonitorTable) { 556 // Clear cache in case fast locking succeeds. 557 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 558 } 559 560 if (DiagnoseSyncOnValueBasedClasses != 0) { 561 load_klass(rax_reg, obj, t); 562 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 563 jcc(Assembler::notZero, slow_path); 564 } 565 566 const Register mark = t; 567 568 { // Lightweight Lock 569 570 Label push; 571 572 const Register top = UseObjectMonitorTable ? 
rax_reg : box; 573 574 // Load the mark. 575 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 576 577 // Prefetch top. 578 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 579 580 // Check for monitor (0b10). 581 testptr(mark, markWord::monitor_value); 582 jcc(Assembler::notZero, inflated); 583 584 // Check if lock-stack is full. 585 cmpl(top, LockStack::end_offset() - 1); 586 jcc(Assembler::greater, slow_path); 587 588 // Check if recursive. 589 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 590 jccb(Assembler::equal, push); 591 592 // Try to lock. Transition lock bits 0b01 => 0b00 593 movptr(rax_reg, mark); 594 orptr(rax_reg, markWord::unlocked_value); 595 andptr(mark, ~(int32_t)markWord::unlocked_value); 596 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 597 jcc(Assembler::notEqual, slow_path); 598 599 if (UseObjectMonitorTable) { 600 // Need to reload top, clobbered by CAS. 601 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 602 } 603 bind(push); 604 // After successful lock, push object on lock-stack. 605 movptr(Address(thread, top), obj); 606 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 607 jmpb(locked); 608 } 609 610 { // Handle inflated monitor. 611 bind(inflated); 612 613 const Register monitor = t; 614 615 if (!UseObjectMonitorTable) { 616 assert(mark == monitor, "should be the same here"); 617 } else { 618 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 619 // Fetch ObjectMonitor* from the cache or take the slow-path. 620 Label monitor_found; 621 622 // Load cache address 623 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 624 625 const int num_unrolled = 2; 626 for (int i = 0; i < num_unrolled; i++) { 627 cmpptr(obj, Address(t)); 628 jccb(Assembler::equal, monitor_found); 629 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 630 } 631 632 Label loop; 633 634 // Search for obj in cache. 635 bind(loop); 636 637 // Check for match. 638 cmpptr(obj, Address(t)); 639 jccb(Assembler::equal, monitor_found); 640 641 // Search until null encountered, guaranteed _null_sentinel at end. 642 cmpptr(Address(t), 1); 643 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 644 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 645 jmpb(loop); 646 647 // Cache hit. 648 bind(monitor_found); 649 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 650 } 651 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 652 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 653 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 654 655 Label monitor_locked; 656 // Lock the monitor. 657 658 if (UseObjectMonitorTable) { 659 // Cache the monitor for unlock before trashing box. On failure to acquire 660 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 661 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 662 } 663 664 // CAS owner (null => current thread). 665 xorptr(rax_reg, rax_reg); 666 movptr(box, Address(thread, JavaThread::lock_id_offset())); 667 lock(); cmpxchgptr(box, owner_address); 668 jccb(Assembler::equal, monitor_locked); 669 670 // Check if recursive. 671 cmpptr(box, rax_reg); 672 jccb(Assembler::notEqual, slow_path); 673 674 // Recursive. 
675 increment(recursions_address); 676 677 bind(monitor_locked); 678 } 679 680 bind(locked); 681 // Set ZF = 1 682 xorl(rax_reg, rax_reg); 683 684 #ifdef ASSERT 685 // Check that locked label is reached with ZF set. 686 Label zf_correct; 687 Label zf_bad_zero; 688 jcc(Assembler::zero, zf_correct); 689 jmp(zf_bad_zero); 690 #endif 691 692 bind(slow_path); 693 #ifdef ASSERT 694 // Check that slow_path label is reached with ZF not set. 695 jcc(Assembler::notZero, zf_correct); 696 stop("Fast Lock ZF != 0"); 697 bind(zf_bad_zero); 698 stop("Fast Lock ZF != 1"); 699 bind(zf_correct); 700 #endif 701 // C2 uses the value of ZF to determine the continuation. 702 } 703 704 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) { 705 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 706 assert(reg_rax == rax, "Used for CAS"); 707 assert_different_registers(obj, reg_rax, t1, t2); 708 709 // Handle inflated monitor. 710 Label inflated, inflated_check_lock_stack; 711 // Finish fast unlock successfully. MUST jump with ZF == 1 712 Label unlocked, slow_path; 713 714 const Register mark = t1; 715 const Register monitor = t1; 716 const Register top = UseObjectMonitorTable ? t1 : reg_rax; 717 const Register box = reg_rax; 718 719 Label dummy; 720 C2FastUnlockLightweightStub* stub = nullptr; 721 722 if (!Compile::current()->output()->in_scratch_emit_size()) { 723 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread); 724 Compile::current()->output()->add_stub(stub); 725 } 726 727 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 728 729 { // Lightweight Unlock 730 731 // Load top. 732 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 733 734 if (!UseObjectMonitorTable) { 735 // Prefetch mark. 736 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 737 } 738 739 // Check if obj is top of lock-stack. 740 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 741 // Top of lock stack was not obj. Must be monitor. 742 jcc(Assembler::notEqual, inflated_check_lock_stack); 743 744 // Pop lock-stack. 745 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 746 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 747 748 // Check if recursive. 749 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 750 jcc(Assembler::equal, unlocked); 751 752 // We elide the monitor check, let the CAS fail instead. 753 754 if (UseObjectMonitorTable) { 755 // Load mark. 756 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 757 } 758 759 // Try to unlock. Transition lock bits 0b00 => 0b01 760 movptr(reg_rax, mark); 761 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 762 orptr(mark, markWord::unlocked_value); 763 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 764 jcc(Assembler::notEqual, push_and_slow_path); 765 jmp(unlocked); 766 } 767 768 769 { // Handle inflated monitor. 
770 bind(inflated_check_lock_stack); 771 #ifdef ASSERT 772 Label check_done; 773 subl(top, oopSize); 774 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 775 jcc(Assembler::below, check_done); 776 cmpptr(obj, Address(thread, top)); 777 jccb(Assembler::notEqual, inflated_check_lock_stack); 778 stop("Fast Unlock lock on stack"); 779 bind(check_done); 780 if (UseObjectMonitorTable) { 781 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 782 } 783 testptr(mark, markWord::monitor_value); 784 jccb(Assembler::notZero, inflated); 785 stop("Fast Unlock not monitor"); 786 #endif 787 788 bind(inflated); 789 790 if (!UseObjectMonitorTable) { 791 assert(mark == monitor, "should be the same here"); 792 } else { 793 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 794 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 795 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 796 cmpptr(monitor, alignof(ObjectMonitor*)); 797 jcc(Assembler::below, slow_path); 798 } 799 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 800 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 801 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 802 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 803 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 804 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 805 806 Label recursive; 807 808 // Check if recursive. 809 cmpptr(recursions_address, 0); 810 jccb(Assembler::notZero, recursive); 811 812 // Set owner to null. 813 // Release to satisfy the JMM 814 movptr(owner_address, NULL_WORD); 815 // We need a full fence after clearing owner to avoid stranding. 816 // StoreLoad achieves this. 817 membar(StoreLoad); 818 819 // Check if the entry lists are empty. 820 movptr(reg_rax, cxq_address); 821 orptr(reg_rax, EntryList_address); 822 jccb(Assembler::zero, unlocked); // If so we are done. 823 824 // Check if there is a successor. 825 cmpptr(succ_address, NULL_WORD); 826 jccb(Assembler::notZero, unlocked); // If so we are done. 827 828 // Save the monitor pointer in the current thread, so we can try to 829 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 830 if (!UseObjectMonitorTable) { 831 andptr(monitor, ~(int32_t)markWord::monitor_value); 832 } 833 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 834 835 testl(monitor, monitor); // Fast Unlock ZF = 0 836 jmpb(slow_path); 837 838 // Recursive unlock. 839 bind(recursive); 840 decrement(recursions_address); 841 } 842 843 bind(unlocked); 844 xorl(t1, t1); // Fast Unlock ZF = 1 845 846 #ifdef ASSERT 847 // Check that unlocked label is reached with ZF set. 848 Label zf_correct; 849 jcc(Assembler::zero, zf_correct); 850 stop("Fast Unlock ZF != 1"); 851 #endif 852 853 bind(slow_path); 854 if (stub != nullptr) { 855 bind(stub->slow_path_continuation()); 856 } 857 #ifdef ASSERT 858 // Check that stub->continuation() label is reached with ZF not set. 859 jccb(Assembler::notZero, zf_correct); 860 stop("Fast Unlock ZF != 0"); 861 bind(zf_correct); 862 #endif 863 // C2 uses the value of ZF to determine the continuation. 
864 } 865 866 //------------------------------------------------------------------------------------------- 867 // Generic instructions support for use in .ad files C2 code generation 868 869 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 870 if (dst != src) { 871 movdqu(dst, src); 872 } 873 if (opcode == Op_AbsVD) { 874 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 875 } else { 876 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 877 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 878 } 879 } 880 881 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 882 if (opcode == Op_AbsVD) { 883 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 884 } else { 885 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 886 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 887 } 888 } 889 890 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 891 if (dst != src) { 892 movdqu(dst, src); 893 } 894 if (opcode == Op_AbsVF) { 895 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 896 } else { 897 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 898 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 899 } 900 } 901 902 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 903 if (opcode == Op_AbsVF) { 904 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 905 } else { 906 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 907 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 908 } 909 } 910 911 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 912 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 913 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 914 915 if (opcode == Op_MinV) { 916 if (elem_bt == T_BYTE) { 917 pminsb(dst, src); 918 } else if (elem_bt == T_SHORT) { 919 pminsw(dst, src); 920 } else if (elem_bt == T_INT) { 921 pminsd(dst, src); 922 } else { 923 assert(elem_bt == T_LONG, "required"); 924 assert(tmp == xmm0, "required"); 925 assert_different_registers(dst, src, tmp); 926 movdqu(xmm0, dst); 927 pcmpgtq(xmm0, src); 928 blendvpd(dst, src); // xmm0 as mask 929 } 930 } else { // opcode == Op_MaxV 931 if (elem_bt == T_BYTE) { 932 pmaxsb(dst, src); 933 } else if (elem_bt == T_SHORT) { 934 pmaxsw(dst, src); 935 } else if (elem_bt == T_INT) { 936 pmaxsd(dst, src); 937 } else { 938 assert(elem_bt == T_LONG, "required"); 939 assert(tmp == xmm0, "required"); 940 assert_different_registers(dst, src, tmp); 941 movdqu(xmm0, src); 942 pcmpgtq(xmm0, dst); 943 blendvpd(dst, src); // xmm0 as mask 944 } 945 } 946 } 947 948 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 949 XMMRegister dst, XMMRegister src1, XMMRegister src2, 950 int vlen_enc) { 951 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 952 953 if (opcode == Op_MinV) { 954 if (elem_bt == T_BYTE) { 955 vpminsb(dst, src1, src2, vlen_enc); 956 } else if (elem_bt == T_SHORT) { 957 vpminsw(dst, src1, src2, vlen_enc); 958 } else if (elem_bt == T_INT) { 959 vpminsd(dst, src1, src2, vlen_enc); 960 } else { 961 assert(elem_bt == T_LONG, "required"); 962 if (UseAVX > 2 
&& (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 963 vpminsq(dst, src1, src2, vlen_enc); 964 } else { 965 assert_different_registers(dst, src1, src2); 966 vpcmpgtq(dst, src1, src2, vlen_enc); 967 vblendvpd(dst, src1, src2, dst, vlen_enc); 968 } 969 } 970 } else { // opcode == Op_MaxV 971 if (elem_bt == T_BYTE) { 972 vpmaxsb(dst, src1, src2, vlen_enc); 973 } else if (elem_bt == T_SHORT) { 974 vpmaxsw(dst, src1, src2, vlen_enc); 975 } else if (elem_bt == T_INT) { 976 vpmaxsd(dst, src1, src2, vlen_enc); 977 } else { 978 assert(elem_bt == T_LONG, "required"); 979 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 980 vpmaxsq(dst, src1, src2, vlen_enc); 981 } else { 982 assert_different_registers(dst, src1, src2); 983 vpcmpgtq(dst, src1, src2, vlen_enc); 984 vblendvpd(dst, src2, src1, dst, vlen_enc); 985 } 986 } 987 } 988 } 989 990 // Float/Double min max 991 992 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 993 XMMRegister dst, XMMRegister a, XMMRegister b, 994 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 995 int vlen_enc) { 996 assert(UseAVX > 0, "required"); 997 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 998 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 999 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1000 assert_different_registers(a, tmp, atmp, btmp); 1001 assert_different_registers(b, tmp, atmp, btmp); 1002 1003 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1004 bool is_double_word = is_double_word_type(elem_bt); 1005 1006 /* Note on 'non-obvious' assembly sequence: 1007 * 1008 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1009 * and Java on how they handle floats: 1010 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1011 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1012 * 1013 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1014 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1015 * (only useful when signs differ, noop otherwise) 1016 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1017 1018 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1019 * btmp = (b < +0.0) ? a : b 1020 * atmp = (b < +0.0) ? b : a 1021 * Tmp = Max_Float(atmp , btmp) 1022 * Res = (atmp == NaN) ? 
atmp : Tmp 1023 */ 1024 1025 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1026 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1027 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1028 XMMRegister mask; 1029 1030 if (!is_double_word && is_min) { 1031 mask = a; 1032 vblend = &MacroAssembler::vblendvps; 1033 vmaxmin = &MacroAssembler::vminps; 1034 vcmp = &MacroAssembler::vcmpps; 1035 } else if (!is_double_word && !is_min) { 1036 mask = b; 1037 vblend = &MacroAssembler::vblendvps; 1038 vmaxmin = &MacroAssembler::vmaxps; 1039 vcmp = &MacroAssembler::vcmpps; 1040 } else if (is_double_word && is_min) { 1041 mask = a; 1042 vblend = &MacroAssembler::vblendvpd; 1043 vmaxmin = &MacroAssembler::vminpd; 1044 vcmp = &MacroAssembler::vcmppd; 1045 } else { 1046 assert(is_double_word && !is_min, "sanity"); 1047 mask = b; 1048 vblend = &MacroAssembler::vblendvpd; 1049 vmaxmin = &MacroAssembler::vmaxpd; 1050 vcmp = &MacroAssembler::vcmppd; 1051 } 1052 1053 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1054 XMMRegister maxmin, scratch; 1055 if (dst == btmp) { 1056 maxmin = btmp; 1057 scratch = tmp; 1058 } else { 1059 maxmin = tmp; 1060 scratch = btmp; 1061 } 1062 1063 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1064 if (precompute_mask && !is_double_word) { 1065 vpsrad(tmp, mask, 32, vlen_enc); 1066 mask = tmp; 1067 } else if (precompute_mask && is_double_word) { 1068 vpxor(tmp, tmp, tmp, vlen_enc); 1069 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1070 mask = tmp; 1071 } 1072 1073 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1074 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1075 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1076 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1077 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1078 } 1079 1080 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1081 XMMRegister dst, XMMRegister a, XMMRegister b, 1082 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1083 int vlen_enc) { 1084 assert(UseAVX > 2, "required"); 1085 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1086 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1087 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1088 assert_different_registers(dst, a, atmp, btmp); 1089 assert_different_registers(dst, b, atmp, btmp); 1090 1091 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1092 bool is_double_word = is_double_word_type(elem_bt); 1093 bool merge = true; 1094 1095 if (!is_double_word && is_min) { 1096 evpmovd2m(ktmp, a, vlen_enc); 1097 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1098 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1099 vminps(dst, atmp, btmp, vlen_enc); 1100 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1101 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1102 } else if (!is_double_word && !is_min) { 1103 evpmovd2m(ktmp, b, vlen_enc); 1104 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1105 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1106 vmaxps(dst, atmp, btmp, vlen_enc); 1107 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1108 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1109 } else if (is_double_word && is_min) { 1110 evpmovq2m(ktmp, a, vlen_enc); 1111 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1112 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1113 
vminpd(dst, atmp, btmp, vlen_enc); 1114 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1115 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1116 } else { 1117 assert(is_double_word && !is_min, "sanity"); 1118 evpmovq2m(ktmp, b, vlen_enc); 1119 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1120 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1121 vmaxpd(dst, atmp, btmp, vlen_enc); 1122 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1123 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1124 } 1125 } 1126 1127 // Float/Double signum 1128 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1129 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1130 1131 Label DONE_LABEL; 1132 1133 if (opcode == Op_SignumF) { 1134 assert(UseSSE > 0, "required"); 1135 ucomiss(dst, zero); 1136 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1137 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1138 movflt(dst, one); 1139 jcc(Assembler::above, DONE_LABEL); 1140 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1141 } else if (opcode == Op_SignumD) { 1142 assert(UseSSE > 1, "required"); 1143 ucomisd(dst, zero); 1144 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1145 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1146 movdbl(dst, one); 1147 jcc(Assembler::above, DONE_LABEL); 1148 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1149 } 1150 1151 bind(DONE_LABEL); 1152 } 1153 1154 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1155 if (sign) { 1156 pmovsxbw(dst, src); 1157 } else { 1158 pmovzxbw(dst, src); 1159 } 1160 } 1161 1162 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1163 if (sign) { 1164 vpmovsxbw(dst, src, vector_len); 1165 } else { 1166 vpmovzxbw(dst, src, vector_len); 1167 } 1168 } 1169 1170 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1171 if (sign) { 1172 vpmovsxbd(dst, src, vector_len); 1173 } else { 1174 vpmovzxbd(dst, src, vector_len); 1175 } 1176 } 1177 1178 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1179 if (sign) { 1180 vpmovsxwd(dst, src, vector_len); 1181 } else { 1182 vpmovzxwd(dst, src, vector_len); 1183 } 1184 } 1185 1186 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1187 int shift, int vector_len) { 1188 if (opcode == Op_RotateLeftV) { 1189 if (etype == T_INT) { 1190 evprold(dst, src, shift, vector_len); 1191 } else { 1192 assert(etype == T_LONG, "expected type T_LONG"); 1193 evprolq(dst, src, shift, vector_len); 1194 } 1195 } else { 1196 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1197 if (etype == T_INT) { 1198 evprord(dst, src, shift, vector_len); 1199 } else { 1200 assert(etype == T_LONG, "expected type T_LONG"); 1201 evprorq(dst, src, shift, vector_len); 1202 } 1203 } 1204 } 1205 1206 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1207 XMMRegister shift, int vector_len) { 1208 if (opcode == Op_RotateLeftV) { 1209 if (etype == T_INT) { 1210 evprolvd(dst, src, shift, vector_len); 1211 } else { 1212 assert(etype == 
T_LONG, "expected type T_LONG"); 1213 evprolvq(dst, src, shift, vector_len); 1214 } 1215 } else { 1216 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1217 if (etype == T_INT) { 1218 evprorvd(dst, src, shift, vector_len); 1219 } else { 1220 assert(etype == T_LONG, "expected type T_LONG"); 1221 evprorvq(dst, src, shift, vector_len); 1222 } 1223 } 1224 } 1225 1226 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1227 if (opcode == Op_RShiftVI) { 1228 psrad(dst, shift); 1229 } else if (opcode == Op_LShiftVI) { 1230 pslld(dst, shift); 1231 } else { 1232 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1233 psrld(dst, shift); 1234 } 1235 } 1236 1237 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1238 switch (opcode) { 1239 case Op_RShiftVI: psrad(dst, shift); break; 1240 case Op_LShiftVI: pslld(dst, shift); break; 1241 case Op_URShiftVI: psrld(dst, shift); break; 1242 1243 default: assert(false, "%s", NodeClassNames[opcode]); 1244 } 1245 } 1246 1247 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1248 if (opcode == Op_RShiftVI) { 1249 vpsrad(dst, nds, shift, vector_len); 1250 } else if (opcode == Op_LShiftVI) { 1251 vpslld(dst, nds, shift, vector_len); 1252 } else { 1253 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1254 vpsrld(dst, nds, shift, vector_len); 1255 } 1256 } 1257 1258 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1259 switch (opcode) { 1260 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1261 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1262 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1263 1264 default: assert(false, "%s", NodeClassNames[opcode]); 1265 } 1266 } 1267 1268 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1269 switch (opcode) { 1270 case Op_RShiftVB: // fall-through 1271 case Op_RShiftVS: psraw(dst, shift); break; 1272 1273 case Op_LShiftVB: // fall-through 1274 case Op_LShiftVS: psllw(dst, shift); break; 1275 1276 case Op_URShiftVS: // fall-through 1277 case Op_URShiftVB: psrlw(dst, shift); break; 1278 1279 default: assert(false, "%s", NodeClassNames[opcode]); 1280 } 1281 } 1282 1283 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1284 switch (opcode) { 1285 case Op_RShiftVB: // fall-through 1286 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1287 1288 case Op_LShiftVB: // fall-through 1289 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1290 1291 case Op_URShiftVS: // fall-through 1292 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1293 1294 default: assert(false, "%s", NodeClassNames[opcode]); 1295 } 1296 } 1297 1298 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1299 switch (opcode) { 1300 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1301 case Op_LShiftVL: psllq(dst, shift); break; 1302 case Op_URShiftVL: psrlq(dst, shift); break; 1303 1304 default: assert(false, "%s", NodeClassNames[opcode]); 1305 } 1306 } 1307 1308 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1309 if (opcode == Op_RShiftVL) { 1310 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1311 } else if (opcode == Op_LShiftVL) { 1312 
psllq(dst, shift); 1313 } else { 1314 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1315 psrlq(dst, shift); 1316 } 1317 } 1318 1319 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1320 switch (opcode) { 1321 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1322 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1323 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1324 1325 default: assert(false, "%s", NodeClassNames[opcode]); 1326 } 1327 } 1328 1329 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1330 if (opcode == Op_RShiftVL) { 1331 evpsraq(dst, nds, shift, vector_len); 1332 } else if (opcode == Op_LShiftVL) { 1333 vpsllq(dst, nds, shift, vector_len); 1334 } else { 1335 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1336 vpsrlq(dst, nds, shift, vector_len); 1337 } 1338 } 1339 1340 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1341 switch (opcode) { 1342 case Op_RShiftVB: // fall-through 1343 case Op_RShiftVS: // fall-through 1344 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1345 1346 case Op_LShiftVB: // fall-through 1347 case Op_LShiftVS: // fall-through 1348 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1349 1350 case Op_URShiftVB: // fall-through 1351 case Op_URShiftVS: // fall-through 1352 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1353 1354 default: assert(false, "%s", NodeClassNames[opcode]); 1355 } 1356 } 1357 1358 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1359 switch (opcode) { 1360 case Op_RShiftVB: // fall-through 1361 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1362 1363 case Op_LShiftVB: // fall-through 1364 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1365 1366 case Op_URShiftVB: // fall-through 1367 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1368 1369 default: assert(false, "%s", NodeClassNames[opcode]); 1370 } 1371 } 1372 1373 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1374 assert(UseAVX >= 2, "required"); 1375 switch (opcode) { 1376 case Op_RShiftVL: { 1377 if (UseAVX > 2) { 1378 assert(tmp == xnoreg, "not used"); 1379 if (!VM_Version::supports_avx512vl()) { 1380 vlen_enc = Assembler::AVX_512bit; 1381 } 1382 evpsravq(dst, src, shift, vlen_enc); 1383 } else { 1384 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1385 vpsrlvq(dst, src, shift, vlen_enc); 1386 vpsrlvq(tmp, tmp, shift, vlen_enc); 1387 vpxor(dst, dst, tmp, vlen_enc); 1388 vpsubq(dst, dst, tmp, vlen_enc); 1389 } 1390 break; 1391 } 1392 case Op_LShiftVL: { 1393 assert(tmp == xnoreg, "not used"); 1394 vpsllvq(dst, src, shift, vlen_enc); 1395 break; 1396 } 1397 case Op_URShiftVL: { 1398 assert(tmp == xnoreg, "not used"); 1399 vpsrlvq(dst, src, shift, vlen_enc); 1400 break; 1401 } 1402 default: assert(false, "%s", NodeClassNames[opcode]); 1403 } 1404 } 1405 1406 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1407 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1408 assert(opcode == Op_LShiftVB || 1409 opcode == Op_RShiftVB || 1410 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1411 bool sign = (opcode != Op_URShiftVB); 1412 assert(vector_len == 0, "required"); 1413 vextendbd(sign, dst, src, 1); 1414 vpmovzxbd(vtmp, shift, 1); 1415 varshiftd(opcode, dst, dst, vtmp, 1); 1416 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1417 vextracti128_high(vtmp, dst); 1418 vpackusdw(dst, dst, vtmp, 0); 1419 } 1420 1421 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1422 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1423 assert(opcode == Op_LShiftVB || 1424 opcode == Op_RShiftVB || 1425 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1426 bool sign = (opcode != Op_URShiftVB); 1427 int ext_vector_len = vector_len + 1; 1428 vextendbw(sign, dst, src, ext_vector_len); 1429 vpmovzxbw(vtmp, shift, ext_vector_len); 1430 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1431 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1432 if (vector_len == 0) { 1433 vextracti128_high(vtmp, dst); 1434 vpackuswb(dst, dst, vtmp, vector_len); 1435 } else { 1436 vextracti64x4_high(vtmp, dst); 1437 vpackuswb(dst, dst, vtmp, vector_len); 1438 vpermq(dst, dst, 0xD8, vector_len); 1439 } 1440 } 1441 1442 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1443 switch(typ) { 1444 case T_BYTE: 1445 pinsrb(dst, val, idx); 1446 break; 1447 case T_SHORT: 1448 pinsrw(dst, val, idx); 1449 break; 1450 case T_INT: 1451 pinsrd(dst, val, idx); 1452 break; 1453 case T_LONG: 1454 pinsrq(dst, val, idx); 1455 break; 1456 default: 1457 assert(false,"Should not reach here."); 1458 break; 1459 } 1460 } 1461 1462 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1463 switch(typ) { 1464 case T_BYTE: 1465 vpinsrb(dst, src, val, idx); 1466 break; 1467 case T_SHORT: 1468 vpinsrw(dst, src, val, idx); 1469 break; 1470 case T_INT: 1471 vpinsrd(dst, src, val, idx); 1472 break; 1473 case T_LONG: 1474 vpinsrq(dst, src, val, idx); 1475 break; 1476 default: 1477 assert(false,"Should not reach here."); 1478 break; 1479 } 1480 } 1481 1482 #ifdef _LP64 1483 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1484 XMMRegister dst, Register base, 1485 Register idx_base, 1486 Register offset, Register mask, 1487 Register mask_idx, Register rtmp, 1488 int vlen_enc) { 1489 vpxor(dst, dst, dst, vlen_enc); 1490 if (elem_bt == T_SHORT) { 1491 for (int i = 0; i < 4; i++) { 1492 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1493 Label skip_load; 1494 btq(mask, mask_idx); 1495 jccb(Assembler::carryClear, skip_load); 1496 movl(rtmp, Address(idx_base, i * 4)); 1497 if (offset != noreg) { 1498 addl(rtmp, offset); 1499 } 1500 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1501 bind(skip_load); 1502 incq(mask_idx); 1503 } 1504 } else { 1505 assert(elem_bt == T_BYTE, ""); 1506 for (int i = 0; i < 8; i++) { 1507 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1508 Label skip_load; 1509 btq(mask, mask_idx); 1510 jccb(Assembler::carryClear, skip_load); 1511 movl(rtmp, Address(idx_base, i * 4)); 1512 if (offset != noreg) { 1513 addl(rtmp, offset); 1514 } 1515 pinsrb(dst, Address(base, rtmp), i); 1516 bind(skip_load); 1517 incq(mask_idx); 1518 } 1519 } 1520 } 1521 #endif // _LP64 1522 1523 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1524 Register base, Register idx_base, 1525 Register offset, Register rtmp, 1526 int vlen_enc) { 1527 vpxor(dst, dst, dst, vlen_enc); 1528 if (elem_bt == T_SHORT) { 1529 for (int i = 0; i < 4; i++) { 1530 // dst[i] = src[offset + idx_base[i]] 1531 movl(rtmp, Address(idx_base, i * 4)); 1532 if (offset != noreg) { 1533 addl(rtmp, offset); 1534 } 1535 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1536 } 1537 } else { 1538 assert(elem_bt == T_BYTE, ""); 1539 for (int i = 0; i < 8; i++) { 1540 // dst[i] = src[offset + idx_base[i]] 1541 movl(rtmp, Address(idx_base, i * 4)); 1542 if (offset != noreg) { 1543 addl(rtmp, offset); 1544 } 1545 pinsrb(dst, Address(base, rtmp), i); 1546 } 1547 } 1548 } 1549 1550 /* 1551 * Gather using hybrid algorithm, first partially unroll scalar loop 1552 * to accumulate values from gather indices into a quad-word(64bit) slice. 1553 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1554 * permutation to place the slice into appropriate vector lane 1555 * locations in destination vector. Following pseudo code describes the 1556 * algorithm in detail: 1557 * 1558 * DST_VEC = ZERO_VEC 1559 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1560 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1561 * FOREACH_ITER: 1562 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1563 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1564 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1565 * PERM_INDEX = PERM_INDEX - TWO_VEC 1566 * 1567 * With each iteration, doubleword permute indices (0,1) corresponding 1568 * to gathered quadword gets right shifted by two lane positions. 1569 * 1570 */ 1571 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1572 Register base, Register idx_base, 1573 Register offset, Register mask, 1574 XMMRegister xtmp1, XMMRegister xtmp2, 1575 XMMRegister temp_dst, Register rtmp, 1576 Register mask_idx, Register length, 1577 int vector_len, int vlen_enc) { 1578 Label GATHER8_LOOP; 1579 assert(is_subword_type(elem_ty), ""); 1580 movl(length, vector_len); 1581 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1582 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1583 vallones(xtmp2, vlen_enc); 1584 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1585 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1586 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1587 1588 bind(GATHER8_LOOP); 1589 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1590 if (mask == noreg) { 1591 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1592 } else { 1593 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1594 } 1595 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1596 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1597 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1598 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1599 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1600 vpor(dst, dst, temp_dst, vlen_enc); 1601 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1602 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1603 jcc(Assembler::notEqual, GATHER8_LOOP); 1604 } 1605 1606 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1607 switch(typ) { 1608 case T_INT: 1609 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1610 break; 1611 case T_FLOAT: 1612 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1613 break; 1614 case T_LONG: 1615 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1616 break; 1617 case T_DOUBLE: 1618 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1619 break; 1620 default: 1621 assert(false,"Should not reach here."); 1622 break; 1623 } 1624 } 1625 1626 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1627 switch(typ) { 1628 case T_INT: 1629 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1630 break; 1631 case T_FLOAT: 1632 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1633 break; 1634 case T_LONG: 1635 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1636 break; 1637 case T_DOUBLE: 1638 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1639 break; 1640 default: 1641 assert(false,"Should not reach here."); 1642 break; 1643 } 1644 } 1645 1646 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1647 switch(typ) { 1648 case T_INT: 1649 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1650 break; 1651 case T_FLOAT: 1652 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1653 break; 1654 case T_LONG: 1655 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1656 break; 1657 case T_DOUBLE: 1658 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1659 break; 1660 default: 1661 assert(false,"Should not reach here."); 1662 break; 1663 } 1664 } 1665 1666 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1667 if (vlen_in_bytes <= 16) { 1668 pxor (dst, dst); 1669 psubb(dst, src); 1670 switch (elem_bt) { 1671 case T_BYTE: /* nothing to do */ break; 1672 case T_SHORT: pmovsxbw(dst, dst); break; 1673 case T_INT: pmovsxbd(dst, dst); break; 1674 case T_FLOAT: pmovsxbd(dst, dst); break; 1675 case T_LONG: pmovsxbq(dst, dst); break; 1676 case T_DOUBLE: pmovsxbq(dst, dst); break; 1677 1678 default: assert(false, "%s", type2name(elem_bt)); 1679 } 1680 } else { 1681 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1682 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1683 1684 vpxor (dst, dst, dst, vlen_enc); 1685 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1686 1687 switch (elem_bt) { 1688 case T_BYTE: /* nothing to do */ break; 1689 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1690 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1691 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1692 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1693 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1694 1695 default: assert(false, "%s", type2name(elem_bt)); 1696 } 1697 } 1698 } 1699 1700 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1701 if (novlbwdq) { 1702 vpmovsxbd(xtmp, src, vlen_enc); 1703 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1704 Assembler::eq, true, vlen_enc, noreg); 1705 } else { 1706 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1707 vpsubb(xtmp, xtmp, src, vlen_enc); 1708 evpmovb2m(dst, xtmp, vlen_enc); 1709 } 1710 } 1711 1712 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1713 switch (vlen_in_bytes) { 1714 case 4: movdl(dst, src); break; 1715 case 8: movq(dst, src); break; 1716 case 16: movdqu(dst, src); break; 1717 case 32: vmovdqu(dst, src); break; 1718 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1719 default: ShouldNotReachHere(); 1720 } 1721 } 1722 1723 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1724 assert(rscratch != noreg || always_reachable(src), "missing"); 1725 1726 if (reachable(src)) { 1727 load_vector(dst, as_Address(src), vlen_in_bytes); 1728 } else { 1729 lea(rscratch, src); 1730 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1731 } 1732 } 1733 1734 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1735 int vlen_enc = vector_length_encoding(vlen); 1736 if (VM_Version::supports_avx()) { 1737 if (bt == T_LONG) { 1738 if (VM_Version::supports_avx2()) { 1739 vpbroadcastq(dst, src, vlen_enc); 1740 } else { 1741 vmovddup(dst, src, vlen_enc); 1742 } 1743 } else if (bt == T_DOUBLE) { 1744 if (vlen_enc != Assembler::AVX_128bit) { 1745 vbroadcastsd(dst, src, vlen_enc, noreg); 1746 } else { 1747 vmovddup(dst, src, vlen_enc); 1748 } 1749 } else { 1750 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1751 vpbroadcastd(dst, src, vlen_enc); 1752 } else { 1753 vbroadcastss(dst, src, vlen_enc); 1754 } 1755 } 1756 } else if (VM_Version::supports_sse3()) { 1757 movddup(dst, src); 1758 } else { 1759 movq(dst, src); 1760 if (vlen == 16) { 1761 punpcklqdq(dst, dst); 1762 } 1763 } 1764 } 1765 1766 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1767 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1768 int offset = exact_log2(type2aelembytes(bt)) << 6; 1769 if (is_floating_point_type(bt)) { 1770 offset += 128; 1771 } 1772 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1773 load_vector(dst, addr, vlen_in_bytes); 1774 } 1775 1776 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
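//
// All of the reduction helpers below follow the same shape: vectors wider
// than 128 bits are first folded in halves (vextracti64x4_high /
// vextracti128_high followed by a 256-bit or 128-bit op), and the surviving
// 128-bit value is then reduced lane by lane. The integral reducers fold the
// scalar accumulator src1 in at the very end and return the result in the
// GPR dst; the ordered FP reducers (reduceF / reduceD) instead keep the
// running value in the XMM dst so the adds/muls are applied in strict
// element order, while the unordered* variants are free to use packed
// adds/muls. In outline, for reduceI(opcode, 16, ...):
//
//   reduce16I: fold the upper 256 bits into the lower 256   (16 -> 8 lanes)
//   reduce8I : fold the upper 128 bits into the lower 128   ( 8 -> 4 lanes)
//   reduce4I / reduce2I: reduce within 128 bits, combine with the scalar
//                        src1 and move the result into dst
//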
1777 1778 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1779 int vector_len = Assembler::AVX_128bit; 1780 1781 switch (opcode) { 1782 case Op_AndReductionV: pand(dst, src); break; 1783 case Op_OrReductionV: por (dst, src); break; 1784 case Op_XorReductionV: pxor(dst, src); break; 1785 case Op_MinReductionV: 1786 switch (typ) { 1787 case T_BYTE: pminsb(dst, src); break; 1788 case T_SHORT: pminsw(dst, src); break; 1789 case T_INT: pminsd(dst, src); break; 1790 case T_LONG: assert(UseAVX > 2, "required"); 1791 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1792 default: assert(false, "wrong type"); 1793 } 1794 break; 1795 case Op_MaxReductionV: 1796 switch (typ) { 1797 case T_BYTE: pmaxsb(dst, src); break; 1798 case T_SHORT: pmaxsw(dst, src); break; 1799 case T_INT: pmaxsd(dst, src); break; 1800 case T_LONG: assert(UseAVX > 2, "required"); 1801 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1802 default: assert(false, "wrong type"); 1803 } 1804 break; 1805 case Op_AddReductionVF: addss(dst, src); break; 1806 case Op_AddReductionVD: addsd(dst, src); break; 1807 case Op_AddReductionVI: 1808 switch (typ) { 1809 case T_BYTE: paddb(dst, src); break; 1810 case T_SHORT: paddw(dst, src); break; 1811 case T_INT: paddd(dst, src); break; 1812 default: assert(false, "wrong type"); 1813 } 1814 break; 1815 case Op_AddReductionVL: paddq(dst, src); break; 1816 case Op_MulReductionVF: mulss(dst, src); break; 1817 case Op_MulReductionVD: mulsd(dst, src); break; 1818 case Op_MulReductionVI: 1819 switch (typ) { 1820 case T_SHORT: pmullw(dst, src); break; 1821 case T_INT: pmulld(dst, src); break; 1822 default: assert(false, "wrong type"); 1823 } 1824 break; 1825 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1826 evpmullq(dst, dst, src, vector_len); break; 1827 default: assert(false, "wrong opcode"); 1828 } 1829 } 1830 1831 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1832 switch (opcode) { 1833 case Op_AddReductionVF: addps(dst, src); break; 1834 case Op_AddReductionVD: addpd(dst, src); break; 1835 case Op_MulReductionVF: mulps(dst, src); break; 1836 case Op_MulReductionVD: mulpd(dst, src); break; 1837 default: assert(false, "%s", NodeClassNames[opcode]); 1838 } 1839 } 1840 1841 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1842 int vector_len = Assembler::AVX_256bit; 1843 1844 switch (opcode) { 1845 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1846 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1847 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1848 case Op_MinReductionV: 1849 switch (typ) { 1850 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1851 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1852 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1853 case T_LONG: assert(UseAVX > 2, "required"); 1854 vpminsq(dst, src1, src2, vector_len); break; 1855 default: assert(false, "wrong type"); 1856 } 1857 break; 1858 case Op_MaxReductionV: 1859 switch (typ) { 1860 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1861 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1862 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1863 case T_LONG: assert(UseAVX > 2, "required"); 1864 vpmaxsq(dst, src1, src2, vector_len); break; 1865 default: assert(false, "wrong type"); 1866 } 
1867 break; 1868 case Op_AddReductionVI: 1869 switch (typ) { 1870 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1871 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1872 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1873 default: assert(false, "wrong type"); 1874 } 1875 break; 1876 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1877 case Op_MulReductionVI: 1878 switch (typ) { 1879 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1880 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1881 default: assert(false, "wrong type"); 1882 } 1883 break; 1884 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1885 default: assert(false, "wrong opcode"); 1886 } 1887 } 1888 1889 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1890 int vector_len = Assembler::AVX_256bit; 1891 1892 switch (opcode) { 1893 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1894 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1895 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1896 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1897 default: assert(false, "%s", NodeClassNames[opcode]); 1898 } 1899 } 1900 1901 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1902 XMMRegister dst, XMMRegister src, 1903 XMMRegister vtmp1, XMMRegister vtmp2) { 1904 switch (opcode) { 1905 case Op_AddReductionVF: 1906 case Op_MulReductionVF: 1907 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1908 break; 1909 1910 case Op_AddReductionVD: 1911 case Op_MulReductionVD: 1912 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1913 break; 1914 1915 default: assert(false, "wrong opcode"); 1916 } 1917 } 1918 1919 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1920 XMMRegister dst, XMMRegister src, 1921 XMMRegister vtmp1, XMMRegister vtmp2) { 1922 switch (opcode) { 1923 case Op_AddReductionVF: 1924 case Op_MulReductionVF: 1925 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1926 break; 1927 1928 case Op_AddReductionVD: 1929 case Op_MulReductionVD: 1930 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1931 break; 1932 1933 default: assert(false, "%s", NodeClassNames[opcode]); 1934 } 1935 } 1936 1937 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1938 Register dst, Register src1, XMMRegister src2, 1939 XMMRegister vtmp1, XMMRegister vtmp2) { 1940 switch (vlen) { 1941 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1942 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1943 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1944 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1945 1946 default: assert(false, "wrong vector length"); 1947 } 1948 } 1949 1950 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1951 Register dst, Register src1, XMMRegister src2, 1952 XMMRegister vtmp1, XMMRegister vtmp2) { 1953 switch (vlen) { 1954 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1955 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1956 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1957 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1958 1959 default: assert(false, "wrong vector length"); 1960 } 1961 } 1962 1963 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1964 Register dst, Register src1, XMMRegister src2, 
1965 XMMRegister vtmp1, XMMRegister vtmp2) { 1966 switch (vlen) { 1967 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1968 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1969 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1970 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1971 1972 default: assert(false, "wrong vector length"); 1973 } 1974 } 1975 1976 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1977 Register dst, Register src1, XMMRegister src2, 1978 XMMRegister vtmp1, XMMRegister vtmp2) { 1979 switch (vlen) { 1980 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1981 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1982 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1983 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1984 1985 default: assert(false, "wrong vector length"); 1986 } 1987 } 1988 1989 #ifdef _LP64 1990 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1991 Register dst, Register src1, XMMRegister src2, 1992 XMMRegister vtmp1, XMMRegister vtmp2) { 1993 switch (vlen) { 1994 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1995 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1996 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1997 1998 default: assert(false, "wrong vector length"); 1999 } 2000 } 2001 #endif // _LP64 2002 2003 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2004 switch (vlen) { 2005 case 2: 2006 assert(vtmp2 == xnoreg, ""); 2007 reduce2F(opcode, dst, src, vtmp1); 2008 break; 2009 case 4: 2010 assert(vtmp2 == xnoreg, ""); 2011 reduce4F(opcode, dst, src, vtmp1); 2012 break; 2013 case 8: 2014 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2015 break; 2016 case 16: 2017 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2018 break; 2019 default: assert(false, "wrong vector length"); 2020 } 2021 } 2022 2023 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2024 switch (vlen) { 2025 case 2: 2026 assert(vtmp2 == xnoreg, ""); 2027 reduce2D(opcode, dst, src, vtmp1); 2028 break; 2029 case 4: 2030 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2031 break; 2032 case 8: 2033 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2034 break; 2035 default: assert(false, "wrong vector length"); 2036 } 2037 } 2038 2039 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2040 switch (vlen) { 2041 case 2: 2042 assert(vtmp1 == xnoreg, ""); 2043 assert(vtmp2 == xnoreg, ""); 2044 unorderedReduce2F(opcode, dst, src); 2045 break; 2046 case 4: 2047 assert(vtmp2 == xnoreg, ""); 2048 unorderedReduce4F(opcode, dst, src, vtmp1); 2049 break; 2050 case 8: 2051 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2052 break; 2053 case 16: 2054 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2055 break; 2056 default: assert(false, "wrong vector length"); 2057 } 2058 } 2059 2060 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2061 switch (vlen) { 2062 case 2: 2063 assert(vtmp1 == xnoreg, ""); 2064 assert(vtmp2 == xnoreg, ""); 2065 unorderedReduce2D(opcode, dst, src); 2066 break; 2067 case 4: 2068 assert(vtmp2 == xnoreg, ""); 2069 unorderedReduce4D(opcode, dst, src, vtmp1); 2070 break; 2071 case 8: 
2072 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2073 break; 2074 default: assert(false, "wrong vector length"); 2075 } 2076 } 2077 2078 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2079 if (opcode == Op_AddReductionVI) { 2080 if (vtmp1 != src2) { 2081 movdqu(vtmp1, src2); 2082 } 2083 phaddd(vtmp1, vtmp1); 2084 } else { 2085 pshufd(vtmp1, src2, 0x1); 2086 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2087 } 2088 movdl(vtmp2, src1); 2089 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2090 movdl(dst, vtmp1); 2091 } 2092 2093 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2094 if (opcode == Op_AddReductionVI) { 2095 if (vtmp1 != src2) { 2096 movdqu(vtmp1, src2); 2097 } 2098 phaddd(vtmp1, src2); 2099 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2100 } else { 2101 pshufd(vtmp2, src2, 0xE); 2102 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2103 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2104 } 2105 } 2106 2107 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2108 if (opcode == Op_AddReductionVI) { 2109 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2110 vextracti128_high(vtmp2, vtmp1); 2111 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2112 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2113 } else { 2114 vextracti128_high(vtmp1, src2); 2115 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2116 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2117 } 2118 } 2119 2120 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2121 vextracti64x4_high(vtmp2, src2); 2122 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2123 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2124 } 2125 2126 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2127 pshufd(vtmp2, src2, 0x1); 2128 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2129 movdqu(vtmp1, vtmp2); 2130 psrldq(vtmp1, 2); 2131 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2132 movdqu(vtmp2, vtmp1); 2133 psrldq(vtmp2, 1); 2134 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2135 movdl(vtmp2, src1); 2136 pmovsxbd(vtmp1, vtmp1); 2137 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2138 pextrb(dst, vtmp1, 0x0); 2139 movsbl(dst, dst); 2140 } 2141 2142 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2143 pshufd(vtmp1, src2, 0xE); 2144 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2145 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2146 } 2147 2148 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2149 vextracti128_high(vtmp2, src2); 2150 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2151 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2152 } 2153 2154 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2155 vextracti64x4_high(vtmp1, src2); 2156 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2157 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2158 } 2159 2160 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2161 pmovsxbw(vtmp2, src2); 2162 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2163 } 2164 2165 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2166 if (UseAVX > 1) { 2167 int vector_len = Assembler::AVX_256bit; 2168 vpmovsxbw(vtmp1, src2, vector_len); 2169 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2170 } else { 2171 pmovsxbw(vtmp2, src2); 2172 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2173 pshufd(vtmp2, src2, 0x1); 2174 pmovsxbw(vtmp2, src2); 2175 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2176 } 2177 } 2178 2179 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2180 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2181 int vector_len = Assembler::AVX_512bit; 2182 vpmovsxbw(vtmp1, src2, vector_len); 2183 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2184 } else { 2185 assert(UseAVX >= 2,"Should not reach here."); 2186 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2187 vextracti128_high(vtmp2, src2); 2188 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2189 } 2190 } 2191 2192 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2193 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2194 vextracti64x4_high(vtmp2, src2); 2195 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2196 } 2197 2198 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2199 if (opcode == Op_AddReductionVI) { 2200 if (vtmp1 != src2) { 2201 movdqu(vtmp1, src2); 2202 } 2203 phaddw(vtmp1, vtmp1); 2204 phaddw(vtmp1, vtmp1); 2205 } else { 2206 pshufd(vtmp2, src2, 0x1); 2207 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2208 movdqu(vtmp1, vtmp2); 2209 psrldq(vtmp1, 2); 2210 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2211 } 2212 movdl(vtmp2, src1); 2213 pmovsxwd(vtmp1, vtmp1); 2214 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2215 pextrw(dst, vtmp1, 0x0); 2216 movswl(dst, dst); 2217 } 2218 2219 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2220 if (opcode == Op_AddReductionVI) { 2221 if (vtmp1 != src2) { 2222 movdqu(vtmp1, src2); 2223 } 2224 phaddw(vtmp1, src2); 2225 } else { 2226 pshufd(vtmp1, src2, 0xE); 2227 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2228 } 2229 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2230 } 2231 2232 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2233 if (opcode == Op_AddReductionVI) { 2234 int vector_len = Assembler::AVX_256bit; 2235 vphaddw(vtmp2, src2, src2, vector_len); 2236 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2237 } else { 2238 vextracti128_high(vtmp2, src2); 2239 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2240 } 2241 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2242 } 2243 2244 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2245 int vector_len = Assembler::AVX_256bit; 2246 vextracti64x4_high(vtmp1, src2); 2247 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2248 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2249 } 2250 2251 #ifdef _LP64 2252 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2253 pshufd(vtmp2, src2, 0xE); 2254 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2255 movdq(vtmp1, src1); 2256 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2257 movdq(dst, vtmp1); 2258 } 2259 2260 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 vextracti128_high(vtmp1, src2); 2262 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2263 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2264 } 2265 2266 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2267 vextracti64x4_high(vtmp2, src2); 2268 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2269 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2270 } 2271 2272 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2273 mov64(temp, -1L); 2274 bzhiq(temp, temp, len); 2275 kmovql(dst, temp); 2276 } 2277 #endif // _LP64 2278 2279 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2280 reduce_operation_128(T_FLOAT, opcode, dst, src); 2281 pshufd(vtmp, src, 0x1); 2282 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2283 } 2284 2285 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2286 reduce2F(opcode, dst, src, vtmp); 2287 pshufd(vtmp, src, 0x2); 2288 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2289 pshufd(vtmp, src, 0x3); 2290 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2291 } 2292 2293 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2294 reduce4F(opcode, dst, src, vtmp2); 2295 vextractf128_high(vtmp2, src); 2296 reduce4F(opcode, dst, vtmp2, vtmp1); 2297 } 2298 2299 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2300 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2301 vextracti64x4_high(vtmp1, src); 2302 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2303 } 2304 2305 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2306 pshufd(dst, src, 0x1); 2307 reduce_operation_128(T_FLOAT, opcode, dst, src); 2308 } 2309 2310 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2311 pshufd(vtmp, src, 0xE); 2312 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2313 unorderedReduce2F(opcode, dst, vtmp); 2314 } 2315 2316 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2317 vextractf128_high(vtmp1, src); 2318 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2319 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2320 } 2321 2322 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2323 vextractf64x4_high(vtmp2, src); 2324 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2325 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2326 } 2327 2328 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2329 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2330 pshufd(vtmp, src, 0xE); 2331 
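  // 0xE (0b00'00'11'10) moves dwords 2..3 of src, i.e. its upper double,
  // into the low 64 bits of vtmp so the ordered combine below can use the
  // scalar addsd/mulsd form of the operation.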
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2332 } 2333 2334 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2335 reduce2D(opcode, dst, src, vtmp2); 2336 vextractf128_high(vtmp2, src); 2337 reduce2D(opcode, dst, vtmp2, vtmp1); 2338 } 2339 2340 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2341 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2342 vextracti64x4_high(vtmp1, src); 2343 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2344 } 2345 2346 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2347 pshufd(dst, src, 0xE); 2348 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2349 } 2350 2351 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2352 vextractf128_high(vtmp, src); 2353 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2354 unorderedReduce2D(opcode, dst, vtmp); 2355 } 2356 2357 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2358 vextractf64x4_high(vtmp2, src); 2359 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2360 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2361 } 2362 2363 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2364 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2365 } 2366 2367 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2368 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2369 } 2370 2371 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2372 int vec_enc) { 2373 switch(elem_bt) { 2374 case T_INT: 2375 case T_FLOAT: 2376 vmaskmovps(dst, src, mask, vec_enc); 2377 break; 2378 case T_LONG: 2379 case T_DOUBLE: 2380 vmaskmovpd(dst, src, mask, vec_enc); 2381 break; 2382 default: 2383 fatal("Unsupported type %s", type2name(elem_bt)); 2384 break; 2385 } 2386 } 2387 2388 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2389 int vec_enc) { 2390 switch(elem_bt) { 2391 case T_INT: 2392 case T_FLOAT: 2393 vmaskmovps(dst, src, mask, vec_enc); 2394 break; 2395 case T_LONG: 2396 case T_DOUBLE: 2397 vmaskmovpd(dst, src, mask, vec_enc); 2398 break; 2399 default: 2400 fatal("Unsupported type %s", type2name(elem_bt)); 2401 break; 2402 } 2403 } 2404 2405 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2406 XMMRegister dst, XMMRegister src, 2407 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2408 XMMRegister xmm_0, XMMRegister xmm_1) { 2409 const int permconst[] = {1, 14}; 2410 XMMRegister wsrc = src; 2411 XMMRegister wdst = xmm_0; 2412 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2413 2414 int vlen_enc = Assembler::AVX_128bit; 2415 if (vlen == 16) { 2416 vlen_enc = Assembler::AVX_256bit; 2417 } 2418 2419 for (int i = log2(vlen) - 1; i >=0; i--) { 2420 if (i == 0 && !is_dst_valid) { 2421 wdst = dst; 2422 } 2423 if (i == 3) { 2424 vextracti64x4_high(wtmp, wsrc); 2425 } else if (i == 2) { 2426 vextracti128_high(wtmp, wsrc); 2427 } else { // i = [0,1] 2428 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2429 } 2430 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2431 wsrc = wdst; 2432 vlen_enc = Assembler::AVX_128bit; 2433 } 2434 if (is_dst_valid) { 2435 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2436 } 2437 } 2438 2439 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2440 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2441 XMMRegister xmm_0, XMMRegister xmm_1) { 2442 XMMRegister wsrc = src; 2443 XMMRegister wdst = xmm_0; 2444 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2445 int vlen_enc = Assembler::AVX_128bit; 2446 if (vlen == 8) { 2447 vlen_enc = Assembler::AVX_256bit; 2448 } 2449 for (int i = log2(vlen) - 1; i >=0; i--) { 2450 if (i == 0 && !is_dst_valid) { 2451 wdst = dst; 2452 } 2453 if (i == 1) { 2454 vextracti128_high(wtmp, wsrc); 2455 } else if (i == 2) { 2456 vextracti64x4_high(wtmp, wsrc); 2457 } else { 2458 assert(i == 0, "%d", i); 2459 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2460 } 2461 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2462 wsrc = wdst; 2463 vlen_enc = Assembler::AVX_128bit; 2464 } 2465 if (is_dst_valid) { 2466 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2467 } 2468 } 2469 2470 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2471 switch (bt) { 2472 case T_BYTE: pextrb(dst, src, idx); break; 2473 case T_SHORT: pextrw(dst, src, idx); break; 2474 case T_INT: pextrd(dst, src, idx); break; 2475 case T_LONG: pextrq(dst, src, idx); break; 2476 2477 default: 2478 assert(false,"Should not reach here."); 2479 break; 2480 } 2481 } 2482 2483 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2484 int esize = type2aelembytes(typ); 2485 int elem_per_lane = 16/esize; 2486 int lane = elemindex / elem_per_lane; 2487 int eindex = elemindex % elem_per_lane; 2488 2489 if (lane >= 2) { 2490 assert(UseAVX > 2, "required"); 2491 vextractf32x4(dst, src, lane & 3); 2492 return dst; 2493 } else if (lane > 0) { 2494 assert(UseAVX > 0, "required"); 2495 vextractf128(dst, src, lane); 2496 return dst; 2497 } else { 2498 return src; 2499 } 2500 } 2501 2502 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2503 if (typ == T_BYTE) { 2504 movsbl(dst, dst); 2505 } else if (typ == T_SHORT) { 2506 movswl(dst, dst); 2507 } 2508 } 2509 2510 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2511 int esize = type2aelembytes(typ); 2512 int elem_per_lane = 16/esize; 2513 int eindex = elemindex % elem_per_lane; 2514 assert(is_integral_type(typ),"required"); 2515 2516 if (eindex == 0) { 2517 if (typ == T_LONG) { 2518 movq(dst, src); 2519 } else { 2520 movdl(dst, src); 2521 movsxl(typ, dst); 2522 } 2523 } else { 2524 extract(typ, dst, src, eindex); 2525 movsxl(typ, dst); 2526 } 2527 } 2528 2529 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
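  // Floating-point variant of get_elem: shuffle/shift the requested element
  // (an index within a single 128-bit lane; callers narrow wider vectors to
  // the right lane first, e.g. via get_lane above) into the low position of
  // dst, then clear everything above it: a 32-bit mask for T_FLOAT, while
  // the movq already zeroes the upper half for T_DOUBLE.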
2530 int esize = type2aelembytes(typ); 2531 int elem_per_lane = 16/esize; 2532 int eindex = elemindex % elem_per_lane; 2533 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2534 2535 if (eindex == 0) { 2536 movq(dst, src); 2537 } else { 2538 if (typ == T_FLOAT) { 2539 if (UseAVX == 0) { 2540 movdqu(dst, src); 2541 shufps(dst, dst, eindex); 2542 } else { 2543 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2544 } 2545 } else { 2546 if (UseAVX == 0) { 2547 movdqu(dst, src); 2548 psrldq(dst, eindex*esize); 2549 } else { 2550 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2551 } 2552 movq(dst, dst); 2553 } 2554 } 2555 // Zero upper bits 2556 if (typ == T_FLOAT) { 2557 if (UseAVX == 0) { 2558 assert(vtmp != xnoreg, "required."); 2559 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2560 pand(dst, vtmp); 2561 } else { 2562 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2563 } 2564 } 2565 } 2566 2567 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2568 switch(typ) { 2569 case T_BYTE: 2570 case T_BOOLEAN: 2571 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2572 break; 2573 case T_SHORT: 2574 case T_CHAR: 2575 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2576 break; 2577 case T_INT: 2578 case T_FLOAT: 2579 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2580 break; 2581 case T_LONG: 2582 case T_DOUBLE: 2583 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2584 break; 2585 default: 2586 assert(false,"Should not reach here."); 2587 break; 2588 } 2589 } 2590 2591 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2592 assert(rscratch != noreg || always_reachable(src2), "missing"); 2593 2594 switch(typ) { 2595 case T_BOOLEAN: 2596 case T_BYTE: 2597 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2598 break; 2599 case T_CHAR: 2600 case T_SHORT: 2601 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2602 break; 2603 case T_INT: 2604 case T_FLOAT: 2605 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2606 break; 2607 case T_LONG: 2608 case T_DOUBLE: 2609 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2610 break; 2611 default: 2612 assert(false,"Should not reach here."); 2613 break; 2614 } 2615 } 2616 2617 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2618 switch(typ) { 2619 case T_BYTE: 2620 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2621 break; 2622 case T_SHORT: 2623 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2624 break; 2625 case T_INT: 2626 case T_FLOAT: 2627 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2628 break; 2629 case T_LONG: 2630 case T_DOUBLE: 2631 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2632 break; 2633 default: 2634 assert(false,"Should not reach here."); 2635 break; 2636 } 2637 } 2638 2639 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2640 
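  // Emits a flag-setting test over the significant vlen_in_bytes of src1 and
  // src2 (ptest/vptest for sub-dword elements, vtestps once elements are at
  // least 4 bytes wide and AVX is available) so callers can branch on ZF/CF
  // directly instead of moving a mask into a general-purpose register.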
assert(vlen_in_bytes <= 32, ""); 2641 int esize = type2aelembytes(bt); 2642 if (vlen_in_bytes == 32) { 2643 assert(vtmp == xnoreg, "required."); 2644 if (esize >= 4) { 2645 vtestps(src1, src2, AVX_256bit); 2646 } else { 2647 vptest(src1, src2, AVX_256bit); 2648 } 2649 return; 2650 } 2651 if (vlen_in_bytes < 16) { 2652 // Duplicate the lower part to fill the whole register, 2653 // Don't need to do so for src2 2654 assert(vtmp != xnoreg, "required"); 2655 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2656 pshufd(vtmp, src1, shuffle_imm); 2657 } else { 2658 assert(vtmp == xnoreg, "required"); 2659 vtmp = src1; 2660 } 2661 if (esize >= 4 && VM_Version::supports_avx()) { 2662 vtestps(vtmp, src2, AVX_128bit); 2663 } else { 2664 ptest(vtmp, src2); 2665 } 2666 } 2667 2668 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2669 assert(UseAVX >= 2, "required"); 2670 #ifdef ASSERT 2671 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2672 bool is_bw_supported = VM_Version::supports_avx512bw(); 2673 if (is_bw && !is_bw_supported) { 2674 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2675 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2676 "XMM register should be 0-15"); 2677 } 2678 #endif // ASSERT 2679 switch (elem_bt) { 2680 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2681 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2682 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2683 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2684 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2685 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2686 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2687 } 2688 } 2689 2690 #ifdef _LP64 2691 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2692 assert(UseAVX >= 2, "required"); 2693 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2694 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2695 if ((UseAVX > 2) && 2696 (!is_bw || VM_Version::supports_avx512bw()) && 2697 (!is_vl || VM_Version::supports_avx512vl())) { 2698 switch (elem_bt) { 2699 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2700 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2701 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2702 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2703 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2704 } 2705 } else { 2706 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2707 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2708 switch (elem_bt) { 2709 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2710 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2711 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2712 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2713 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2714 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2715 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2716 } 2717 } 2718 } 2719 #endif 2720 2721 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2722 switch (to_elem_bt) { 2723 case T_SHORT: 2724 vpmovsxbw(dst, src, vlen_enc); 2725 
break; 2726 case T_INT: 2727 vpmovsxbd(dst, src, vlen_enc); 2728 break; 2729 case T_FLOAT: 2730 vpmovsxbd(dst, src, vlen_enc); 2731 vcvtdq2ps(dst, dst, vlen_enc); 2732 break; 2733 case T_LONG: 2734 vpmovsxbq(dst, src, vlen_enc); 2735 break; 2736 case T_DOUBLE: { 2737 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2738 vpmovsxbd(dst, src, mid_vlen_enc); 2739 vcvtdq2pd(dst, dst, vlen_enc); 2740 break; 2741 } 2742 default: 2743 fatal("Unsupported type %s", type2name(to_elem_bt)); 2744 break; 2745 } 2746 } 2747 2748 //------------------------------------------------------------------------------------------- 2749 2750 // IndexOf for constant substrings with size >= 8 chars 2751 // which don't need to be loaded through stack. 2752 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2753 Register cnt1, Register cnt2, 2754 int int_cnt2, Register result, 2755 XMMRegister vec, Register tmp, 2756 int ae) { 2757 ShortBranchVerifier sbv(this); 2758 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2759 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2760 2761 // This method uses the pcmpestri instruction with bound registers 2762 // inputs: 2763 // xmm - substring 2764 // rax - substring length (elements count) 2765 // mem - scanned string 2766 // rdx - string length (elements count) 2767 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2768 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2769 // outputs: 2770 // rcx - matched index in string 2771 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2772 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2773 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2774 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2775 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2776 2777 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2778 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2779 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2780 2781 // Note, inline_string_indexOf() generates checks: 2782 // if (substr.count > string.count) return -1; 2783 // if (substr.count == 0) return 0; 2784 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2785 2786 // Load substring. 2787 if (ae == StrIntrinsicNode::UL) { 2788 pmovzxbw(vec, Address(str2, 0)); 2789 } else { 2790 movdqu(vec, Address(str2, 0)); 2791 } 2792 movl(cnt2, int_cnt2); 2793 movptr(result, str1); // string addr 2794 2795 if (int_cnt2 > stride) { 2796 jmpb(SCAN_TO_SUBSTR); 2797 2798 // Reload substr for rescan, this code 2799 // is executed only for large substrings (> 8 chars) 2800 bind(RELOAD_SUBSTR); 2801 if (ae == StrIntrinsicNode::UL) { 2802 pmovzxbw(vec, Address(str2, 0)); 2803 } else { 2804 movdqu(vec, Address(str2, 0)); 2805 } 2806 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2807 2808 bind(RELOAD_STR); 2809 // We came here after the beginning of the substring was 2810 // matched but the rest of it was not so we need to search 2811 // again. Start from the next element after the previous match. 2812 2813 // cnt2 is number of substring reminding elements and 2814 // cnt1 is number of string reminding elements when cmp failed. 
2815 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2816 subl(cnt1, cnt2);
2817 addl(cnt1, int_cnt2);
2818 movl(cnt2, int_cnt2); // Now restore cnt2
2819
2820 decrementl(cnt1); // Shift to next element
2821 cmpl(cnt1, cnt2);
2822 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2823
2824 addptr(result, (1<<scale1));
2825
2826 } // (int_cnt2 > 8)
2827
2828 // Scan string for start of substr in 16-byte vectors
2829 bind(SCAN_TO_SUBSTR);
2830 pcmpestri(vec, Address(result, 0), mode);
2831 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2832 subl(cnt1, stride);
2833 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2834 cmpl(cnt1, cnt2);
2835 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2836 addptr(result, 16);
2837 jmpb(SCAN_TO_SUBSTR);
2838
2839 // Found a potential substr
2840 bind(FOUND_CANDIDATE);
2841 // Matched whole vector if first element matched (tmp(rcx) == 0).
2842 if (int_cnt2 == stride) {
2843 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2844 } else { // int_cnt2 > 8
2845 jccb(Assembler::overflow, FOUND_SUBSTR);
2846 }
2847 // After pcmpestri tmp(rcx) contains matched element index
2848 // Compute start addr of substr
2849 lea(result, Address(result, tmp, scale1));
2850
2851 // Make sure string is still long enough
2852 subl(cnt1, tmp);
2853 cmpl(cnt1, cnt2);
2854 if (int_cnt2 == stride) {
2855 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2856 } else { // int_cnt2 > 8
2857 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2858 }
2859 // Left less than substring.
2860
2861 bind(RET_NOT_FOUND);
2862 movl(result, -1);
2863 jmp(EXIT);
2864
2865 if (int_cnt2 > stride) {
2866 // This code is optimized for the case when the whole substring
2867 // is matched if its head is matched.
2868 bind(MATCH_SUBSTR_HEAD);
2869 pcmpestri(vec, Address(result, 0), mode);
2870 // Reload only the string if it does not match
2871 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2872
2873 Label CONT_SCAN_SUBSTR;
2874 // Compare the rest of substring (> 8 chars).
2875 bind(FOUND_SUBSTR);
2876 // First 8 chars are already matched.
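      // cnt2 is negated below so it counts up towards zero: each SCAN_SUBSTR
      // iteration compares the next vector-sized chunk of the substring tail,
      // and the index arithmetic keeps the final chunk from reading past the
      // end of the substring.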
2877 negptr(cnt2); 2878 addptr(cnt2, stride); 2879 2880 bind(SCAN_SUBSTR); 2881 subl(cnt1, stride); 2882 cmpl(cnt2, -stride); // Do not read beyond substring 2883 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2884 // Back-up strings to avoid reading beyond substring: 2885 // cnt1 = cnt1 - cnt2 + 8 2886 addl(cnt1, cnt2); // cnt2 is negative 2887 addl(cnt1, stride); 2888 movl(cnt2, stride); negptr(cnt2); 2889 bind(CONT_SCAN_SUBSTR); 2890 if (int_cnt2 < (int)G) { 2891 int tail_off1 = int_cnt2<<scale1; 2892 int tail_off2 = int_cnt2<<scale2; 2893 if (ae == StrIntrinsicNode::UL) { 2894 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2895 } else { 2896 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2897 } 2898 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2899 } else { 2900 // calculate index in register to avoid integer overflow (int_cnt2*2) 2901 movl(tmp, int_cnt2); 2902 addptr(tmp, cnt2); 2903 if (ae == StrIntrinsicNode::UL) { 2904 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2905 } else { 2906 movdqu(vec, Address(str2, tmp, scale2, 0)); 2907 } 2908 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2909 } 2910 // Need to reload strings pointers if not matched whole vector 2911 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2912 addptr(cnt2, stride); 2913 jcc(Assembler::negative, SCAN_SUBSTR); 2914 // Fall through if found full substring 2915 2916 } // (int_cnt2 > 8) 2917 2918 bind(RET_FOUND); 2919 // Found result if we matched full small substring. 2920 // Compute substr offset 2921 subptr(result, str1); 2922 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2923 shrl(result, 1); // index 2924 } 2925 bind(EXIT); 2926 2927 } // string_indexofC8 2928 2929 // Small strings are loaded through stack if they cross page boundary. 2930 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2931 Register cnt1, Register cnt2, 2932 int int_cnt2, Register result, 2933 XMMRegister vec, Register tmp, 2934 int ae) { 2935 ShortBranchVerifier sbv(this); 2936 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2937 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2938 2939 // 2940 // int_cnt2 is length of small (< 8 chars) constant substring 2941 // or (-1) for non constant substring in which case its length 2942 // is in cnt2 register. 2943 // 2944 // Note, inline_string_indexOf() generates checks: 2945 // if (substr.count > string.count) return -1; 2946 // if (substr.count == 0) return 0; 2947 // 2948 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2949 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2950 // This method uses the pcmpestri instruction with bound registers 2951 // inputs: 2952 // xmm - substring 2953 // rax - substring length (elements count) 2954 // mem - scanned string 2955 // rdx - string length (elements count) 2956 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2957 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2958 // outputs: 2959 // rcx - matched index in string 2960 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2961 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2962 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2963 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2964 2965 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2966 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2967 FOUND_CANDIDATE; 2968 2969 { //======================================================== 2970 // We don't know where these strings are located 2971 // and we can't read beyond them. Load them through stack. 2972 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2973 2974 movptr(tmp, rsp); // save old SP 2975 2976 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2977 if (int_cnt2 == (1>>scale2)) { // One byte 2978 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2979 load_unsigned_byte(result, Address(str2, 0)); 2980 movdl(vec, result); // move 32 bits 2981 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2982 // Not enough header space in 32-bit VM: 12+3 = 15. 2983 movl(result, Address(str2, -1)); 2984 shrl(result, 8); 2985 movdl(vec, result); // move 32 bits 2986 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2987 load_unsigned_short(result, Address(str2, 0)); 2988 movdl(vec, result); // move 32 bits 2989 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2990 movdl(vec, Address(str2, 0)); // move 32 bits 2991 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2992 movq(vec, Address(str2, 0)); // move 64 bits 2993 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2994 // Array header size is 12 bytes in 32-bit VM 2995 // + 6 bytes for 3 chars == 18 bytes, 2996 // enough space to load vec and shift. 2997 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2998 if (ae == StrIntrinsicNode::UL) { 2999 int tail_off = int_cnt2-8; 3000 pmovzxbw(vec, Address(str2, tail_off)); 3001 psrldq(vec, -2*tail_off); 3002 } 3003 else { 3004 int tail_off = int_cnt2*(1<<scale2); 3005 movdqu(vec, Address(str2, tail_off-16)); 3006 psrldq(vec, 16-tail_off); 3007 } 3008 } 3009 } else { // not constant substring 3010 cmpl(cnt2, stride); 3011 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3012 3013 // We can read beyond string if srt+16 does not cross page boundary 3014 // since heaps are aligned and mapped by pages. 3015 assert(os::vm_page_size() < (int)G, "default page should be small"); 3016 movl(result, str2); // We need only low 32 bits 3017 andl(result, ((int)os::vm_page_size()-1)); 3018 cmpl(result, ((int)os::vm_page_size()-16)); 3019 jccb(Assembler::belowEqual, CHECK_STR); 3020 3021 // Move small strings to stack to allow load 16 bytes into vec. 3022 subptr(rsp, 16); 3023 int stk_offset = wordSize-(1<<scale2); 3024 push(cnt2); 3025 3026 bind(COPY_SUBSTR); 3027 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3028 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3029 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3030 } else if (ae == StrIntrinsicNode::UU) { 3031 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3032 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3033 } 3034 decrement(cnt2); 3035 jccb(Assembler::notZero, COPY_SUBSTR); 3036 3037 pop(cnt2); 3038 movptr(str2, rsp); // New substring address 3039 } // non constant 3040 3041 bind(CHECK_STR); 3042 cmpl(cnt1, stride); 3043 jccb(Assembler::aboveEqual, BIG_STRINGS); 3044 3045 // Check cross page boundary. 
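    // (addr & (page_size - 1)) <= page_size - 16 guarantees that a 16-byte
    // load starting at addr stays within its page, so the string can be read
    // in place; otherwise it is copied onto the stack below first.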
3046 movl(result, str1); // We need only low 32 bits 3047 andl(result, ((int)os::vm_page_size()-1)); 3048 cmpl(result, ((int)os::vm_page_size()-16)); 3049 jccb(Assembler::belowEqual, BIG_STRINGS); 3050 3051 subptr(rsp, 16); 3052 int stk_offset = -(1<<scale1); 3053 if (int_cnt2 < 0) { // not constant 3054 push(cnt2); 3055 stk_offset += wordSize; 3056 } 3057 movl(cnt2, cnt1); 3058 3059 bind(COPY_STR); 3060 if (ae == StrIntrinsicNode::LL) { 3061 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3062 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3063 } else { 3064 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3065 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3066 } 3067 decrement(cnt2); 3068 jccb(Assembler::notZero, COPY_STR); 3069 3070 if (int_cnt2 < 0) { // not constant 3071 pop(cnt2); 3072 } 3073 movptr(str1, rsp); // New string address 3074 3075 bind(BIG_STRINGS); 3076 // Load substring. 3077 if (int_cnt2 < 0) { // -1 3078 if (ae == StrIntrinsicNode::UL) { 3079 pmovzxbw(vec, Address(str2, 0)); 3080 } else { 3081 movdqu(vec, Address(str2, 0)); 3082 } 3083 push(cnt2); // substr count 3084 push(str2); // substr addr 3085 push(str1); // string addr 3086 } else { 3087 // Small (< 8 chars) constant substrings are loaded already. 3088 movl(cnt2, int_cnt2); 3089 } 3090 push(tmp); // original SP 3091 3092 } // Finished loading 3093 3094 //======================================================== 3095 // Start search 3096 // 3097 3098 movptr(result, str1); // string addr 3099 3100 if (int_cnt2 < 0) { // Only for non constant substring 3101 jmpb(SCAN_TO_SUBSTR); 3102 3103 // SP saved at sp+0 3104 // String saved at sp+1*wordSize 3105 // Substr saved at sp+2*wordSize 3106 // Substr count saved at sp+3*wordSize 3107 3108 // Reload substr for rescan, this code 3109 // is executed only for large substrings (> 8 chars) 3110 bind(RELOAD_SUBSTR); 3111 movptr(str2, Address(rsp, 2*wordSize)); 3112 movl(cnt2, Address(rsp, 3*wordSize)); 3113 if (ae == StrIntrinsicNode::UL) { 3114 pmovzxbw(vec, Address(str2, 0)); 3115 } else { 3116 movdqu(vec, Address(str2, 0)); 3117 } 3118 // We came here after the beginning of the substring was 3119 // matched but the rest of it was not so we need to search 3120 // again. Start from the next element after the previous match. 3121 subptr(str1, result); // Restore counter 3122 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3123 shrl(str1, 1); 3124 } 3125 addl(cnt1, str1); 3126 decrementl(cnt1); // Shift to next element 3127 cmpl(cnt1, cnt2); 3128 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3129 3130 addptr(result, (1<<scale1)); 3131 } // non constant 3132 3133 // Scan string for start of substr in 16-byte vectors 3134 bind(SCAN_TO_SUBSTR); 3135 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3136 pcmpestri(vec, Address(result, 0), mode); 3137 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3138 subl(cnt1, stride); 3139 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3140 cmpl(cnt1, cnt2); 3141 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3142 addptr(result, 16); 3143 3144 bind(ADJUST_STR); 3145 cmpl(cnt1, stride); // Do not read beyond string 3146 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3147 // Back-up string to avoid reading beyond string. 
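  // Fewer than 'stride' elements are left, so rewind to the last full 16-byte
  // window of the string (result += cnt1 * element_size - 16) and rescan it with
  // cnt1 forced to a full stride; a match found here lies entirely inside the string.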
3148 lea(result, Address(result, cnt1, scale1, -16)); 3149 movl(cnt1, stride); 3150 jmpb(SCAN_TO_SUBSTR); 3151 3152 // Found a potential substr 3153 bind(FOUND_CANDIDATE); 3154 // After pcmpestri tmp(rcx) contains matched element index 3155 3156 // Make sure string is still long enough 3157 subl(cnt1, tmp); 3158 cmpl(cnt1, cnt2); 3159 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3160 // Left less then substring. 3161 3162 bind(RET_NOT_FOUND); 3163 movl(result, -1); 3164 jmp(CLEANUP); 3165 3166 bind(FOUND_SUBSTR); 3167 // Compute start addr of substr 3168 lea(result, Address(result, tmp, scale1)); 3169 if (int_cnt2 > 0) { // Constant substring 3170 // Repeat search for small substring (< 8 chars) 3171 // from new point without reloading substring. 3172 // Have to check that we don't read beyond string. 3173 cmpl(tmp, stride-int_cnt2); 3174 jccb(Assembler::greater, ADJUST_STR); 3175 // Fall through if matched whole substring. 3176 } else { // non constant 3177 assert(int_cnt2 == -1, "should be != 0"); 3178 3179 addl(tmp, cnt2); 3180 // Found result if we matched whole substring. 3181 cmpl(tmp, stride); 3182 jcc(Assembler::lessEqual, RET_FOUND); 3183 3184 // Repeat search for small substring (<= 8 chars) 3185 // from new point 'str1' without reloading substring. 3186 cmpl(cnt2, stride); 3187 // Have to check that we don't read beyond string. 3188 jccb(Assembler::lessEqual, ADJUST_STR); 3189 3190 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3191 // Compare the rest of substring (> 8 chars). 3192 movptr(str1, result); 3193 3194 cmpl(tmp, cnt2); 3195 // First 8 chars are already matched. 3196 jccb(Assembler::equal, CHECK_NEXT); 3197 3198 bind(SCAN_SUBSTR); 3199 pcmpestri(vec, Address(str1, 0), mode); 3200 // Need to reload strings pointers if not matched whole vector 3201 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3202 3203 bind(CHECK_NEXT); 3204 subl(cnt2, stride); 3205 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3206 addptr(str1, 16); 3207 if (ae == StrIntrinsicNode::UL) { 3208 addptr(str2, 8); 3209 } else { 3210 addptr(str2, 16); 3211 } 3212 subl(cnt1, stride); 3213 cmpl(cnt2, stride); // Do not read beyond substring 3214 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3215 // Back-up strings to avoid reading beyond substring. 
3216 3217 if (ae == StrIntrinsicNode::UL) { 3218 lea(str2, Address(str2, cnt2, scale2, -8)); 3219 lea(str1, Address(str1, cnt2, scale1, -16)); 3220 } else { 3221 lea(str2, Address(str2, cnt2, scale2, -16)); 3222 lea(str1, Address(str1, cnt2, scale1, -16)); 3223 } 3224 subl(cnt1, cnt2); 3225 movl(cnt2, stride); 3226 addl(cnt1, stride); 3227 bind(CONT_SCAN_SUBSTR); 3228 if (ae == StrIntrinsicNode::UL) { 3229 pmovzxbw(vec, Address(str2, 0)); 3230 } else { 3231 movdqu(vec, Address(str2, 0)); 3232 } 3233 jmp(SCAN_SUBSTR); 3234 3235 bind(RET_FOUND_LONG); 3236 movptr(str1, Address(rsp, wordSize)); 3237 } // non constant 3238 3239 bind(RET_FOUND); 3240 // Compute substr offset 3241 subptr(result, str1); 3242 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3243 shrl(result, 1); // index 3244 } 3245 bind(CLEANUP); 3246 pop(rsp); // restore SP 3247 3248 } // string_indexof 3249 3250 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3251 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3252 ShortBranchVerifier sbv(this); 3253 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3254 3255 int stride = 8; 3256 3257 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3258 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3259 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3260 FOUND_SEQ_CHAR, DONE_LABEL; 3261 3262 movptr(result, str1); 3263 if (UseAVX >= 2) { 3264 cmpl(cnt1, stride); 3265 jcc(Assembler::less, SCAN_TO_CHAR); 3266 cmpl(cnt1, 2*stride); 3267 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3268 movdl(vec1, ch); 3269 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3270 vpxor(vec2, vec2); 3271 movl(tmp, cnt1); 3272 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3273 andl(cnt1,0x0000000F); //tail count (in chars) 3274 3275 bind(SCAN_TO_16_CHAR_LOOP); 3276 vmovdqu(vec3, Address(result, 0)); 3277 vpcmpeqw(vec3, vec3, vec1, 1); 3278 vptest(vec2, vec3); 3279 jcc(Assembler::carryClear, FOUND_CHAR); 3280 addptr(result, 32); 3281 subl(tmp, 2*stride); 3282 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3283 jmp(SCAN_TO_8_CHAR); 3284 bind(SCAN_TO_8_CHAR_INIT); 3285 movdl(vec1, ch); 3286 pshuflw(vec1, vec1, 0x00); 3287 pshufd(vec1, vec1, 0); 3288 pxor(vec2, vec2); 3289 } 3290 bind(SCAN_TO_8_CHAR); 3291 cmpl(cnt1, stride); 3292 jcc(Assembler::less, SCAN_TO_CHAR); 3293 if (UseAVX < 2) { 3294 movdl(vec1, ch); 3295 pshuflw(vec1, vec1, 0x00); 3296 pshufd(vec1, vec1, 0); 3297 pxor(vec2, vec2); 3298 } 3299 movl(tmp, cnt1); 3300 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3301 andl(cnt1,0x00000007); //tail count (in chars) 3302 3303 bind(SCAN_TO_8_CHAR_LOOP); 3304 movdqu(vec3, Address(result, 0)); 3305 pcmpeqw(vec3, vec1); 3306 ptest(vec2, vec3); 3307 jcc(Assembler::carryClear, FOUND_CHAR); 3308 addptr(result, 16); 3309 subl(tmp, stride); 3310 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3311 bind(SCAN_TO_CHAR); 3312 testl(cnt1, cnt1); 3313 jcc(Assembler::zero, RET_NOT_FOUND); 3314 bind(SCAN_TO_CHAR_LOOP); 3315 load_unsigned_short(tmp, Address(result, 0)); 3316 cmpl(ch, tmp); 3317 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3318 addptr(result, 2); 3319 subl(cnt1, 1); 3320 jccb(Assembler::zero, RET_NOT_FOUND); 3321 jmp(SCAN_TO_CHAR_LOOP); 3322 3323 bind(RET_NOT_FOUND); 3324 movl(result, -1); 3325 jmpb(DONE_LABEL); 3326 3327 bind(FOUND_CHAR); 3328 if (UseAVX >= 2) { 3329 vpmovmskb(tmp, vec3); 3330 } else { 3331 pmovmskb(tmp, vec3); 3332 } 3333 bsfl(ch, tmp); 3334 addptr(result, ch); 3335 3336 bind(FOUND_SEQ_CHAR); 3337 
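  // 'result' holds the address of the matching char; the code below turns it into
  // a char index relative to str1 (subtract the base, then halve the byte offset).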
subptr(result, str1); 3338 shrl(result, 1); 3339 3340 bind(DONE_LABEL); 3341 } // string_indexof_char 3342 3343 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3344 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3345 ShortBranchVerifier sbv(this); 3346 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3347 3348 int stride = 16; 3349 3350 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3351 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3352 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3353 FOUND_SEQ_CHAR, DONE_LABEL; 3354 3355 movptr(result, str1); 3356 if (UseAVX >= 2) { 3357 cmpl(cnt1, stride); 3358 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3359 cmpl(cnt1, stride*2); 3360 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3361 movdl(vec1, ch); 3362 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3363 vpxor(vec2, vec2); 3364 movl(tmp, cnt1); 3365 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3366 andl(cnt1,0x0000001F); //tail count (in chars) 3367 3368 bind(SCAN_TO_32_CHAR_LOOP); 3369 vmovdqu(vec3, Address(result, 0)); 3370 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3371 vptest(vec2, vec3); 3372 jcc(Assembler::carryClear, FOUND_CHAR); 3373 addptr(result, 32); 3374 subl(tmp, stride*2); 3375 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3376 jmp(SCAN_TO_16_CHAR); 3377 3378 bind(SCAN_TO_16_CHAR_INIT); 3379 movdl(vec1, ch); 3380 pxor(vec2, vec2); 3381 pshufb(vec1, vec2); 3382 } 3383 3384 bind(SCAN_TO_16_CHAR); 3385 cmpl(cnt1, stride); 3386 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3387 if (UseAVX < 2) { 3388 movdl(vec1, ch); 3389 pxor(vec2, vec2); 3390 pshufb(vec1, vec2); 3391 } 3392 movl(tmp, cnt1); 3393 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3394 andl(cnt1,0x0000000F); //tail count (in bytes) 3395 3396 bind(SCAN_TO_16_CHAR_LOOP); 3397 movdqu(vec3, Address(result, 0)); 3398 pcmpeqb(vec3, vec1); 3399 ptest(vec2, vec3); 3400 jcc(Assembler::carryClear, FOUND_CHAR); 3401 addptr(result, 16); 3402 subl(tmp, stride); 3403 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
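  // Fewer than 16 bytes of the Latin-1 string remain; fall through and finish
  // with the byte-at-a-time scan below.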
3404 3405 bind(SCAN_TO_CHAR_INIT); 3406 testl(cnt1, cnt1); 3407 jcc(Assembler::zero, RET_NOT_FOUND); 3408 bind(SCAN_TO_CHAR_LOOP); 3409 load_unsigned_byte(tmp, Address(result, 0)); 3410 cmpl(ch, tmp); 3411 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3412 addptr(result, 1); 3413 subl(cnt1, 1); 3414 jccb(Assembler::zero, RET_NOT_FOUND); 3415 jmp(SCAN_TO_CHAR_LOOP); 3416 3417 bind(RET_NOT_FOUND); 3418 movl(result, -1); 3419 jmpb(DONE_LABEL); 3420 3421 bind(FOUND_CHAR); 3422 if (UseAVX >= 2) { 3423 vpmovmskb(tmp, vec3); 3424 } else { 3425 pmovmskb(tmp, vec3); 3426 } 3427 bsfl(ch, tmp); 3428 addptr(result, ch); 3429 3430 bind(FOUND_SEQ_CHAR); 3431 subptr(result, str1); 3432 3433 bind(DONE_LABEL); 3434 } // stringL_indexof_char 3435 3436 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3437 switch (eltype) { 3438 case T_BOOLEAN: return sizeof(jboolean); 3439 case T_BYTE: return sizeof(jbyte); 3440 case T_SHORT: return sizeof(jshort); 3441 case T_CHAR: return sizeof(jchar); 3442 case T_INT: return sizeof(jint); 3443 default: 3444 ShouldNotReachHere(); 3445 return -1; 3446 } 3447 } 3448 3449 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3450 switch (eltype) { 3451 // T_BOOLEAN used as surrogate for unsigned byte 3452 case T_BOOLEAN: movzbl(dst, src); break; 3453 case T_BYTE: movsbl(dst, src); break; 3454 case T_SHORT: movswl(dst, src); break; 3455 case T_CHAR: movzwl(dst, src); break; 3456 case T_INT: movl(dst, src); break; 3457 default: 3458 ShouldNotReachHere(); 3459 } 3460 } 3461 3462 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3463 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3464 } 3465 3466 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3467 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3468 } 3469 3470 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3471 const int vlen = Assembler::AVX_256bit; 3472 switch (eltype) { 3473 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3474 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3475 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3476 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3477 case T_INT: 3478 // do nothing 3479 break; 3480 default: 3481 ShouldNotReachHere(); 3482 } 3483 } 3484 3485 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3486 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3487 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3488 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3489 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3490 BasicType eltype) { 3491 ShortBranchVerifier sbv(this); 3492 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3493 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3494 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3495 3496 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3497 SHORT_UNROLLED_LOOP_EXIT, 3498 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3499 UNROLLED_VECTOR_LOOP_BEGIN, 3500 END; 3501 switch (eltype) { 3502 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3503 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3504 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3505 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3506 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3507 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3508 } 3509 3510 // For "renaming" for readibility of the code 3511 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3512 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3513 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3514 3515 const int elsize = arrays_hashcode_elsize(eltype); 3516 3517 /* 3518 if (cnt1 >= 2) { 3519 if (cnt1 >= 32) { 3520 UNROLLED VECTOR LOOP 3521 } 3522 UNROLLED SCALAR LOOP 3523 } 3524 SINGLE SCALAR 3525 */ 3526 3527 cmpl(cnt1, 32); 3528 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3529 3530 // cnt1 >= 32 && generate_vectorized_loop 3531 xorl(index, index); 3532 3533 // vresult = IntVector.zero(I256); 3534 for (int idx = 0; idx < 4; idx++) { 3535 vpxor(vresult[idx], vresult[idx]); 3536 } 3537 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3538 Register bound = tmp2; 3539 Register next = tmp3; 3540 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3541 movl(next, Address(tmp2, 0)); 3542 movdl(vnext, next); 3543 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3544 3545 // index = 0; 3546 // bound = cnt1 & ~(32 - 1); 3547 movl(bound, cnt1); 3548 andl(bound, ~(32 - 1)); 3549 // for (; index < bound; index += 32) { 3550 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3551 // result *= next; 3552 imull(result, next); 3553 // loop fission to upfront the cost of fetching from memory, OOO execution 3554 // can then hopefully do a better job of prefetching 3555 for (int idx = 0; idx < 4; idx++) { 3556 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3557 } 3558 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3559 for (int idx = 0; idx < 4; idx++) { 3560 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3561 arrays_hashcode_elvcast(vtmp[idx], eltype); 3562 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3563 } 3564 // index += 32; 3565 addl(index, 32); 3566 // index < bound; 3567 cmpl(index, bound); 3568 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3569 // } 3570 3571 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3572 subl(cnt1, bound); 3573 // release bound 3574 3575 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3576 for (int idx = 0; idx < 4; idx++) { 3577 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3578 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3579 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3580 } 3581 // result += vresult.reduceLanes(ADD); 3582 for (int idx = 0; idx < 4; idx++) { 3583 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3584 } 3585 3586 // } else if (cnt1 < 32) { 3587 3588 bind(SHORT_UNROLLED_BEGIN); 3589 // int i = 1; 3590 movl(index, 1); 3591 cmpl(index, cnt1); 3592 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3593 3594 // for (; i < cnt1 ; i += 2) { 3595 bind(SHORT_UNROLLED_LOOP_BEGIN); 3596 movl(tmp3, 961); 3597 imull(result, tmp3); 3598 
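      // Unrolled-by-two update of the scalar hash: 961 == 31 * 31, so each pass computes
      //   result = 31*31*result + 31*a[index-1] + a[index]
      // with 31*a[index-1] formed below as (a[index-1] << 5) - a[index-1].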
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3599 movl(tmp3, tmp2); 3600 shll(tmp3, 5); 3601 subl(tmp3, tmp2); 3602 addl(result, tmp3); 3603 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3604 addl(result, tmp3); 3605 addl(index, 2); 3606 cmpl(index, cnt1); 3607 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3608 3609 // } 3610 // if (i >= cnt1) { 3611 bind(SHORT_UNROLLED_LOOP_EXIT); 3612 jccb(Assembler::greater, END); 3613 movl(tmp2, result); 3614 shll(result, 5); 3615 subl(result, tmp2); 3616 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3617 addl(result, tmp3); 3618 // } 3619 bind(END); 3620 3621 BLOCK_COMMENT("} // arrays_hashcode"); 3622 3623 } // arrays_hashcode 3624 3625 // helper function for string_compare 3626 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3627 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3628 Address::ScaleFactor scale2, Register index, int ae) { 3629 if (ae == StrIntrinsicNode::LL) { 3630 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3631 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3632 } else if (ae == StrIntrinsicNode::UU) { 3633 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3634 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3635 } else { 3636 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3637 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3638 } 3639 } 3640 3641 // Compare strings, used for char[] and byte[]. 3642 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3643 Register cnt1, Register cnt2, Register result, 3644 XMMRegister vec1, int ae, KRegister mask) { 3645 ShortBranchVerifier sbv(this); 3646 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3647 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3648 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3649 int stride2x2 = 0x40; 3650 Address::ScaleFactor scale = Address::no_scale; 3651 Address::ScaleFactor scale1 = Address::no_scale; 3652 Address::ScaleFactor scale2 = Address::no_scale; 3653 3654 if (ae != StrIntrinsicNode::LL) { 3655 stride2x2 = 0x20; 3656 } 3657 3658 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3659 shrl(cnt2, 1); 3660 } 3661 // Compute the minimum of the string lengths and the 3662 // difference of the string lengths (stack). 3663 // Do the conditional move stuff 3664 movl(result, cnt1); 3665 subl(cnt1, cnt2); 3666 push(cnt1); 3667 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3668 3669 // Is the minimum length zero? 
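  // If min(cnt1, cnt2) is zero, one string is a prefix of the other and the result
  // is simply the length difference that was pushed above.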
3670 testl(cnt2, cnt2); 3671 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3672 if (ae == StrIntrinsicNode::LL) { 3673 // Load first bytes 3674 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3675 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3676 } else if (ae == StrIntrinsicNode::UU) { 3677 // Load first characters 3678 load_unsigned_short(result, Address(str1, 0)); 3679 load_unsigned_short(cnt1, Address(str2, 0)); 3680 } else { 3681 load_unsigned_byte(result, Address(str1, 0)); 3682 load_unsigned_short(cnt1, Address(str2, 0)); 3683 } 3684 subl(result, cnt1); 3685 jcc(Assembler::notZero, POP_LABEL); 3686 3687 if (ae == StrIntrinsicNode::UU) { 3688 // Divide length by 2 to get number of chars 3689 shrl(cnt2, 1); 3690 } 3691 cmpl(cnt2, 1); 3692 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3693 3694 // Check if the strings start at the same location and setup scale and stride 3695 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3696 cmpptr(str1, str2); 3697 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3698 if (ae == StrIntrinsicNode::LL) { 3699 scale = Address::times_1; 3700 stride = 16; 3701 } else { 3702 scale = Address::times_2; 3703 stride = 8; 3704 } 3705 } else { 3706 scale1 = Address::times_1; 3707 scale2 = Address::times_2; 3708 // scale not used 3709 stride = 8; 3710 } 3711 3712 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3713 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3714 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3715 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3716 Label COMPARE_TAIL_LONG; 3717 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3718 3719 int pcmpmask = 0x19; 3720 if (ae == StrIntrinsicNode::LL) { 3721 pcmpmask &= ~0x01; 3722 } 3723 3724 // Setup to compare 16-chars (32-bytes) vectors, 3725 // start from first character again because it has aligned address. 3726 if (ae == StrIntrinsicNode::LL) { 3727 stride2 = 32; 3728 } else { 3729 stride2 = 16; 3730 } 3731 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3732 adr_stride = stride << scale; 3733 } else { 3734 adr_stride1 = 8; //stride << scale1; 3735 adr_stride2 = 16; //stride << scale2; 3736 } 3737 3738 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3739 // rax and rdx are used by pcmpestri as elements counters 3740 movl(result, cnt2); 3741 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3742 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3743 3744 // fast path : compare first 2 8-char vectors. 
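  // pcmpestri imm8 0x19 (0x18 for LL) selects equal-each (string compare) aggregation
  // with negated result, on unsigned words (unsigned bytes for LL); a mismatch sets CF
  // and leaves the index of the first mismatching element in rcx.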
3745 bind(COMPARE_16_CHARS); 3746 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3747 movdqu(vec1, Address(str1, 0)); 3748 } else { 3749 pmovzxbw(vec1, Address(str1, 0)); 3750 } 3751 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3752 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3753 3754 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3755 movdqu(vec1, Address(str1, adr_stride)); 3756 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3757 } else { 3758 pmovzxbw(vec1, Address(str1, adr_stride1)); 3759 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3760 } 3761 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3762 addl(cnt1, stride); 3763 3764 // Compare the characters at index in cnt1 3765 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3766 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3767 subl(result, cnt2); 3768 jmp(POP_LABEL); 3769 3770 // Setup the registers to start vector comparison loop 3771 bind(COMPARE_WIDE_VECTORS); 3772 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3773 lea(str1, Address(str1, result, scale)); 3774 lea(str2, Address(str2, result, scale)); 3775 } else { 3776 lea(str1, Address(str1, result, scale1)); 3777 lea(str2, Address(str2, result, scale2)); 3778 } 3779 subl(result, stride2); 3780 subl(cnt2, stride2); 3781 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3782 negptr(result); 3783 3784 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3785 bind(COMPARE_WIDE_VECTORS_LOOP); 3786 3787 #ifdef _LP64 3788 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3789 cmpl(cnt2, stride2x2); 3790 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3791 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3792 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3793 3794 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3795 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3796 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3797 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3798 } else { 3799 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3800 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3801 } 3802 kortestql(mask, mask); 3803 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3804 addptr(result, stride2x2); // update since we already compared at this addr 3805 subl(cnt2, stride2x2); // and sub the size too 3806 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3807 3808 vpxor(vec1, vec1); 3809 jmpb(COMPARE_WIDE_TAIL); 3810 }//if (VM_Version::supports_avx512vlbw()) 3811 #endif // _LP64 3812 3813 3814 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3816 vmovdqu(vec1, Address(str1, result, scale)); 3817 vpxor(vec1, Address(str2, result, scale)); 3818 } else { 3819 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3820 vpxor(vec1, Address(str2, result, scale2)); 3821 } 3822 vptest(vec1, vec1); 3823 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3824 addptr(result, stride2); 3825 subl(cnt2, stride2); 3826 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3827 // clean upper bits of YMM registers 
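  // (done with a full-width vpxor so that no dirty upper ymm bits linger and trigger
  // AVX/SSE transition penalties in the legacy-SSE pcmpestri code that may follow)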
3828 vpxor(vec1, vec1); 3829 3830 // compare wide vectors tail 3831 bind(COMPARE_WIDE_TAIL); 3832 testptr(result, result); 3833 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3834 3835 movl(result, stride2); 3836 movl(cnt2, result); 3837 negptr(result); 3838 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3839 3840 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3841 bind(VECTOR_NOT_EQUAL); 3842 // clean upper bits of YMM registers 3843 vpxor(vec1, vec1); 3844 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3845 lea(str1, Address(str1, result, scale)); 3846 lea(str2, Address(str2, result, scale)); 3847 } else { 3848 lea(str1, Address(str1, result, scale1)); 3849 lea(str2, Address(str2, result, scale2)); 3850 } 3851 jmp(COMPARE_16_CHARS); 3852 3853 // Compare tail chars, length between 1 to 15 chars 3854 bind(COMPARE_TAIL_LONG); 3855 movl(cnt2, result); 3856 cmpl(cnt2, stride); 3857 jcc(Assembler::less, COMPARE_SMALL_STR); 3858 3859 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3860 movdqu(vec1, Address(str1, 0)); 3861 } else { 3862 pmovzxbw(vec1, Address(str1, 0)); 3863 } 3864 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3865 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3866 subptr(cnt2, stride); 3867 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3868 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3869 lea(str1, Address(str1, result, scale)); 3870 lea(str2, Address(str2, result, scale)); 3871 } else { 3872 lea(str1, Address(str1, result, scale1)); 3873 lea(str2, Address(str2, result, scale2)); 3874 } 3875 negptr(cnt2); 3876 jmpb(WHILE_HEAD_LABEL); 3877 3878 bind(COMPARE_SMALL_STR); 3879 } else if (UseSSE42Intrinsics) { 3880 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3881 int pcmpmask = 0x19; 3882 // Setup to compare 8-char (16-byte) vectors, 3883 // start from first character again because it has aligned address. 
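  // The SSE4.2 loop below uses the usual negative-index idiom: str1/str2 are advanced
  // to the end of the vectorized region and 'result' holds the negative element offset,
  // counting up toward zero; once the vector count in cnt2 is exhausted, -result is
  // the remaining tail element count.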
3884 movl(result, cnt2); 3885 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3886 if (ae == StrIntrinsicNode::LL) { 3887 pcmpmask &= ~0x01; 3888 } 3889 jcc(Assembler::zero, COMPARE_TAIL); 3890 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3891 lea(str1, Address(str1, result, scale)); 3892 lea(str2, Address(str2, result, scale)); 3893 } else { 3894 lea(str1, Address(str1, result, scale1)); 3895 lea(str2, Address(str2, result, scale2)); 3896 } 3897 negptr(result); 3898 3899 // pcmpestri 3900 // inputs: 3901 // vec1- substring 3902 // rax - negative string length (elements count) 3903 // mem - scanned string 3904 // rdx - string length (elements count) 3905 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3906 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3907 // outputs: 3908 // rcx - first mismatched element index 3909 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3910 3911 bind(COMPARE_WIDE_VECTORS); 3912 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3913 movdqu(vec1, Address(str1, result, scale)); 3914 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3915 } else { 3916 pmovzxbw(vec1, Address(str1, result, scale1)); 3917 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3918 } 3919 // After pcmpestri cnt1(rcx) contains mismatched element index 3920 3921 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3922 addptr(result, stride); 3923 subptr(cnt2, stride); 3924 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3925 3926 // compare wide vectors tail 3927 testptr(result, result); 3928 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3929 3930 movl(cnt2, stride); 3931 movl(result, stride); 3932 negptr(result); 3933 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3934 movdqu(vec1, Address(str1, result, scale)); 3935 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3936 } else { 3937 pmovzxbw(vec1, Address(str1, result, scale1)); 3938 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3939 } 3940 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3941 3942 // Mismatched characters in the vectors 3943 bind(VECTOR_NOT_EQUAL); 3944 addptr(cnt1, result); 3945 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3946 subl(result, cnt2); 3947 jmpb(POP_LABEL); 3948 3949 bind(COMPARE_TAIL); // limit is zero 3950 movl(cnt2, result); 3951 // Fallthru to tail compare 3952 } 3953 // Shift str2 and str1 to the end of the arrays, negate min 3954 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3955 lea(str1, Address(str1, cnt2, scale)); 3956 lea(str2, Address(str2, cnt2, scale)); 3957 } else { 3958 lea(str1, Address(str1, cnt2, scale1)); 3959 lea(str2, Address(str2, cnt2, scale2)); 3960 } 3961 decrementl(cnt2); // first character was compared already 3962 negptr(cnt2); 3963 3964 // Compare the rest of the elements 3965 bind(WHILE_HEAD_LABEL); 3966 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3967 subl(result, cnt1); 3968 jccb(Assembler::notZero, POP_LABEL); 3969 increment(cnt2); 3970 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3971 3972 // Strings are equal up to min length. Return the length difference. 
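  // The length difference pushed near the start of the method is popped into 'result';
  // for UU it is still a byte difference, which is why it is arithmetically halved below.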
3973 bind(LENGTH_DIFF_LABEL); 3974 pop(result); 3975 if (ae == StrIntrinsicNode::UU) { 3976 // Divide diff by 2 to get number of chars 3977 sarl(result, 1); 3978 } 3979 jmpb(DONE_LABEL); 3980 3981 #ifdef _LP64 3982 if (VM_Version::supports_avx512vlbw()) { 3983 3984 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3985 3986 kmovql(cnt1, mask); 3987 notq(cnt1); 3988 bsfq(cnt2, cnt1); 3989 if (ae != StrIntrinsicNode::LL) { 3990 // Divide diff by 2 to get number of chars 3991 sarl(cnt2, 1); 3992 } 3993 addq(result, cnt2); 3994 if (ae == StrIntrinsicNode::LL) { 3995 load_unsigned_byte(cnt1, Address(str2, result)); 3996 load_unsigned_byte(result, Address(str1, result)); 3997 } else if (ae == StrIntrinsicNode::UU) { 3998 load_unsigned_short(cnt1, Address(str2, result, scale)); 3999 load_unsigned_short(result, Address(str1, result, scale)); 4000 } else { 4001 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4002 load_unsigned_byte(result, Address(str1, result, scale1)); 4003 } 4004 subl(result, cnt1); 4005 jmpb(POP_LABEL); 4006 }//if (VM_Version::supports_avx512vlbw()) 4007 #endif // _LP64 4008 4009 // Discard the stored length difference 4010 bind(POP_LABEL); 4011 pop(cnt1); 4012 4013 // That's it 4014 bind(DONE_LABEL); 4015 if(ae == StrIntrinsicNode::UL) { 4016 negl(result); 4017 } 4018 4019 } 4020 4021 // Search for Non-ASCII character (Negative byte value) in a byte array, 4022 // return the index of the first such character, otherwise the length 4023 // of the array segment searched. 4024 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4025 // @IntrinsicCandidate 4026 // public static int countPositives(byte[] ba, int off, int len) { 4027 // for (int i = off; i < off + len; i++) { 4028 // if (ba[i] < 0) { 4029 // return i - off; 4030 // } 4031 // } 4032 // return len; 4033 // } 4034 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4035 Register result, Register tmp1, 4036 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4037 // rsi: byte array 4038 // rcx: len 4039 // rax: result 4040 ShortBranchVerifier sbv(this); 4041 assert_different_registers(ary1, len, result, tmp1); 4042 assert_different_registers(vec1, vec2); 4043 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4044 4045 movl(result, len); // copy 4046 // len == 0 4047 testl(len, len); 4048 jcc(Assembler::zero, DONE); 4049 4050 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4051 VM_Version::supports_avx512vlbw() && 4052 VM_Version::supports_bmi2()) { 4053 4054 Label test_64_loop, test_tail, BREAK_LOOP; 4055 movl(tmp1, len); 4056 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4057 4058 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4059 andl(len, 0xffffffc0); // vector count (in chars) 4060 jccb(Assembler::zero, test_tail); 4061 4062 lea(ary1, Address(ary1, len, Address::times_1)); 4063 negptr(len); 4064 4065 bind(test_64_loop); 4066 // Check whether our 64 elements of size byte contain negatives 4067 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4068 kortestql(mask1, mask1); 4069 jcc(Assembler::notZero, BREAK_LOOP); 4070 4071 addptr(len, 64); 4072 jccb(Assembler::notZero, test_64_loop); 4073 4074 bind(test_tail); 4075 // bail out when there is nothing to be done 4076 testl(tmp1, -1); 4077 jcc(Assembler::zero, DONE); 4078 4079 4080 // check the tail for absense of negatives 4081 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4082 #ifdef _LP64 4083 { 4084 
Register tmp3_aliased = len; 4085 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4086 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4087 notq(tmp3_aliased); 4088 kmovql(mask2, tmp3_aliased); 4089 } 4090 #else 4091 Label k_init; 4092 jmp(k_init); 4093 4094 // We could not read 64-bits from a general purpose register thus we move 4095 // data required to compose 64 1's to the instruction stream 4096 // We emit 64 byte wide series of elements from 0..63 which later on would 4097 // be used as a compare targets with tail count contained in tmp1 register. 4098 // Result would be a k register having tmp1 consecutive number or 1 4099 // counting from least significant bit. 4100 address tmp = pc(); 4101 emit_int64(0x0706050403020100); 4102 emit_int64(0x0F0E0D0C0B0A0908); 4103 emit_int64(0x1716151413121110); 4104 emit_int64(0x1F1E1D1C1B1A1918); 4105 emit_int64(0x2726252423222120); 4106 emit_int64(0x2F2E2D2C2B2A2928); 4107 emit_int64(0x3736353433323130); 4108 emit_int64(0x3F3E3D3C3B3A3938); 4109 4110 bind(k_init); 4111 lea(len, InternalAddress(tmp)); 4112 // create mask to test for negative byte inside a vector 4113 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4114 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4115 4116 #endif 4117 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4118 ktestq(mask1, mask2); 4119 jcc(Assembler::zero, DONE); 4120 4121 // do a full check for negative registers in the tail 4122 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4123 // ary1 already pointing to the right place 4124 jmpb(TAIL_START); 4125 4126 bind(BREAK_LOOP); 4127 // At least one byte in the last 64 byte block was negative. 4128 // Set up to look at the last 64 bytes as if they were a tail 4129 lea(ary1, Address(ary1, len, Address::times_1)); 4130 addptr(result, len); 4131 // Ignore the very last byte: if all others are positive, 4132 // it must be negative, so we can skip right to the 2+1 byte 4133 // end comparison at this point 4134 orl(result, 63); 4135 movl(len, 63); 4136 // Fallthru to tail compare 4137 } else { 4138 4139 if (UseAVX >= 2 && UseSSE >= 2) { 4140 // With AVX2, use 32-byte vector compare 4141 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4142 4143 // Compare 32-byte vectors 4144 testl(len, 0xffffffe0); // vector count (in bytes) 4145 jccb(Assembler::zero, TAIL_START); 4146 4147 andl(len, 0xffffffe0); 4148 lea(ary1, Address(ary1, len, Address::times_1)); 4149 negptr(len); 4150 4151 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4152 movdl(vec2, tmp1); 4153 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4154 4155 bind(COMPARE_WIDE_VECTORS); 4156 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4157 vptest(vec1, vec2); 4158 jccb(Assembler::notZero, BREAK_LOOP); 4159 addptr(len, 32); 4160 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4161 4162 testl(result, 0x0000001f); // any bytes remaining? 4163 jcc(Assembler::zero, DONE); 4164 4165 // Quick test using the already prepared vector mask 4166 movl(len, result); 4167 andl(len, 0x0000001f); 4168 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4169 vptest(vec1, vec2); 4170 jcc(Assembler::zero, DONE); 4171 // There are zeros, jump to the tail to determine exactly where 4172 jmpb(TAIL_START); 4173 4174 bind(BREAK_LOOP); 4175 // At least one byte in the last 32-byte vector is negative. 
4176 // Set up to look at the last 32 bytes as if they were a tail 4177 lea(ary1, Address(ary1, len, Address::times_1)); 4178 addptr(result, len); 4179 // Ignore the very last byte: if all others are positive, 4180 // it must be negative, so we can skip right to the 2+1 byte 4181 // end comparison at this point 4182 orl(result, 31); 4183 movl(len, 31); 4184 // Fallthru to tail compare 4185 } else if (UseSSE42Intrinsics) { 4186 // With SSE4.2, use double quad vector compare 4187 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4188 4189 // Compare 16-byte vectors 4190 testl(len, 0xfffffff0); // vector count (in bytes) 4191 jcc(Assembler::zero, TAIL_START); 4192 4193 andl(len, 0xfffffff0); 4194 lea(ary1, Address(ary1, len, Address::times_1)); 4195 negptr(len); 4196 4197 movl(tmp1, 0x80808080); 4198 movdl(vec2, tmp1); 4199 pshufd(vec2, vec2, 0); 4200 4201 bind(COMPARE_WIDE_VECTORS); 4202 movdqu(vec1, Address(ary1, len, Address::times_1)); 4203 ptest(vec1, vec2); 4204 jccb(Assembler::notZero, BREAK_LOOP); 4205 addptr(len, 16); 4206 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4207 4208 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4209 jcc(Assembler::zero, DONE); 4210 4211 // Quick test using the already prepared vector mask 4212 movl(len, result); 4213 andl(len, 0x0000000f); // tail count (in bytes) 4214 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4215 ptest(vec1, vec2); 4216 jcc(Assembler::zero, DONE); 4217 jmpb(TAIL_START); 4218 4219 bind(BREAK_LOOP); 4220 // At least one byte in the last 16-byte vector is negative. 4221 // Set up and look at the last 16 bytes as if they were a tail 4222 lea(ary1, Address(ary1, len, Address::times_1)); 4223 addptr(result, len); 4224 // Ignore the very last byte: if all others are positive, 4225 // it must be negative, so we can skip right to the 2+1 byte 4226 // end comparison at this point 4227 orl(result, 15); 4228 movl(len, 15); 4229 // Fallthru to tail compare 4230 } 4231 } 4232 4233 bind(TAIL_START); 4234 // Compare 4-byte vectors 4235 andl(len, 0xfffffffc); // vector count (in bytes) 4236 jccb(Assembler::zero, COMPARE_CHAR); 4237 4238 lea(ary1, Address(ary1, len, Address::times_1)); 4239 negptr(len); 4240 4241 bind(COMPARE_VECTORS); 4242 movl(tmp1, Address(ary1, len, Address::times_1)); 4243 andl(tmp1, 0x80808080); 4244 jccb(Assembler::notZero, TAIL_ADJUST); 4245 addptr(len, 4); 4246 jccb(Assembler::notZero, COMPARE_VECTORS); 4247 4248 // Compare trailing char (final 2-3 bytes), if any 4249 bind(COMPARE_CHAR); 4250 4251 testl(result, 0x2); // tail char 4252 jccb(Assembler::zero, COMPARE_BYTE); 4253 load_unsigned_short(tmp1, Address(ary1, 0)); 4254 andl(tmp1, 0x00008080); 4255 jccb(Assembler::notZero, CHAR_ADJUST); 4256 lea(ary1, Address(ary1, 2)); 4257 4258 bind(COMPARE_BYTE); 4259 testl(result, 0x1); // tail byte 4260 jccb(Assembler::zero, DONE); 4261 load_unsigned_byte(tmp1, Address(ary1, 0)); 4262 testl(tmp1, 0x00000080); 4263 jccb(Assembler::zero, DONE); 4264 subptr(result, 1); 4265 jmpb(DONE); 4266 4267 bind(TAIL_ADJUST); 4268 // there are negative bits in the last 4 byte block. 4269 // Adjust result and check the next three bytes 4270 addptr(result, len); 4271 orl(result, 3); 4272 lea(ary1, Address(ary1, len, Address::times_1)); 4273 jmpb(COMPARE_CHAR); 4274 4275 bind(CHAR_ADJUST); 4276 // We are looking at a char + optional byte tail, and found that one 4277 // of the bytes in the char is negative. Adjust the result, check the 4278 // first byte and readjust if needed. 
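  // Everything before this trailing char is already known to be positive, so the answer
  // is either the index of the char's first byte (the low byte on little-endian), or
  // that index plus one when only the second byte is negative.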
4279 andl(result, 0xfffffffc); 4280 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4281 jccb(Assembler::notZero, DONE); 4282 addptr(result, 1); 4283 4284 // That's it 4285 bind(DONE); 4286 if (UseAVX >= 2 && UseSSE >= 2) { 4287 // clean upper bits of YMM registers 4288 vpxor(vec1, vec1); 4289 vpxor(vec2, vec2); 4290 } 4291 } 4292 4293 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4294 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4295 Register limit, Register result, Register chr, 4296 XMMRegister vec1, XMMRegister vec2, bool is_char, 4297 KRegister mask, bool expand_ary2) { 4298 // for expand_ary2, limit is the (smaller) size of the second array. 4299 ShortBranchVerifier sbv(this); 4300 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4301 4302 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4303 "Expansion only implemented for AVX2"); 4304 4305 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4306 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4307 4308 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4309 int scaleIncr = expand_ary2 ? 8 : 16; 4310 4311 if (is_array_equ) { 4312 // Check the input args 4313 cmpoop(ary1, ary2); 4314 jcc(Assembler::equal, TRUE_LABEL); 4315 4316 // Need additional checks for arrays_equals. 4317 testptr(ary1, ary1); 4318 jcc(Assembler::zero, FALSE_LABEL); 4319 testptr(ary2, ary2); 4320 jcc(Assembler::zero, FALSE_LABEL); 4321 4322 // Check the lengths 4323 movl(limit, Address(ary1, length_offset)); 4324 cmpl(limit, Address(ary2, length_offset)); 4325 jcc(Assembler::notEqual, FALSE_LABEL); 4326 } 4327 4328 // count == 0 4329 testl(limit, limit); 4330 jcc(Assembler::zero, TRUE_LABEL); 4331 4332 if (is_array_equ) { 4333 // Load array address 4334 lea(ary1, Address(ary1, base_offset)); 4335 lea(ary2, Address(ary2, base_offset)); 4336 } 4337 4338 if (is_array_equ && is_char) { 4339 // arrays_equals when used for char[]. 
4340 shll(limit, 1); // byte count != 0 4341 } 4342 movl(result, limit); // copy 4343 4344 if (UseAVX >= 2) { 4345 // With AVX2, use 32-byte vector compare 4346 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4347 4348 // Compare 32-byte vectors 4349 if (expand_ary2) { 4350 andl(result, 0x0000000f); // tail count (in bytes) 4351 andl(limit, 0xfffffff0); // vector count (in bytes) 4352 jcc(Assembler::zero, COMPARE_TAIL); 4353 } else { 4354 andl(result, 0x0000001f); // tail count (in bytes) 4355 andl(limit, 0xffffffe0); // vector count (in bytes) 4356 jcc(Assembler::zero, COMPARE_TAIL_16); 4357 } 4358 4359 lea(ary1, Address(ary1, limit, scaleFactor)); 4360 lea(ary2, Address(ary2, limit, Address::times_1)); 4361 negptr(limit); 4362 4363 #ifdef _LP64 4364 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4365 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4366 4367 cmpl(limit, -64); 4368 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4369 4370 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4371 4372 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4373 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4374 kortestql(mask, mask); 4375 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4376 addptr(limit, 64); // update since we already compared at this addr 4377 cmpl(limit, -64); 4378 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4379 4380 // At this point we may still need to compare -limit+result bytes. 4381 // We could execute the next two instruction and just continue via non-wide path: 4382 // cmpl(limit, 0); 4383 // jcc(Assembler::equal, COMPARE_TAIL); // true 4384 // But since we stopped at the points ary{1,2}+limit which are 4385 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4386 // (|limit| <= 32 and result < 32), 4387 // we may just compare the last 64 bytes. 
4388 // 4389 addptr(result, -64); // it is safe, bc we just came from this area 4390 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4391 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4392 kortestql(mask, mask); 4393 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4394 4395 jmp(TRUE_LABEL); 4396 4397 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4398 4399 }//if (VM_Version::supports_avx512vlbw()) 4400 #endif //_LP64 4401 bind(COMPARE_WIDE_VECTORS); 4402 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4403 if (expand_ary2) { 4404 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4405 } else { 4406 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4407 } 4408 vpxor(vec1, vec2); 4409 4410 vptest(vec1, vec1); 4411 jcc(Assembler::notZero, FALSE_LABEL); 4412 addptr(limit, scaleIncr * 2); 4413 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4414 4415 testl(result, result); 4416 jcc(Assembler::zero, TRUE_LABEL); 4417 4418 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4419 if (expand_ary2) { 4420 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4421 } else { 4422 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4423 } 4424 vpxor(vec1, vec2); 4425 4426 vptest(vec1, vec1); 4427 jcc(Assembler::notZero, FALSE_LABEL); 4428 jmp(TRUE_LABEL); 4429 4430 bind(COMPARE_TAIL_16); // limit is zero 4431 movl(limit, result); 4432 4433 // Compare 16-byte chunks 4434 andl(result, 0x0000000f); // tail count (in bytes) 4435 andl(limit, 0xfffffff0); // vector count (in bytes) 4436 jcc(Assembler::zero, COMPARE_TAIL); 4437 4438 lea(ary1, Address(ary1, limit, scaleFactor)); 4439 lea(ary2, Address(ary2, limit, Address::times_1)); 4440 negptr(limit); 4441 4442 bind(COMPARE_WIDE_VECTORS_16); 4443 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4444 if (expand_ary2) { 4445 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4446 } else { 4447 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4448 } 4449 pxor(vec1, vec2); 4450 4451 ptest(vec1, vec1); 4452 jcc(Assembler::notZero, FALSE_LABEL); 4453 addptr(limit, scaleIncr); 4454 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4455 4456 bind(COMPARE_TAIL); // limit is zero 4457 movl(limit, result); 4458 // Fallthru to tail compare 4459 } else if (UseSSE42Intrinsics) { 4460 // With SSE4.2, use double quad vector compare 4461 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4462 4463 // Compare 16-byte vectors 4464 andl(result, 0x0000000f); // tail count (in bytes) 4465 andl(limit, 0xfffffff0); // vector count (in bytes) 4466 jcc(Assembler::zero, COMPARE_TAIL); 4467 4468 lea(ary1, Address(ary1, limit, Address::times_1)); 4469 lea(ary2, Address(ary2, limit, Address::times_1)); 4470 negptr(limit); 4471 4472 bind(COMPARE_WIDE_VECTORS); 4473 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4474 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4475 pxor(vec1, vec2); 4476 4477 ptest(vec1, vec1); 4478 jcc(Assembler::notZero, FALSE_LABEL); 4479 addptr(limit, 16); 4480 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4481 4482 testl(result, result); 4483 jcc(Assembler::zero, TRUE_LABEL); 4484 4485 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4486 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4487 pxor(vec1, vec2); 4488 4489 ptest(vec1, vec1); 4490 jccb(Assembler::notZero, FALSE_LABEL); 4491 jmpb(TRUE_LABEL); 4492 4493 bind(COMPARE_TAIL); // limit is zero 4494 
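    // Fewer than 16 bytes are left; 'result' still holds the tail count and becomes
    // the new 'limit' for the 4-byte / char / byte tail compares that follow.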
movl(limit, result); 4495 // Fallthru to tail compare 4496 } 4497 4498 // Compare 4-byte vectors 4499 if (expand_ary2) { 4500 testl(result, result); 4501 jccb(Assembler::zero, TRUE_LABEL); 4502 } else { 4503 andl(limit, 0xfffffffc); // vector count (in bytes) 4504 jccb(Assembler::zero, COMPARE_CHAR); 4505 } 4506 4507 lea(ary1, Address(ary1, limit, scaleFactor)); 4508 lea(ary2, Address(ary2, limit, Address::times_1)); 4509 negptr(limit); 4510 4511 bind(COMPARE_VECTORS); 4512 if (expand_ary2) { 4513 // There are no "vector" operations for bytes to shorts 4514 movzbl(chr, Address(ary2, limit, Address::times_1)); 4515 cmpw(Address(ary1, limit, Address::times_2), chr); 4516 jccb(Assembler::notEqual, FALSE_LABEL); 4517 addptr(limit, 1); 4518 jcc(Assembler::notZero, COMPARE_VECTORS); 4519 jmp(TRUE_LABEL); 4520 } else { 4521 movl(chr, Address(ary1, limit, Address::times_1)); 4522 cmpl(chr, Address(ary2, limit, Address::times_1)); 4523 jccb(Assembler::notEqual, FALSE_LABEL); 4524 addptr(limit, 4); 4525 jcc(Assembler::notZero, COMPARE_VECTORS); 4526 } 4527 4528 // Compare trailing char (final 2 bytes), if any 4529 bind(COMPARE_CHAR); 4530 testl(result, 0x2); // tail char 4531 jccb(Assembler::zero, COMPARE_BYTE); 4532 load_unsigned_short(chr, Address(ary1, 0)); 4533 load_unsigned_short(limit, Address(ary2, 0)); 4534 cmpl(chr, limit); 4535 jccb(Assembler::notEqual, FALSE_LABEL); 4536 4537 if (is_array_equ && is_char) { 4538 bind(COMPARE_BYTE); 4539 } else { 4540 lea(ary1, Address(ary1, 2)); 4541 lea(ary2, Address(ary2, 2)); 4542 4543 bind(COMPARE_BYTE); 4544 testl(result, 0x1); // tail byte 4545 jccb(Assembler::zero, TRUE_LABEL); 4546 load_unsigned_byte(chr, Address(ary1, 0)); 4547 load_unsigned_byte(limit, Address(ary2, 0)); 4548 cmpl(chr, limit); 4549 jccb(Assembler::notEqual, FALSE_LABEL); 4550 } 4551 bind(TRUE_LABEL); 4552 movl(result, 1); // return true 4553 jmpb(DONE); 4554 4555 bind(FALSE_LABEL); 4556 xorl(result, result); // return false 4557 4558 // That's it 4559 bind(DONE); 4560 if (UseAVX >= 2) { 4561 // clean upper bits of YMM registers 4562 vpxor(vec1, vec1); 4563 vpxor(vec2, vec2); 4564 } 4565 } 4566 4567 #ifdef _LP64 4568 4569 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4570 #define __ masm. 
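  // Out-of-line fixup path: the truncating cvttss2si/cvttsd2si used on the fast path
  // produce the "integer indefinite" value (0x80000000 / 0x8000000000000000) for NaN
  // and out-of-range inputs, so convertF2I() branches here when it sees that value.
  // The source is spilled to the stack, the called fixup leaves the Java-semantics
  // result (0 for NaN, MIN/MAX on overflow) in that slot, and it is popped into dst.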
4571 Register dst = stub.data<0>(); 4572 XMMRegister src = stub.data<1>(); 4573 address target = stub.data<2>(); 4574 __ bind(stub.entry()); 4575 __ subptr(rsp, 8); 4576 __ movdbl(Address(rsp), src); 4577 __ call(RuntimeAddress(target)); 4578 __ pop(dst); 4579 __ jmp(stub.continuation()); 4580 #undef __ 4581 } 4582 4583 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4584 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4585 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4586 4587 address slowpath_target; 4588 if (dst_bt == T_INT) { 4589 if (src_bt == T_FLOAT) { 4590 cvttss2sil(dst, src); 4591 cmpl(dst, 0x80000000); 4592 slowpath_target = StubRoutines::x86::f2i_fixup(); 4593 } else { 4594 cvttsd2sil(dst, src); 4595 cmpl(dst, 0x80000000); 4596 slowpath_target = StubRoutines::x86::d2i_fixup(); 4597 } 4598 } else { 4599 if (src_bt == T_FLOAT) { 4600 cvttss2siq(dst, src); 4601 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4602 slowpath_target = StubRoutines::x86::f2l_fixup(); 4603 } else { 4604 cvttsd2siq(dst, src); 4605 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4606 slowpath_target = StubRoutines::x86::d2l_fixup(); 4607 } 4608 } 4609 4610 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4611 jcc(Assembler::equal, stub->entry()); 4612 bind(stub->continuation()); 4613 } 4614 4615 #endif // _LP64 4616 4617 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4618 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4619 switch(ideal_opc) { 4620 case Op_LShiftVS: 4621 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4622 case Op_LShiftVI: 4623 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4624 case Op_LShiftVL: 4625 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4626 case Op_RShiftVS: 4627 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4628 case Op_RShiftVI: 4629 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4630 case Op_RShiftVL: 4631 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4632 case Op_URShiftVS: 4633 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4634 case Op_URShiftVI: 4635 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4636 case Op_URShiftVL: 4637 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4638 case Op_RotateRightV: 4639 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4640 case Op_RotateLeftV: 4641 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4642 default: 4643 fatal("Unsupported masked operation"); break; 4644 } 4645 } 4646 4647 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4648 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4649 bool is_varshift) { 4650 switch (ideal_opc) { 4651 case Op_AddVB: 4652 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4653 case Op_AddVS: 4654 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4655 case Op_AddVI: 4656 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4657 case Op_AddVL: 4658 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4659 case Op_AddVF: 4660 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4661 case Op_AddVD: 4662 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4663 case Op_SubVB: 4664 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4665 case Op_SubVS: 4666 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4667 case Op_SubVI: 4668 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4669 case Op_SubVL: 4670 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4671 case Op_SubVF: 4672 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4673 case Op_SubVD: 4674 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4675 case Op_MulVS: 4676 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4677 case Op_MulVI: 4678 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4679 case Op_MulVL: 4680 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4681 case Op_MulVF: 4682 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4683 case Op_MulVD: 4684 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4685 case Op_DivVF: 4686 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4687 case Op_DivVD: 4688 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4689 case Op_SqrtVF: 4690 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4691 case Op_SqrtVD: 4692 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4693 case Op_AbsVB: 4694 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4695 case Op_AbsVS: 4696 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4697 case Op_AbsVI: 4698 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4699 case Op_AbsVL: 4700 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4701 case Op_FmaVF: 4702 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4703 case Op_FmaVD: 4704 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4705 case Op_VectorRearrange: 4706 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4707 case Op_LShiftVS: 4708 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4709 case Op_LShiftVI: 4710 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4711 case Op_LShiftVL: 4712 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4713 case Op_RShiftVS: 4714 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4715 case Op_RShiftVI: 4716 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4717 case Op_RShiftVL: 4718 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4719 case Op_URShiftVS: 4720 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4721 case Op_URShiftVI: 4722 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4723 case Op_URShiftVL: 4724 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4725 case Op_RotateLeftV: 4726 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4727 case Op_RotateRightV: 4728 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4729 case Op_MaxV: 4730 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4731 case Op_MinV: 4732 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4733 case Op_XorV: 4734 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4735 case Op_OrV: 4736 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4737 case Op_AndV: 4738 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4739 default: 4740 fatal("Unsupported masked operation"); break; 4741 } 4742 } 4743 4744 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4745 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4746 switch (ideal_opc) { 4747 case Op_AddVB: 
4748 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_AddVS: 4750 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_AddVI: 4752 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_AddVL: 4754 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_AddVF: 4756 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_AddVD: 4758 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4759 case Op_SubVB: 4760 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_SubVS: 4762 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_SubVI: 4764 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_SubVL: 4766 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_SubVF: 4768 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_SubVD: 4770 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_MulVS: 4772 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_MulVI: 4774 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_MulVL: 4776 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_MulVF: 4778 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_MulVD: 4780 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4781 case Op_DivVF: 4782 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_DivVD: 4784 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_FmaVF: 4786 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_FmaVD: 4788 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_MaxV: 4790 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_MinV: 4792 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_XorV: 4794 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_OrV: 4796 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_AndV: 4798 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4799 default: 4800 fatal("Unsupported masked operation"); break; 4801 } 4802 } 4803 4804 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4805 KRegister src1, KRegister src2) { 4806 BasicType etype = T_ILLEGAL; 4807 switch(mask_len) { 4808 case 2: 4809 case 4: 4810 case 8: etype = T_BYTE; break; 4811 case 16: etype = T_SHORT; break; 4812 case 32: etype = T_INT; break; 4813 case 64: etype = T_LONG; break; 4814 default: fatal("Unsupported type"); break; 4815 } 4816 assert(etype != T_ILLEGAL, ""); 4817 switch(ideal_opc) { 4818 case Op_AndVMask: 4819 kand(etype, dst, src1, src2); break; 4820 case Op_OrVMask: 4821 kor(etype, dst, src1, src2); break; 4822 case Op_XorVMask: 4823 kxor(etype, dst, src1, src2); break; 4824 default: 4825 fatal("Unsupported masked operation"); break; 4826 } 4827 } 4828 4829 /* 4830 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4831 * If src is NaN, the result is 0. 4832 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4833 * the result is equal to the value of Integer.MIN_VALUE. 4834 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4835 * the result is equal to the value of Integer.MAX_VALUE. 
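 *
 * For reference, a minimal scalar sketch of these semantics (illustrative only, not part of the
 * generated code; jint/jfloat and min_jint/max_jint are the usual HotSpot typedefs/constants):
 *
 *   jint cast_float_to_int(jfloat f) {
 *     if (f != f)                 return 0;         // NaN
 *     if (f <= (jfloat) min_jint) return min_jint;  // -Inf or too small
 *     if (f >= (jfloat) max_jint) return max_jint;  // +Inf or too large
 *     return (jint) f;
 *   }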
4836 */ 4837 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4838 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4839 Register rscratch, AddressLiteral float_sign_flip, 4840 int vec_enc) { 4841 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4842 Label done; 4843 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4844 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4845 vptest(xtmp2, xtmp2, vec_enc); 4846 jccb(Assembler::equal, done); 4847 4848 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4849 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4850 4851 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4852 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4853 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4854 4855 // Recompute the mask for remaining special value. 4856 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4857 // Extract SRC values corresponding to TRUE mask lanes. 4858 vpand(xtmp4, xtmp2, src, vec_enc); 4859 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4860 // values are set. 4861 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4862 4863 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4864 bind(done); 4865 } 4866 4867 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4868 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4869 Register rscratch, AddressLiteral float_sign_flip, 4870 int vec_enc) { 4871 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4872 Label done; 4873 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4874 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4875 kortestwl(ktmp1, ktmp1); 4876 jccb(Assembler::equal, done); 4877 4878 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4879 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4880 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4881 4882 kxorwl(ktmp1, ktmp1, ktmp2); 4883 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4884 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4885 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4886 bind(done); 4887 } 4888 4889 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4890 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4891 Register rscratch, AddressLiteral double_sign_flip, 4892 int vec_enc) { 4893 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4894 4895 Label done; 4896 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4897 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4898 kortestwl(ktmp1, ktmp1); 4899 jccb(Assembler::equal, done); 4900 4901 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4902 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4903 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4904 4905 kxorwl(ktmp1, ktmp1, ktmp2); 4906 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4907 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4908 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4909 bind(done); 4910 } 4911 4912 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4913 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4914 Register rscratch, AddressLiteral float_sign_flip, 4915 int vec_enc) { 4916 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4917 Label done; 4918 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
             rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

/*
 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
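  // Note (illustrative): vector_crosslane_doubleword_pack_avx below uses vshufps with immediate
  // 0x88 (binary 10'00'10'00), which per 128-bit lane selects
  //
  //   result = { a[0], a[2], b[0], b[2] }   // i.e. the low doubleword of each quadword
  //
  // so the packed per-quadword mask ends up in the low 128 bits, ready for the 128-bit
  // vblendvps/pand operations that follow.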
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}


void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "%s", type2name(to_elem_bt));
  }
}

/*
 * Algorithm for vector D2L and F2I conversions:-
 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contain the 0x80000000 value.
 *    A 0x80000000 result lane signifies that the source value could be any of the special
 *    floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
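 *
 * A rough scalar model of steps b)-d) for the F2I case (illustrative only; the vector code
 * applies the same fix-up per lane with compares and blends, and min_jint == (jint)0x80000000):
 *
 *   jint fixup_f2i(jfloat src, jint cast_result) {
 *     if (cast_result != min_jint) return cast_result;  // b) fast path: no special value
 *     if (src != src)              return 0;            // c) NaN
 *     if (src >= 0.0f)             return max_jint;     // d) +Inf or too-large positive value
 *     return min_jint;                                   // -Inf or too-small value stays as-is
 *   }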
5049 */ 5050 5051 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5052 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5053 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5054 int to_elem_sz = type2aelembytes(to_elem_bt); 5055 assert(to_elem_sz <= 4, ""); 5056 vcvttps2dq(dst, src, vec_enc); 5057 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5058 if (to_elem_sz < 4) { 5059 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5060 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5061 } 5062 } 5063 5064 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5065 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5066 Register rscratch, int vec_enc) { 5067 int to_elem_sz = type2aelembytes(to_elem_bt); 5068 assert(to_elem_sz <= 4, ""); 5069 vcvttps2dq(dst, src, vec_enc); 5070 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5071 switch(to_elem_bt) { 5072 case T_INT: 5073 break; 5074 case T_SHORT: 5075 evpmovdw(dst, dst, vec_enc); 5076 break; 5077 case T_BYTE: 5078 evpmovdb(dst, dst, vec_enc); 5079 break; 5080 default: assert(false, "%s", type2name(to_elem_bt)); 5081 } 5082 } 5083 5084 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5085 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5086 Register rscratch, int vec_enc) { 5087 evcvttps2qq(dst, src, vec_enc); 5088 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5089 } 5090 5091 // Handling for downcasting from double to integer or sub-word types on AVX2. 5092 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5093 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5094 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5095 int to_elem_sz = type2aelembytes(to_elem_bt); 5096 assert(to_elem_sz < 8, ""); 5097 vcvttpd2dq(dst, src, vec_enc); 5098 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5099 float_sign_flip, vec_enc); 5100 if (to_elem_sz < 4) { 5101 // xtmp4 holds all zero lanes. 
5102 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5103 } 5104 } 5105 5106 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5107 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5108 KRegister ktmp2, AddressLiteral sign_flip, 5109 Register rscratch, int vec_enc) { 5110 if (VM_Version::supports_avx512dq()) { 5111 evcvttpd2qq(dst, src, vec_enc); 5112 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5113 switch(to_elem_bt) { 5114 case T_LONG: 5115 break; 5116 case T_INT: 5117 evpmovsqd(dst, dst, vec_enc); 5118 break; 5119 case T_SHORT: 5120 evpmovsqd(dst, dst, vec_enc); 5121 evpmovdw(dst, dst, vec_enc); 5122 break; 5123 case T_BYTE: 5124 evpmovsqd(dst, dst, vec_enc); 5125 evpmovdb(dst, dst, vec_enc); 5126 break; 5127 default: assert(false, "%s", type2name(to_elem_bt)); 5128 } 5129 } else { 5130 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5131 vcvttpd2dq(dst, src, vec_enc); 5132 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5133 switch(to_elem_bt) { 5134 case T_INT: 5135 break; 5136 case T_SHORT: 5137 evpmovdw(dst, dst, vec_enc); 5138 break; 5139 case T_BYTE: 5140 evpmovdb(dst, dst, vec_enc); 5141 break; 5142 default: assert(false, "%s", type2name(to_elem_bt)); 5143 } 5144 } 5145 } 5146 5147 #ifdef _LP64 5148 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5149 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5150 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5151 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5152 // and re-instantiate original MXCSR.RC mode after that. 5153 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5154 5155 mov64(tmp, julong_cast(0.5L)); 5156 evpbroadcastq(xtmp1, tmp, vec_enc); 5157 vaddpd(xtmp1, src , xtmp1, vec_enc); 5158 evcvtpd2qq(dst, xtmp1, vec_enc); 5159 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5160 double_sign_flip, vec_enc);; 5161 5162 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5163 } 5164 5165 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5166 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5167 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5168 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5169 // and re-instantiate original MXCSR.RC mode after that. 
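  // Note (illustrative): with MXCSR.RC forced to round-toward-negative-infinity, the vcvtps2dq
  // below computes floor(x + 0.5) in a single conversion, i.e. the round-half-up behaviour
  // specified for java.lang.Math.round. A scalar sketch of the intent:
  //
  //   jint round_float(jfloat x) { return (jint) floor(x + 0.5f); }   // special values fixed up afterwards
  //
  // NaN, +/-Inf and out-of-range results are then repaired by
  // vector_cast_float_to_int_special_cases_evex, and the original MXCSR.RC mode is restored.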
5170 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5171 5172 movl(tmp, jint_cast(0.5)); 5173 movq(xtmp1, tmp); 5174 vbroadcastss(xtmp1, xtmp1, vec_enc); 5175 vaddps(xtmp1, src , xtmp1, vec_enc); 5176 vcvtps2dq(dst, xtmp1, vec_enc); 5177 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5178 float_sign_flip, vec_enc); 5179 5180 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5181 } 5182 5183 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5184 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5185 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5186 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5187 // and re-instantiate original MXCSR.RC mode after that. 5188 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5189 5190 movl(tmp, jint_cast(0.5)); 5191 movq(xtmp1, tmp); 5192 vbroadcastss(xtmp1, xtmp1, vec_enc); 5193 vaddps(xtmp1, src , xtmp1, vec_enc); 5194 vcvtps2dq(dst, xtmp1, vec_enc); 5195 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5196 5197 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5198 } 5199 #endif // _LP64 5200 5201 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5202 BasicType from_elem_bt, BasicType to_elem_bt) { 5203 switch (from_elem_bt) { 5204 case T_BYTE: 5205 switch (to_elem_bt) { 5206 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5207 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5208 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5209 default: ShouldNotReachHere(); 5210 } 5211 break; 5212 case T_SHORT: 5213 switch (to_elem_bt) { 5214 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5215 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5216 default: ShouldNotReachHere(); 5217 } 5218 break; 5219 case T_INT: 5220 assert(to_elem_bt == T_LONG, ""); 5221 vpmovzxdq(dst, src, vlen_enc); 5222 break; 5223 default: 5224 ShouldNotReachHere(); 5225 } 5226 } 5227 5228 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5229 BasicType from_elem_bt, BasicType to_elem_bt) { 5230 switch (from_elem_bt) { 5231 case T_BYTE: 5232 switch (to_elem_bt) { 5233 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5234 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5235 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5236 default: ShouldNotReachHere(); 5237 } 5238 break; 5239 case T_SHORT: 5240 switch (to_elem_bt) { 5241 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5242 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5243 default: ShouldNotReachHere(); 5244 } 5245 break; 5246 case T_INT: 5247 assert(to_elem_bt == T_LONG, ""); 5248 vpmovsxdq(dst, src, vlen_enc); 5249 break; 5250 default: 5251 ShouldNotReachHere(); 5252 } 5253 } 5254 5255 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5256 BasicType dst_bt, BasicType src_bt, int vlen) { 5257 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5258 assert(vlen_enc != AVX_512bit, ""); 5259 5260 int dst_bt_size = type2aelembytes(dst_bt); 5261 int src_bt_size = type2aelembytes(src_bt); 5262 if (dst_bt_size > src_bt_size) { 5263 switch (dst_bt_size / src_bt_size) { 5264 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5265 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5266 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5267 default: ShouldNotReachHere(); 5268 } 5269 } else { 5270 assert(dst_bt_size < src_bt_size, ""); 5271 switch (src_bt_size / dst_bt_size) { 5272 case 2: { 5273 if (vlen_enc == AVX_128bit) { 5274 vpacksswb(dst, src, src, vlen_enc); 5275 } else { 5276 vpacksswb(dst, src, src, vlen_enc); 5277 vpermq(dst, dst, 0x08, vlen_enc); 5278 } 5279 break; 5280 } 5281 case 4: { 5282 if (vlen_enc == AVX_128bit) { 5283 vpackssdw(dst, src, src, vlen_enc); 5284 vpacksswb(dst, dst, dst, vlen_enc); 5285 } else { 5286 vpackssdw(dst, src, src, vlen_enc); 5287 vpermq(dst, dst, 0x08, vlen_enc); 5288 vpacksswb(dst, dst, dst, AVX_128bit); 5289 } 5290 break; 5291 } 5292 case 8: { 5293 if (vlen_enc == AVX_128bit) { 5294 vpshufd(dst, src, 0x08, vlen_enc); 5295 vpackssdw(dst, dst, dst, vlen_enc); 5296 vpacksswb(dst, dst, dst, vlen_enc); 5297 } else { 5298 vpshufd(dst, src, 0x08, vlen_enc); 5299 vpermq(dst, dst, 0x08, vlen_enc); 5300 vpackssdw(dst, dst, dst, AVX_128bit); 5301 vpacksswb(dst, dst, dst, AVX_128bit); 5302 } 5303 break; 5304 } 5305 default: ShouldNotReachHere(); 5306 } 5307 } 5308 } 5309 5310 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5311 bool merge, BasicType bt, int vlen_enc) { 5312 if (bt == T_INT) { 5313 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5314 } else { 5315 assert(bt == T_LONG, ""); 5316 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5317 } 5318 } 5319 5320 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5321 bool merge, BasicType bt, int vlen_enc) { 5322 if (bt == T_INT) { 5323 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5324 } else { 5325 assert(bt == T_LONG, ""); 5326 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5327 } 5328 } 5329 5330 #ifdef _LP64 5331 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5332 Register rtmp2, XMMRegister xtmp, int mask_len, 5333 int vec_enc) { 5334 int index = 0; 5335 int vindex = 0; 5336 mov64(rtmp1, 0x0101010101010101L); 5337 pdepq(rtmp1, src, rtmp1); 5338 if (mask_len > 8) { 5339 movq(rtmp2, src); 5340 vpxor(xtmp, xtmp, xtmp, vec_enc); 5341 movq(xtmp, rtmp1); 5342 } 5343 movq(dst, rtmp1); 5344 5345 mask_len -= 8; 5346 while (mask_len > 0) { 5347 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5348 index++; 5349 if ((index % 2) == 0) { 5350 pxor(xtmp, xtmp); 5351 } 5352 mov64(rtmp1, 0x0101010101010101L); 5353 shrq(rtmp2, 8); 5354 pdepq(rtmp1, rtmp2, rtmp1); 5355 pinsrq(xtmp, rtmp1, index % 2); 5356 vindex = index / 2; 5357 if (vindex) { 5358 // Write entire 16 byte vector when both 64 bit 5359 // lanes are update to save redundant instructions. 
5360 if (index % 2) { 5361 vinsertf128(dst, dst, xtmp, vindex); 5362 } 5363 } else { 5364 vmovdqu(dst, xtmp); 5365 } 5366 mask_len -= 8; 5367 } 5368 } 5369 5370 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5371 switch(opc) { 5372 case Op_VectorMaskTrueCount: 5373 popcntq(dst, tmp); 5374 break; 5375 case Op_VectorMaskLastTrue: 5376 if (VM_Version::supports_lzcnt()) { 5377 lzcntq(tmp, tmp); 5378 movl(dst, 63); 5379 subl(dst, tmp); 5380 } else { 5381 movl(dst, -1); 5382 bsrq(tmp, tmp); 5383 cmov32(Assembler::notZero, dst, tmp); 5384 } 5385 break; 5386 case Op_VectorMaskFirstTrue: 5387 if (VM_Version::supports_bmi1()) { 5388 if (masklen < 32) { 5389 orl(tmp, 1 << masklen); 5390 tzcntl(dst, tmp); 5391 } else if (masklen == 32) { 5392 tzcntl(dst, tmp); 5393 } else { 5394 assert(masklen == 64, ""); 5395 tzcntq(dst, tmp); 5396 } 5397 } else { 5398 if (masklen < 32) { 5399 orl(tmp, 1 << masklen); 5400 bsfl(dst, tmp); 5401 } else { 5402 assert(masklen == 32 || masklen == 64, ""); 5403 movl(dst, masklen); 5404 if (masklen == 32) { 5405 bsfl(tmp, tmp); 5406 } else { 5407 bsfq(tmp, tmp); 5408 } 5409 cmov32(Assembler::notZero, dst, tmp); 5410 } 5411 } 5412 break; 5413 case Op_VectorMaskToLong: 5414 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5415 break; 5416 default: assert(false, "Unhandled mask operation"); 5417 } 5418 } 5419 5420 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5421 int masklen, int masksize, int vec_enc) { 5422 assert(VM_Version::supports_popcnt(), ""); 5423 5424 if(VM_Version::supports_avx512bw()) { 5425 kmovql(tmp, mask); 5426 } else { 5427 assert(masklen <= 16, ""); 5428 kmovwl(tmp, mask); 5429 } 5430 5431 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5432 // operations needs to be clipped. 5433 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5434 andq(tmp, (1 << masklen) - 1); 5435 } 5436 5437 vector_mask_operation_helper(opc, dst, tmp, masklen); 5438 } 5439 5440 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5441 Register tmp, int masklen, BasicType bt, int vec_enc) { 5442 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5443 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5444 assert(VM_Version::supports_popcnt(), ""); 5445 5446 bool need_clip = false; 5447 switch(bt) { 5448 case T_BOOLEAN: 5449 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5450 vpxor(xtmp, xtmp, xtmp, vec_enc); 5451 vpsubb(xtmp, xtmp, mask, vec_enc); 5452 vpmovmskb(tmp, xtmp, vec_enc); 5453 need_clip = masklen < 16; 5454 break; 5455 case T_BYTE: 5456 vpmovmskb(tmp, mask, vec_enc); 5457 need_clip = masklen < 16; 5458 break; 5459 case T_SHORT: 5460 vpacksswb(xtmp, mask, mask, vec_enc); 5461 if (masklen >= 16) { 5462 vpermpd(xtmp, xtmp, 8, vec_enc); 5463 } 5464 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5465 need_clip = masklen < 16; 5466 break; 5467 case T_INT: 5468 case T_FLOAT: 5469 vmovmskps(tmp, mask, vec_enc); 5470 need_clip = masklen < 4; 5471 break; 5472 case T_LONG: 5473 case T_DOUBLE: 5474 vmovmskpd(tmp, mask, vec_enc); 5475 need_clip = masklen < 2; 5476 break; 5477 default: assert(false, "Unhandled type, %s", type2name(bt)); 5478 } 5479 5480 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5481 // operations needs to be clipped. 
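  // Clipping keeps only the low 'masklen' bits: the mask-extraction instructions above can set
  // bits beyond the logical mask length (e.g. vpmovmskb always produces 16 bits per 128-bit
  // lane). In scalar form:
  //
  //   tmp &= (1 << masklen) - 1;   // only valid for masklen < 32, hence the need_clip guard
  //
  // Op_VectorMaskFirstTrue is exempt because vector_mask_operation_helper ORs in a sentinel bit
  // at position 'masklen', so stray higher bits cannot change its result.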
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}

void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with the zero vector using the permute mask. Each column entry in a
  // permute table row contains either a valid permute index or -1 (the default), so the same
  // row can be used as a blending mask after compressing/expanding the source vector lanes.
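  // Illustrative example (assumed table layout, 32-bit lanes): for mask 0b0101 a compress row
  // would be { 0, 2, -1, -1, ... }. vpermps gathers lanes {0, 2} to the front, and because the
  // -1 entries have their sign bit set, the very same row drives the vblendvps below to zero
  // the remaining lanes. The combined effect, in scalar form:
  //
  //   dst[i] = (permv[i] < 0) ? 0 : src[permv[i]];
  //
  // The expand table is built analogously, with the roles of source and destination indices
  // swapped.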
5530 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5531 } 5532 5533 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5534 bool merge, BasicType bt, int vec_enc) { 5535 if (opcode == Op_CompressV) { 5536 switch(bt) { 5537 case T_BYTE: 5538 evpcompressb(dst, mask, src, merge, vec_enc); 5539 break; 5540 case T_CHAR: 5541 case T_SHORT: 5542 evpcompressw(dst, mask, src, merge, vec_enc); 5543 break; 5544 case T_INT: 5545 evpcompressd(dst, mask, src, merge, vec_enc); 5546 break; 5547 case T_FLOAT: 5548 evcompressps(dst, mask, src, merge, vec_enc); 5549 break; 5550 case T_LONG: 5551 evpcompressq(dst, mask, src, merge, vec_enc); 5552 break; 5553 case T_DOUBLE: 5554 evcompresspd(dst, mask, src, merge, vec_enc); 5555 break; 5556 default: 5557 fatal("Unsupported type %s", type2name(bt)); 5558 break; 5559 } 5560 } else { 5561 assert(opcode == Op_ExpandV, ""); 5562 switch(bt) { 5563 case T_BYTE: 5564 evpexpandb(dst, mask, src, merge, vec_enc); 5565 break; 5566 case T_CHAR: 5567 case T_SHORT: 5568 evpexpandw(dst, mask, src, merge, vec_enc); 5569 break; 5570 case T_INT: 5571 evpexpandd(dst, mask, src, merge, vec_enc); 5572 break; 5573 case T_FLOAT: 5574 evexpandps(dst, mask, src, merge, vec_enc); 5575 break; 5576 case T_LONG: 5577 evpexpandq(dst, mask, src, merge, vec_enc); 5578 break; 5579 case T_DOUBLE: 5580 evexpandpd(dst, mask, src, merge, vec_enc); 5581 break; 5582 default: 5583 fatal("Unsupported type %s", type2name(bt)); 5584 break; 5585 } 5586 } 5587 } 5588 #endif 5589 5590 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5591 KRegister ktmp1, int vec_enc) { 5592 if (opcode == Op_SignumVD) { 5593 vsubpd(dst, zero, one, vec_enc); 5594 // if src < 0 ? -1 : 1 5595 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5596 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5597 // if src == NaN, -0.0 or 0.0 return src. 5598 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5599 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5600 } else { 5601 assert(opcode == Op_SignumVF, ""); 5602 vsubps(dst, zero, one, vec_enc); 5603 // if src < 0 ? -1 : 1 5604 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5605 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5606 // if src == NaN, -0.0 or 0.0 return src. 5607 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5608 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5609 } 5610 } 5611 5612 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5613 XMMRegister xtmp1, int vec_enc) { 5614 if (opcode == Op_SignumVD) { 5615 vsubpd(dst, zero, one, vec_enc); 5616 // if src < 0 ? -1 : 1 5617 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5618 // if src == NaN, -0.0 or 0.0 return src. 5619 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5620 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5621 } else { 5622 assert(opcode == Op_SignumVF, ""); 5623 vsubps(dst, zero, one, vec_enc); 5624 // if src < 0 ? -1 : 1 5625 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5626 // if src == NaN, -0.0 or 0.0 return src. 
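  // Note (illustrative): EQ_UQ is an "equal or unordered" compare, so the predicate below is
  // true both for +/-0.0 (which compare equal to zero) and for NaN (unordered); those lanes
  // then keep src via the final blend. In scalar form the routine computes roughly:
  //
  //   jfloat signum(jfloat x) { return (x != x || x == 0.0f) ? x : (x < 0.0f ? -1.0f : 1.0f); }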
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  }
}

//
// Following is the lookup table based popcount computation algorithm:-
//           Index   Bit set count
//     [ 0000 -> 0,
//       0001 -> 1,
//       0010 -> 1,
//       0011 -> 2,
//       0100 -> 1,
//       0101 -> 2,
//       0110 -> 2,
//       0111 -> 3,
//       1000 -> 1,
//       1001 -> 2,
//       1010 -> 2,
//       1011 -> 3,
//       1100 -> 2,
//       1101 -> 3,
//       1110 -> 3,
//       1111 -> 4 ]
// a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of vector lane by 4 positions.
// c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
//    count of all the bytes of a quadword.
// f. Perform step e. for upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.
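//
// For reference, a scalar sketch of steps a.-d. (illustrative only; the vector code performs
// the 16-entry table lookup per byte with vpshufb):
//
//   static const uint8_t LUT[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte(uint8_t b) { return LUT[b & 0x0F] + LUT[b >> 4]; }
//
// Steps e.-g. then widen the per-byte counts: each doubleword is unpacked against zero and
// vpsadbw adds up its four byte counts, after which the results are repacked to doublewords.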
5709 5710 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5711 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5712 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5713 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5714 vpsrlw(dst, src, 4, vec_enc); 5715 vpand(dst, dst, xtmp1, vec_enc); 5716 vpand(xtmp1, src, xtmp1, vec_enc); 5717 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5718 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5719 vpshufb(dst, xtmp2, dst, vec_enc); 5720 vpaddb(dst, dst, xtmp1, vec_enc); 5721 } 5722 5723 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5724 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5725 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5726 // Following code is as per steps e,f,g and h of above algorithm. 5727 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5728 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5729 vpsadbw(dst, dst, xtmp2, vec_enc); 5730 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5731 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5732 vpackuswb(dst, xtmp1, dst, vec_enc); 5733 } 5734 5735 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5736 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5737 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5738 // Add the popcount of upper and lower bytes of word. 5739 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5740 vpsrlw(dst, xtmp1, 8, vec_enc); 5741 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5742 vpaddw(dst, dst, xtmp1, vec_enc); 5743 } 5744 5745 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5746 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5747 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5748 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5749 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5750 } 5751 5752 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5753 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5754 switch(bt) { 5755 case T_LONG: 5756 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5757 break; 5758 case T_INT: 5759 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5760 break; 5761 case T_CHAR: 5762 case T_SHORT: 5763 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5764 break; 5765 case T_BYTE: 5766 case T_BOOLEAN: 5767 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5768 break; 5769 default: 5770 fatal("Unsupported type %s", type2name(bt)); 5771 break; 5772 } 5773 } 5774 5775 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5776 KRegister mask, bool merge, int vec_enc) { 5777 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5778 switch(bt) { 5779 case T_LONG: 5780 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5781 evpopcntq(dst, mask, src, merge, vec_enc); 5782 break; 5783 case T_INT: 5784 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5785 evpopcntd(dst, mask, src, merge, vec_enc); 5786 break; 5787 case T_CHAR: 5788 case T_SHORT: 5789 assert(VM_Version::supports_avx512_bitalg(), ""); 5790 evpopcntw(dst, mask, src, merge, vec_enc); 5791 break; 5792 case T_BYTE: 5793 case T_BOOLEAN: 5794 assert(VM_Version::supports_avx512_bitalg(), ""); 5795 evpopcntb(dst, mask, 
src, merge, vec_enc); 5796 break; 5797 default: 5798 fatal("Unsupported type %s", type2name(bt)); 5799 break; 5800 } 5801 } 5802 5803 #ifndef _LP64 5804 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5805 assert(VM_Version::supports_avx512bw(), ""); 5806 kmovdl(tmp, src); 5807 kunpckdql(dst, tmp, tmp); 5808 } 5809 #endif 5810 5811 // Bit reversal algorithm first reverses the bits of each byte followed by 5812 // a byte level reversal for multi-byte primitive types (short/int/long). 5813 // Algorithm performs a lookup table access to get reverse bit sequence 5814 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5815 // is obtained by swapping the reverse bit sequences of upper and lower 5816 // nibble of a byte. 5817 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5818 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5819 if (VM_Version::supports_avx512vlbw()) { 5820 5821 // Get the reverse bit sequence of lower nibble of each byte. 5822 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5823 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5824 evpandq(dst, xtmp2, src, vec_enc); 5825 vpshufb(dst, xtmp1, dst, vec_enc); 5826 vpsllq(dst, dst, 4, vec_enc); 5827 5828 // Get the reverse bit sequence of upper nibble of each byte. 5829 vpandn(xtmp2, xtmp2, src, vec_enc); 5830 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5831 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5832 5833 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5834 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5835 evporq(xtmp2, dst, xtmp2, vec_enc); 5836 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5837 5838 } else if(vec_enc == Assembler::AVX_512bit) { 5839 // Shift based bit reversal. 5840 assert(bt == T_LONG || bt == T_INT, ""); 5841 5842 // Swap lower and upper nibble of each byte. 5843 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5844 5845 // Swap two least and most significant bits of each nibble. 5846 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5847 5848 // Swap adjacent pair of bits. 5849 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5850 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5851 5852 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5853 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5854 } else { 5855 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5856 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5857 5858 // Get the reverse bit sequence of lower nibble of each byte. 5859 vpand(dst, xtmp2, src, vec_enc); 5860 vpshufb(dst, xtmp1, dst, vec_enc); 5861 vpsllq(dst, dst, 4, vec_enc); 5862 5863 // Get the reverse bit sequence of upper nibble of each byte. 5864 vpandn(xtmp2, xtmp2, src, vec_enc); 5865 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5866 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5867 5868 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5869 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
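  // In scalar form the per-byte operation being assembled here is roughly (illustrative only;
  // REV4 denotes the 4-bit reverse lookup table held in xtmp1):
  //
  //   uint8_t reverse_bits_in_byte(uint8_t b) {
  //     return (uint8_t)((REV4[b & 0x0F] << 4) | REV4[b >> 4]);
  //   }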
5870 vpor(xtmp2, dst, xtmp2, vec_enc); 5871 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5872 } 5873 } 5874 5875 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5876 XMMRegister xtmp, Register rscratch) { 5877 assert(VM_Version::supports_gfni(), ""); 5878 assert(rscratch != noreg || always_reachable(mask), "missing"); 5879 5880 // Galois field instruction based bit reversal based on following algorithm. 5881 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5882 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5883 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5884 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5885 } 5886 5887 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5888 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5889 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5890 evpandq(dst, xtmp1, src, vec_enc); 5891 vpsllq(dst, dst, nbits, vec_enc); 5892 vpandn(xtmp1, xtmp1, src, vec_enc); 5893 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5894 evporq(dst, dst, xtmp1, vec_enc); 5895 } 5896 5897 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5898 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5899 // Shift based bit reversal. 5900 assert(VM_Version::supports_evex(), ""); 5901 switch(bt) { 5902 case T_LONG: 5903 // Swap upper and lower double word of each quad word. 5904 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5905 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5906 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5907 break; 5908 case T_INT: 5909 // Swap upper and lower word of each double word. 5910 evprord(xtmp1, k0, src, 16, true, vec_enc); 5911 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5912 break; 5913 case T_CHAR: 5914 case T_SHORT: 5915 // Swap upper and lower byte of each word. 5916 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5917 break; 5918 case T_BYTE: 5919 evmovdquq(dst, k0, src, true, vec_enc); 5920 break; 5921 default: 5922 fatal("Unsupported type %s", type2name(bt)); 5923 break; 5924 } 5925 } 5926 5927 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5928 if (bt == T_BYTE) { 5929 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5930 evmovdquq(dst, k0, src, true, vec_enc); 5931 } else { 5932 vmovdqu(dst, src); 5933 } 5934 return; 5935 } 5936 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5937 // pre-computed shuffle indices. 
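  // Per element this is an ordinary byte swap; e.g. for T_INT the permutation implements
  // (illustrative only):
  //
  //   jint reverse_bytes_int(jint x) {
  //     juint u = (juint) x;
  //     return (jint)((u << 24) | ((u & 0xFF00) << 8) | ((u >> 8) & 0xFF00) | (u >> 24));
  //   }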
5938 switch(bt) { 5939 case T_LONG: 5940 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5941 break; 5942 case T_INT: 5943 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5944 break; 5945 case T_CHAR: 5946 case T_SHORT: 5947 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5948 break; 5949 default: 5950 fatal("Unsupported type %s", type2name(bt)); 5951 break; 5952 } 5953 vpshufb(dst, src, dst, vec_enc); 5954 } 5955 5956 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5957 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5958 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5959 assert(is_integral_type(bt), ""); 5960 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5961 assert(VM_Version::supports_avx512cd(), ""); 5962 switch(bt) { 5963 case T_LONG: 5964 evplzcntq(dst, ktmp, src, merge, vec_enc); 5965 break; 5966 case T_INT: 5967 evplzcntd(dst, ktmp, src, merge, vec_enc); 5968 break; 5969 case T_SHORT: 5970 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5971 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5972 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5973 vpunpckhwd(dst, xtmp1, src, vec_enc); 5974 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5975 vpackusdw(dst, xtmp2, dst, vec_enc); 5976 break; 5977 case T_BYTE: 5978 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5979 // accessing the lookup table. 5980 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5981 // accessing the lookup table. 5982 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5983 assert(VM_Version::supports_avx512bw(), ""); 5984 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5985 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5986 vpand(xtmp2, dst, src, vec_enc); 5987 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5988 vpsrlw(xtmp3, src, 4, vec_enc); 5989 vpand(xtmp3, dst, xtmp3, vec_enc); 5990 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5991 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5992 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5993 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5994 break; 5995 default: 5996 fatal("Unsupported type %s", type2name(bt)); 5997 break; 5998 } 5999 } 6000 6001 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6002 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6003 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6004 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6005 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6006 // accessing the lookup table. 6007 vpand(dst, xtmp2, src, vec_enc); 6008 vpshufb(dst, xtmp1, dst, vec_enc); 6009 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6010 // accessing the lookup table. 6011 vpsrlw(xtmp3, src, 4, vec_enc); 6012 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6013 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6014 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
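  // Scalar sketch of the per-byte computation (illustrative only; CLZ4 is the nibble lookup
  // table, with CLZ4[0] == 4):
  //
  //   uint8_t clz_byte(uint8_t b) {
  //     return (b >> 4) ? CLZ4[b >> 4]          // high nibble non-zero: its count decides
  //                     : 4 + CLZ4[b & 0x0F];   // high nibble zero: add its four leading zeros
  //   }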
6015 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6016 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6017 vpaddb(dst, dst, xtmp2, vec_enc); 6018 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6019 } 6020 6021 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6022 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6023 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6024 // Add zero counts of lower byte and upper byte of a word if 6025 // upper byte holds a zero value. 6026 vpsrlw(xtmp3, src, 8, vec_enc); 6027 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6028 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6029 vpsllw(xtmp2, dst, 8, vec_enc); 6030 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6031 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6032 vpsrlw(dst, dst, 8, vec_enc); 6033 } 6034 6035 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6036 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6037 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6038 // hence biased exponent can be used to compute leading zero count as per 6039 // following formula:- 6040 // LZCNT = 32 - (biased_exp - 127) 6041 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6042 6043 // Broadcast 0xFF 6044 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6045 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6046 6047 // Extract biased exponent. 6048 vcvtdq2ps(dst, src, vec_enc); 6049 vpsrld(dst, dst, 23, vec_enc); 6050 vpand(dst, dst, xtmp1, vec_enc); 6051 6052 // Broadcast 127. 6053 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6054 // Exponent = biased_exp - 127 6055 vpsubd(dst, dst, xtmp1, vec_enc); 6056 6057 // Exponent = Exponent + 1 6058 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6059 vpaddd(dst, dst, xtmp3, vec_enc); 6060 6061 // Replace -ve exponent with zero, exponent is -ve when src 6062 // lane contains a zero value. 6063 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6064 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6065 6066 // Rematerialize broadcast 32. 6067 vpslld(xtmp1, xtmp3, 5, vec_enc); 6068 // Exponent is 32 if corresponding source lane contains max_int value. 6069 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6070 // LZCNT = 32 - exponent 6071 vpsubd(dst, xtmp1, dst, vec_enc); 6072 6073 // Replace LZCNT with a value 1 if corresponding source lane 6074 // contains max_int value. 6075 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6076 6077 // Replace biased_exp with 0 if source lane value is less than zero. 6078 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6079 vblendvps(dst, dst, xtmp2, src, vec_enc); 6080 } 6081 6082 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6083 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6084 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6085 // Add zero counts of lower word and upper word of a double word if 6086 // upper word holds a zero value. 6087 vpsrld(xtmp3, src, 16, vec_enc); 6088 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6089 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6090 vpslld(xtmp2, dst, 16, vec_enc); 6091 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6092 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6093 vpsrld(dst, dst, 16, vec_enc); 6094 // Add zero counts of lower doubleword and upper doubleword of a 6095 // quadword if upper doubleword holds a zero value. 
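  // Same halving recurrence as above, one level up (illustrative scalar form):
  //
  //   clz64(x) = (hi32(x) == 0) ? 32 + clz32(lo32(x)) : clz32(hi32(x));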
6096 vpsrlq(xtmp3, src, 32, vec_enc); 6097 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6098 vpsllq(xtmp2, dst, 32, vec_enc); 6099 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6100 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6101 vpsrlq(dst, dst, 32, vec_enc); 6102 } 6103 6104 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6105 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6106 Register rtmp, int vec_enc) { 6107 assert(is_integral_type(bt), "unexpected type"); 6108 assert(vec_enc < Assembler::AVX_512bit, ""); 6109 switch(bt) { 6110 case T_LONG: 6111 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6112 break; 6113 case T_INT: 6114 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6115 break; 6116 case T_SHORT: 6117 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6118 break; 6119 case T_BYTE: 6120 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6121 break; 6122 default: 6123 fatal("Unsupported type %s", type2name(bt)); 6124 break; 6125 } 6126 } 6127 6128 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6129 switch(bt) { 6130 case T_BYTE: 6131 vpsubb(dst, src1, src2, vec_enc); 6132 break; 6133 case T_SHORT: 6134 vpsubw(dst, src1, src2, vec_enc); 6135 break; 6136 case T_INT: 6137 vpsubd(dst, src1, src2, vec_enc); 6138 break; 6139 case T_LONG: 6140 vpsubq(dst, src1, src2, vec_enc); 6141 break; 6142 default: 6143 fatal("Unsupported type %s", type2name(bt)); 6144 break; 6145 } 6146 } 6147 6148 // Trailing zero count computation is based on leading zero count operation as per 6149 // following equation. All AVX3 targets support AVX512CD feature which offers 6150 // direct vector instruction to compute leading zero count. 
6151 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6152 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6153 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6154 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6155 assert(is_integral_type(bt), ""); 6156 // xtmp = -1 6157 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6158 // xtmp = xtmp + src 6159 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6160 // xtmp = xtmp & ~src 6161 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6162 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6163 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6164 vpsub(bt, dst, xtmp4, dst, vec_enc); 6165 } 6166 6167 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6168 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6169 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6170 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6171 assert(is_integral_type(bt), ""); 6172 // xtmp = 0 6173 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6174 // xtmp = 0 - src 6175 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6176 // xtmp = xtmp | src 6177 vpor(xtmp3, xtmp3, src, vec_enc); 6178 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6179 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6180 vpsub(bt, dst, xtmp1, dst, vec_enc); 6181 } 6182 6183 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6184 Label done; 6185 Label neg_divisor_fastpath; 6186 cmpl(divisor, 0); 6187 jccb(Assembler::less, neg_divisor_fastpath); 6188 xorl(rdx, rdx); 6189 divl(divisor); 6190 jmpb(done); 6191 bind(neg_divisor_fastpath); 6192 // Fastpath for divisor < 0: 6193 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6194 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6195 movl(rdx, rax); 6196 subl(rdx, divisor); 6197 if (VM_Version::supports_bmi1()) { 6198 andnl(rax, rdx, rax); 6199 } else { 6200 notl(rdx); 6201 andl(rax, rdx); 6202 } 6203 shrl(rax, 31); 6204 bind(done); 6205 } 6206 6207 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6208 Label done; 6209 Label neg_divisor_fastpath; 6210 cmpl(divisor, 0); 6211 jccb(Assembler::less, neg_divisor_fastpath); 6212 xorl(rdx, rdx); 6213 divl(divisor); 6214 jmpb(done); 6215 bind(neg_divisor_fastpath); 6216 // Fastpath when divisor < 0: 6217 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6218 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6219 movl(rdx, rax); 6220 subl(rax, divisor); 6221 if (VM_Version::supports_bmi1()) { 6222 andnl(rax, rax, rdx); 6223 } else { 6224 notl(rax); 6225 andl(rax, rdx); 6226 } 6227 sarl(rax, 31); 6228 andl(rax, divisor); 6229 subl(rdx, rax); 6230 bind(done); 6231 } 6232 6233 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6234 Label done; 6235 Label neg_divisor_fastpath; 6236 6237 cmpl(divisor, 0); 6238 jccb(Assembler::less, neg_divisor_fastpath); 6239 xorl(rdx, rdx); 6240 divl(divisor); 6241 jmpb(done); 6242 bind(neg_divisor_fastpath); 6243 // Fastpath for divisor < 0: 6244 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6245 // 
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient  = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap the lower and upper 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap the lower and upper 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}
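// The non-GFNI path in reverseI (and reverseL below) mirrors the classic scalar
// bit-reversal recipe: three mask/shift/or stages reverse the bits within each
// byte, and a final byte swap reverses the byte order. Illustrative scalar
// sketch (not emitted code):
//
//   static uint32_t scalar_reverse32(uint32_t x) {
//     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1); // swap adjacent bits
//     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2); // swap 2-bit pairs
//     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4); // swap nibbles
//     return __builtin_bswap32(x);                           // swap byte order
//   }
//
// The GFNI path instead applies gf2p8affineqb with the bit matrix
// 0x8040201008040201, which reverses the bits of every byte in a single
// instruction (see the article linked above); bswap then reverses the byte order.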
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap the lower and upper 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap the lower and upper 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient  = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the shuffle indices are taken from
  // the lower 4 bits of each shuffle byte, so all shuffle indices are effectively
  // normalized to the range 0-15. Indices that differ only in their upper bits
  // therefore select the same relative position within a 128-bit lane, e.g.
  // shuffle indices 16, 32 and 48 all select the first element of their
  // respective 128-bit lanes.
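  //
  // Conceptually, for valid shuffle indices in the range 0-63, the code below
  // computes the following (illustrative scalar sketch only, not emitted code;
  // shown for a 512-bit vector with byte arrays dst, src and shuffle of length 64):
  //
  //   for (int i = 0; i < 64; i++) {
  //     int idx  = shuffle[i];   // 0..63
  //     int lane = idx >> 4;     // which 128-bit source lane (0..3)
  //     int off  = idx & 0x0F;   // in-lane offset, i.e. the normalized index
  //     dst[i]   = src[lane * 16 + off];
  //   }
  //
  // Each of the four steps below broadcasts one source lane, selects bytes with
  // an in-lane shuffle, and merges the result under a mask that is true exactly
  // for the indices belonging to that lane.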
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing the indices with the
  // expression INDEX < 16. Broadcast the first 128-bit lane across the entire
  // vector, shuffle the vector lanes using the original shuffle indices and
  // move the shuffled lanes corresponding to a true mask into the destination
  // vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}
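// vector_rearrange_int_float selects whole 32-bit elements rather than bytes.
// An illustrative scalar sketch of the operation (not emitted code; elem_count
// would be 4, 8 or 16 depending on the vector length, and the shuffle indices
// are assumed to be in range):
//
//   for (int i = 0; i < elem_count; i++) {
//     dst[i] = src[shuffle[i]];
//   }
//
// For 128-bit vectors vpermilps is sufficient, since the whole vector is a
// single 128-bit lane; for 256-bit and 512-bit vectors the cross-lane permutes
// vpermd (T_INT) and vpermps (T_FLOAT) are used instead.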