/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.
// fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value);                           // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);                               // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                                     // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);            // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                      // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);      // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
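  //
  // Informal sketch of the 1-0 exit emitted below. This is comments only; it
  // simply restates, in pseudo code, the instruction sequence that follows:
  //   if (m->_recursions != 0) { m->_recursions--; ZF = 1; }      // recursive unlock
  //   else {
  //     m->_owner = nullptr;                                      // release the lock
  //     StoreLoad;                                                // full fence, avoid stranding
  //     if ((m->EntryList | m->cxq) == 0 || m->_succ != nullptr) ZF = 1;   // 1-0 exit done
  //     else { save m in JavaThread::unlocked_inflated_monitor; ZF = 0; }  // force slow path
  //   }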
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
#ifndef _LP64
  get_thread(boxReg);
  movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
#else // _LP64
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
#endif

  orl  (boxReg, 1);                   // set ICC.ZF=0 to indicate failure
  jmpb (DONE_LABEL);

  bind (LSuccess);
  testl(boxReg, 0);                   // set ICC.ZF=1 to indicate success
  jmpb (DONE_LABEL);

  if (LockingMode == LM_LEGACY) {
    bind (Stacked);
    movptr(tmpReg, Address(boxReg, 0));   // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty (EntryList first - by convention).
    movptr(reg_rax, EntryList_address);
    orptr(reg_rax, cxq_address);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
1168 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1169 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1170 vminps(dst, atmp, btmp, vlen_enc); 1171 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1172 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1173 } else if (!is_double_word && !is_min) { 1174 evpmovd2m(ktmp, b, vlen_enc); 1175 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1176 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1177 vmaxps(dst, atmp, btmp, vlen_enc); 1178 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1179 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1180 } else if (is_double_word && is_min) { 1181 evpmovq2m(ktmp, a, vlen_enc); 1182 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1183 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1184 vminpd(dst, atmp, btmp, vlen_enc); 1185 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1186 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1187 } else { 1188 assert(is_double_word && !is_min, "sanity"); 1189 evpmovq2m(ktmp, b, vlen_enc); 1190 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1191 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1192 vmaxpd(dst, atmp, btmp, vlen_enc); 1193 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1194 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1195 } 1196 } 1197 1198 // Float/Double signum 1199 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1200 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1201 1202 Label DONE_LABEL; 1203 1204 if (opcode == Op_SignumF) { 1205 assert(UseSSE > 0, "required"); 1206 ucomiss(dst, zero); 1207 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1208 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1209 movflt(dst, one); 1210 jcc(Assembler::above, DONE_LABEL); 1211 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1212 } else if (opcode == Op_SignumD) { 1213 assert(UseSSE > 1, "required"); 1214 ucomisd(dst, zero); 1215 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1216 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1217 movdbl(dst, one); 1218 jcc(Assembler::above, DONE_LABEL); 1219 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1220 } 1221 1222 bind(DONE_LABEL); 1223 } 1224 1225 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1226 if (sign) { 1227 pmovsxbw(dst, src); 1228 } else { 1229 pmovzxbw(dst, src); 1230 } 1231 } 1232 1233 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1234 if (sign) { 1235 vpmovsxbw(dst, src, vector_len); 1236 } else { 1237 vpmovzxbw(dst, src, vector_len); 1238 } 1239 } 1240 1241 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1242 if (sign) { 1243 vpmovsxbd(dst, src, vector_len); 1244 } else { 1245 vpmovzxbd(dst, src, vector_len); 1246 } 1247 } 1248 1249 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1250 if (sign) { 1251 vpmovsxwd(dst, src, vector_len); 1252 } else { 1253 vpmovzxwd(dst, src, vector_len); 1254 } 1255 } 1256 1257 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1258 int shift, int vector_len) { 1259 if (opcode == 
Op_RotateLeftV) { 1260 if (etype == T_INT) { 1261 evprold(dst, src, shift, vector_len); 1262 } else { 1263 assert(etype == T_LONG, "expected type T_LONG"); 1264 evprolq(dst, src, shift, vector_len); 1265 } 1266 } else { 1267 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1268 if (etype == T_INT) { 1269 evprord(dst, src, shift, vector_len); 1270 } else { 1271 assert(etype == T_LONG, "expected type T_LONG"); 1272 evprorq(dst, src, shift, vector_len); 1273 } 1274 } 1275 } 1276 1277 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1278 XMMRegister shift, int vector_len) { 1279 if (opcode == Op_RotateLeftV) { 1280 if (etype == T_INT) { 1281 evprolvd(dst, src, shift, vector_len); 1282 } else { 1283 assert(etype == T_LONG, "expected type T_LONG"); 1284 evprolvq(dst, src, shift, vector_len); 1285 } 1286 } else { 1287 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1288 if (etype == T_INT) { 1289 evprorvd(dst, src, shift, vector_len); 1290 } else { 1291 assert(etype == T_LONG, "expected type T_LONG"); 1292 evprorvq(dst, src, shift, vector_len); 1293 } 1294 } 1295 } 1296 1297 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1298 if (opcode == Op_RShiftVI) { 1299 psrad(dst, shift); 1300 } else if (opcode == Op_LShiftVI) { 1301 pslld(dst, shift); 1302 } else { 1303 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1304 psrld(dst, shift); 1305 } 1306 } 1307 1308 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1309 switch (opcode) { 1310 case Op_RShiftVI: psrad(dst, shift); break; 1311 case Op_LShiftVI: pslld(dst, shift); break; 1312 case Op_URShiftVI: psrld(dst, shift); break; 1313 1314 default: assert(false, "%s", NodeClassNames[opcode]); 1315 } 1316 } 1317 1318 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1319 if (opcode == Op_RShiftVI) { 1320 vpsrad(dst, nds, shift, vector_len); 1321 } else if (opcode == Op_LShiftVI) { 1322 vpslld(dst, nds, shift, vector_len); 1323 } else { 1324 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1325 vpsrld(dst, nds, shift, vector_len); 1326 } 1327 } 1328 1329 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1330 switch (opcode) { 1331 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1332 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1333 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1334 1335 default: assert(false, "%s", NodeClassNames[opcode]); 1336 } 1337 } 1338 1339 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1340 switch (opcode) { 1341 case Op_RShiftVB: // fall-through 1342 case Op_RShiftVS: psraw(dst, shift); break; 1343 1344 case Op_LShiftVB: // fall-through 1345 case Op_LShiftVS: psllw(dst, shift); break; 1346 1347 case Op_URShiftVS: // fall-through 1348 case Op_URShiftVB: psrlw(dst, shift); break; 1349 1350 default: assert(false, "%s", NodeClassNames[opcode]); 1351 } 1352 } 1353 1354 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1355 switch (opcode) { 1356 case Op_RShiftVB: // fall-through 1357 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1358 1359 case Op_LShiftVB: // fall-through 1360 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1361 1362 case Op_URShiftVS: // 
fall-through 1363 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1364 1365 default: assert(false, "%s", NodeClassNames[opcode]); 1366 } 1367 } 1368 1369 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1370 switch (opcode) { 1371 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1372 case Op_LShiftVL: psllq(dst, shift); break; 1373 case Op_URShiftVL: psrlq(dst, shift); break; 1374 1375 default: assert(false, "%s", NodeClassNames[opcode]); 1376 } 1377 } 1378 1379 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1380 if (opcode == Op_RShiftVL) { 1381 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1382 } else if (opcode == Op_LShiftVL) { 1383 psllq(dst, shift); 1384 } else { 1385 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1386 psrlq(dst, shift); 1387 } 1388 } 1389 1390 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1391 switch (opcode) { 1392 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1393 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1394 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1395 1396 default: assert(false, "%s", NodeClassNames[opcode]); 1397 } 1398 } 1399 1400 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1401 if (opcode == Op_RShiftVL) { 1402 evpsraq(dst, nds, shift, vector_len); 1403 } else if (opcode == Op_LShiftVL) { 1404 vpsllq(dst, nds, shift, vector_len); 1405 } else { 1406 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1407 vpsrlq(dst, nds, shift, vector_len); 1408 } 1409 } 1410 1411 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1412 switch (opcode) { 1413 case Op_RShiftVB: // fall-through 1414 case Op_RShiftVS: // fall-through 1415 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1416 1417 case Op_LShiftVB: // fall-through 1418 case Op_LShiftVS: // fall-through 1419 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1420 1421 case Op_URShiftVB: // fall-through 1422 case Op_URShiftVS: // fall-through 1423 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1424 1425 default: assert(false, "%s", NodeClassNames[opcode]); 1426 } 1427 } 1428 1429 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1430 switch (opcode) { 1431 case Op_RShiftVB: // fall-through 1432 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1433 1434 case Op_LShiftVB: // fall-through 1435 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1436 1437 case Op_URShiftVB: // fall-through 1438 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1439 1440 default: assert(false, "%s", NodeClassNames[opcode]); 1441 } 1442 } 1443 1444 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1445 assert(UseAVX >= 2, "required"); 1446 switch (opcode) { 1447 case Op_RShiftVL: { 1448 if (UseAVX > 2) { 1449 assert(tmp == xnoreg, "not used"); 1450 if (!VM_Version::supports_avx512vl()) { 1451 vlen_enc = Assembler::AVX_512bit; 1452 } 1453 evpsravq(dst, src, shift, vlen_enc); 1454 } else { 1455 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1456 vpsrlvq(dst, src, shift, 
vlen_enc); 1457 vpsrlvq(tmp, tmp, shift, vlen_enc); 1458 vpxor(dst, dst, tmp, vlen_enc); 1459 vpsubq(dst, dst, tmp, vlen_enc); 1460 } 1461 break; 1462 } 1463 case Op_LShiftVL: { 1464 assert(tmp == xnoreg, "not used"); 1465 vpsllvq(dst, src, shift, vlen_enc); 1466 break; 1467 } 1468 case Op_URShiftVL: { 1469 assert(tmp == xnoreg, "not used"); 1470 vpsrlvq(dst, src, shift, vlen_enc); 1471 break; 1472 } 1473 default: assert(false, "%s", NodeClassNames[opcode]); 1474 } 1475 } 1476 1477 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1478 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1479 assert(opcode == Op_LShiftVB || 1480 opcode == Op_RShiftVB || 1481 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1482 bool sign = (opcode != Op_URShiftVB); 1483 assert(vector_len == 0, "required"); 1484 vextendbd(sign, dst, src, 1); 1485 vpmovzxbd(vtmp, shift, 1); 1486 varshiftd(opcode, dst, dst, vtmp, 1); 1487 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1488 vextracti128_high(vtmp, dst); 1489 vpackusdw(dst, dst, vtmp, 0); 1490 } 1491 1492 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1493 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1494 assert(opcode == Op_LShiftVB || 1495 opcode == Op_RShiftVB || 1496 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1497 bool sign = (opcode != Op_URShiftVB); 1498 int ext_vector_len = vector_len + 1; 1499 vextendbw(sign, dst, src, ext_vector_len); 1500 vpmovzxbw(vtmp, shift, ext_vector_len); 1501 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1502 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1503 if (vector_len == 0) { 1504 vextracti128_high(vtmp, dst); 1505 vpackuswb(dst, dst, vtmp, vector_len); 1506 } else { 1507 vextracti64x4_high(vtmp, dst); 1508 vpackuswb(dst, dst, vtmp, vector_len); 1509 vpermq(dst, dst, 0xD8, vector_len); 1510 } 1511 } 1512 1513 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1514 switch(typ) { 1515 case T_BYTE: 1516 pinsrb(dst, val, idx); 1517 break; 1518 case T_SHORT: 1519 pinsrw(dst, val, idx); 1520 break; 1521 case T_INT: 1522 pinsrd(dst, val, idx); 1523 break; 1524 case T_LONG: 1525 pinsrq(dst, val, idx); 1526 break; 1527 default: 1528 assert(false,"Should not reach here."); 1529 break; 1530 } 1531 } 1532 1533 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1534 switch(typ) { 1535 case T_BYTE: 1536 vpinsrb(dst, src, val, idx); 1537 break; 1538 case T_SHORT: 1539 vpinsrw(dst, src, val, idx); 1540 break; 1541 case T_INT: 1542 vpinsrd(dst, src, val, idx); 1543 break; 1544 case T_LONG: 1545 vpinsrq(dst, src, val, idx); 1546 break; 1547 default: 1548 assert(false,"Should not reach here."); 1549 break; 1550 } 1551 } 1552 1553 #ifdef _LP64 1554 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1555 XMMRegister dst, Register base, 1556 Register idx_base, 1557 Register offset, Register mask, 1558 Register mask_idx, Register rtmp, 1559 int vlen_enc) { 1560 vpxor(dst, dst, dst, vlen_enc); 1561 if (elem_bt == T_SHORT) { 1562 for (int i = 0; i < 4; i++) { 1563 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1564 Label skip_load; 1565 btq(mask, mask_idx); 1566 jccb(Assembler::carryClear, skip_load); 1567 movl(rtmp, Address(idx_base, i * 4)); 1568 if (offset != noreg) { 1569 addl(rtmp, offset); 1570 } 1571 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1572 bind(skip_load); 1573 incq(mask_idx); 1574 } 1575 } else { 1576 assert(elem_bt == T_BYTE, ""); 1577 for (int i = 0; i < 8; i++) { 1578 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1579 Label skip_load; 1580 btq(mask, mask_idx); 1581 jccb(Assembler::carryClear, skip_load); 1582 movl(rtmp, Address(idx_base, i * 4)); 1583 if (offset != noreg) { 1584 addl(rtmp, offset); 1585 } 1586 pinsrb(dst, Address(base, rtmp), i); 1587 bind(skip_load); 1588 incq(mask_idx); 1589 } 1590 } 1591 } 1592 #endif // _LP64 1593 1594 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1595 Register base, Register idx_base, 1596 Register offset, Register rtmp, 1597 int vlen_enc) { 1598 vpxor(dst, dst, dst, vlen_enc); 1599 if (elem_bt == T_SHORT) { 1600 for (int i = 0; i < 4; i++) { 1601 // dst[i] = src[offset + idx_base[i]] 1602 movl(rtmp, Address(idx_base, i * 4)); 1603 if (offset != noreg) { 1604 addl(rtmp, offset); 1605 } 1606 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1607 } 1608 } else { 1609 assert(elem_bt == T_BYTE, ""); 1610 for (int i = 0; i < 8; i++) { 1611 // dst[i] = src[offset + idx_base[i]] 1612 movl(rtmp, Address(idx_base, i * 4)); 1613 if (offset != noreg) { 1614 addl(rtmp, offset); 1615 } 1616 pinsrb(dst, Address(base, rtmp), i); 1617 } 1618 } 1619 } 1620 1621 /* 1622 * Gather using hybrid algorithm, first partially unroll scalar loop 1623 * to accumulate values from gather indices into a quad-word(64bit) slice. 1624 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1625 * permutation to place the slice into appropriate vector lane 1626 * locations in destination vector. Following pseudo code describes the 1627 * algorithm in detail: 1628 * 1629 * DST_VEC = ZERO_VEC 1630 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1631 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1632 * FOREACH_ITER: 1633 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1634 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1635 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1636 * PERM_INDEX = PERM_INDEX - TWO_VEC 1637 * 1638 * With each iteration, doubleword permute indices (0,1) corresponding 1639 * to gathered quadword gets right shifted by two lane positions. 
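 *
 * Illustrative scalar sketch of one iteration (not part of the emitted code,
 * added only to clarify the pseudo code above; byte elements assumed):
 *
 *   uint64_t slice = 0;                       // TMP_VEC_64
 *   for (int i = 0; i < 8; i++) {             // PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     slice |= (uint64_t)base[offset + idx_base[i]] << (8 * i);
 *   }
 *   // The permute then routes this 64-bit slice into the doubleword pair
 *   // currently selected by PERM_INDEX before it is OR-ed into DST_VEC.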
1640 * 1641 */ 1642 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1643 Register base, Register idx_base, 1644 Register offset, Register mask, 1645 XMMRegister xtmp1, XMMRegister xtmp2, 1646 XMMRegister temp_dst, Register rtmp, 1647 Register mask_idx, Register length, 1648 int vector_len, int vlen_enc) { 1649 Label GATHER8_LOOP; 1650 assert(is_subword_type(elem_ty), ""); 1651 movl(length, vector_len); 1652 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1653 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1654 vallones(xtmp2, vlen_enc); 1655 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1656 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1657 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1658 1659 bind(GATHER8_LOOP); 1660 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1661 if (mask == noreg) { 1662 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1663 } else { 1664 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1665 } 1666 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1667 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1668 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1669 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1670 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1671 vpor(dst, dst, temp_dst, vlen_enc); 1672 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1673 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1674 jcc(Assembler::notEqual, GATHER8_LOOP); 1675 } 1676 1677 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1678 switch(typ) { 1679 case T_INT: 1680 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1681 break; 1682 case T_FLOAT: 1683 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1684 break; 1685 case T_LONG: 1686 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1687 break; 1688 case T_DOUBLE: 1689 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1690 break; 1691 default: 1692 assert(false,"Should not reach here."); 1693 break; 1694 } 1695 } 1696 1697 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1698 switch(typ) { 1699 case T_INT: 1700 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1701 break; 1702 case T_FLOAT: 1703 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1704 break; 1705 case T_LONG: 1706 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1707 break; 1708 case T_DOUBLE: 1709 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1710 break; 1711 default: 1712 assert(false,"Should not reach here."); 1713 break; 1714 } 1715 } 1716 1717 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1718 switch(typ) { 1719 case T_INT: 1720 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1721 break; 1722 case T_FLOAT: 1723 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1724 break; 1725 case T_LONG: 1726 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1727 break; 1728 case 
T_DOUBLE: 1729 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1730 break; 1731 default: 1732 assert(false,"Should not reach here."); 1733 break; 1734 } 1735 } 1736 1737 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1738 if (vlen_in_bytes <= 16) { 1739 pxor (dst, dst); 1740 psubb(dst, src); 1741 switch (elem_bt) { 1742 case T_BYTE: /* nothing to do */ break; 1743 case T_SHORT: pmovsxbw(dst, dst); break; 1744 case T_INT: pmovsxbd(dst, dst); break; 1745 case T_FLOAT: pmovsxbd(dst, dst); break; 1746 case T_LONG: pmovsxbq(dst, dst); break; 1747 case T_DOUBLE: pmovsxbq(dst, dst); break; 1748 1749 default: assert(false, "%s", type2name(elem_bt)); 1750 } 1751 } else { 1752 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1753 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1754 1755 vpxor (dst, dst, dst, vlen_enc); 1756 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1757 1758 switch (elem_bt) { 1759 case T_BYTE: /* nothing to do */ break; 1760 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1761 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1762 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1763 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1764 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1765 1766 default: assert(false, "%s", type2name(elem_bt)); 1767 } 1768 } 1769 } 1770 1771 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1772 if (novlbwdq) { 1773 vpmovsxbd(xtmp, src, vlen_enc); 1774 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1775 Assembler::eq, true, vlen_enc, noreg); 1776 } else { 1777 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1778 vpsubb(xtmp, xtmp, src, vlen_enc); 1779 evpmovb2m(dst, xtmp, vlen_enc); 1780 } 1781 } 1782 1783 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1784 switch (vlen_in_bytes) { 1785 case 4: movdl(dst, src); break; 1786 case 8: movq(dst, src); break; 1787 case 16: movdqu(dst, src); break; 1788 case 32: vmovdqu(dst, src); break; 1789 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1790 default: ShouldNotReachHere(); 1791 } 1792 } 1793 1794 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1795 assert(rscratch != noreg || always_reachable(src), "missing"); 1796 1797 if (reachable(src)) { 1798 load_vector(dst, as_Address(src), vlen_in_bytes); 1799 } else { 1800 lea(rscratch, src); 1801 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1802 } 1803 } 1804 1805 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1806 int vlen_enc = vector_length_encoding(vlen); 1807 if (VM_Version::supports_avx()) { 1808 if (bt == T_LONG) { 1809 if (VM_Version::supports_avx2()) { 1810 vpbroadcastq(dst, src, vlen_enc); 1811 } else { 1812 vmovddup(dst, src, vlen_enc); 1813 } 1814 } else if (bt == T_DOUBLE) { 1815 if (vlen_enc != Assembler::AVX_128bit) { 1816 vbroadcastsd(dst, src, vlen_enc, noreg); 1817 } else { 1818 vmovddup(dst, src, vlen_enc); 1819 } 1820 } else { 1821 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1822 vpbroadcastd(dst, src, vlen_enc); 1823 } else { 1824 vbroadcastss(dst, src, vlen_enc); 1825 } 1826 } 1827 } else if (VM_Version::supports_sse3()) { 1828 movddup(dst, src); 1829 } else { 1830 movq(dst, 
src); 1831 if (vlen == 16) { 1832 punpcklqdq(dst, dst); 1833 } 1834 } 1835 } 1836 1837 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1838 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1839 int offset = exact_log2(type2aelembytes(bt)) << 6; 1840 if (is_floating_point_type(bt)) { 1841 offset += 128; 1842 } 1843 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1844 load_vector(dst, addr, vlen_in_bytes); 1845 } 1846 1847 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1848 1849 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1850 int vector_len = Assembler::AVX_128bit; 1851 1852 switch (opcode) { 1853 case Op_AndReductionV: pand(dst, src); break; 1854 case Op_OrReductionV: por (dst, src); break; 1855 case Op_XorReductionV: pxor(dst, src); break; 1856 case Op_MinReductionV: 1857 switch (typ) { 1858 case T_BYTE: pminsb(dst, src); break; 1859 case T_SHORT: pminsw(dst, src); break; 1860 case T_INT: pminsd(dst, src); break; 1861 case T_LONG: assert(UseAVX > 2, "required"); 1862 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1863 default: assert(false, "wrong type"); 1864 } 1865 break; 1866 case Op_MaxReductionV: 1867 switch (typ) { 1868 case T_BYTE: pmaxsb(dst, src); break; 1869 case T_SHORT: pmaxsw(dst, src); break; 1870 case T_INT: pmaxsd(dst, src); break; 1871 case T_LONG: assert(UseAVX > 2, "required"); 1872 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1873 default: assert(false, "wrong type"); 1874 } 1875 break; 1876 case Op_AddReductionVF: addss(dst, src); break; 1877 case Op_AddReductionVD: addsd(dst, src); break; 1878 case Op_AddReductionVI: 1879 switch (typ) { 1880 case T_BYTE: paddb(dst, src); break; 1881 case T_SHORT: paddw(dst, src); break; 1882 case T_INT: paddd(dst, src); break; 1883 default: assert(false, "wrong type"); 1884 } 1885 break; 1886 case Op_AddReductionVL: paddq(dst, src); break; 1887 case Op_MulReductionVF: mulss(dst, src); break; 1888 case Op_MulReductionVD: mulsd(dst, src); break; 1889 case Op_MulReductionVI: 1890 switch (typ) { 1891 case T_SHORT: pmullw(dst, src); break; 1892 case T_INT: pmulld(dst, src); break; 1893 default: assert(false, "wrong type"); 1894 } 1895 break; 1896 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1897 evpmullq(dst, dst, src, vector_len); break; 1898 default: assert(false, "wrong opcode"); 1899 } 1900 } 1901 1902 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1903 switch (opcode) { 1904 case Op_AddReductionVF: addps(dst, src); break; 1905 case Op_AddReductionVD: addpd(dst, src); break; 1906 case Op_MulReductionVF: mulps(dst, src); break; 1907 case Op_MulReductionVD: mulpd(dst, src); break; 1908 default: assert(false, "%s", NodeClassNames[opcode]); 1909 } 1910 } 1911 1912 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1913 int vector_len = Assembler::AVX_256bit; 1914 1915 switch (opcode) { 1916 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1917 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1918 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1919 case Op_MinReductionV: 1920 switch (typ) { 1921 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1922 case T_SHORT: vpminsw(dst, src1, src2, vector_len); 
break; 1923 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1924 case T_LONG: assert(UseAVX > 2, "required"); 1925 vpminsq(dst, src1, src2, vector_len); break; 1926 default: assert(false, "wrong type"); 1927 } 1928 break; 1929 case Op_MaxReductionV: 1930 switch (typ) { 1931 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1932 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1933 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1934 case T_LONG: assert(UseAVX > 2, "required"); 1935 vpmaxsq(dst, src1, src2, vector_len); break; 1936 default: assert(false, "wrong type"); 1937 } 1938 break; 1939 case Op_AddReductionVI: 1940 switch (typ) { 1941 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1942 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1943 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1944 default: assert(false, "wrong type"); 1945 } 1946 break; 1947 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1948 case Op_MulReductionVI: 1949 switch (typ) { 1950 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1951 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1952 default: assert(false, "wrong type"); 1953 } 1954 break; 1955 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1956 default: assert(false, "wrong opcode"); 1957 } 1958 } 1959 1960 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1961 int vector_len = Assembler::AVX_256bit; 1962 1963 switch (opcode) { 1964 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1965 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1966 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1967 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1968 default: assert(false, "%s", NodeClassNames[opcode]); 1969 } 1970 } 1971 1972 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1973 XMMRegister dst, XMMRegister src, 1974 XMMRegister vtmp1, XMMRegister vtmp2) { 1975 switch (opcode) { 1976 case Op_AddReductionVF: 1977 case Op_MulReductionVF: 1978 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1979 break; 1980 1981 case Op_AddReductionVD: 1982 case Op_MulReductionVD: 1983 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1984 break; 1985 1986 default: assert(false, "wrong opcode"); 1987 } 1988 } 1989 1990 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1991 XMMRegister dst, XMMRegister src, 1992 XMMRegister vtmp1, XMMRegister vtmp2) { 1993 switch (opcode) { 1994 case Op_AddReductionVF: 1995 case Op_MulReductionVF: 1996 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1997 break; 1998 1999 case Op_AddReductionVD: 2000 case Op_MulReductionVD: 2001 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2002 break; 2003 2004 default: assert(false, "%s", NodeClassNames[opcode]); 2005 } 2006 } 2007 2008 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2009 Register dst, Register src1, XMMRegister src2, 2010 XMMRegister vtmp1, XMMRegister vtmp2) { 2011 switch (vlen) { 2012 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2013 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2014 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2015 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2016 2017 default: assert(false, "wrong vector length"); 2018 } 2019 } 2020 2021 void C2_MacroAssembler::mulreduceB(int 
opcode, int vlen, 2022 Register dst, Register src1, XMMRegister src2, 2023 XMMRegister vtmp1, XMMRegister vtmp2) { 2024 switch (vlen) { 2025 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2026 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2027 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2028 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2029 2030 default: assert(false, "wrong vector length"); 2031 } 2032 } 2033 2034 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2035 Register dst, Register src1, XMMRegister src2, 2036 XMMRegister vtmp1, XMMRegister vtmp2) { 2037 switch (vlen) { 2038 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2039 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2040 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2041 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2042 2043 default: assert(false, "wrong vector length"); 2044 } 2045 } 2046 2047 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2048 Register dst, Register src1, XMMRegister src2, 2049 XMMRegister vtmp1, XMMRegister vtmp2) { 2050 switch (vlen) { 2051 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2052 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2053 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2054 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2055 2056 default: assert(false, "wrong vector length"); 2057 } 2058 } 2059 2060 #ifdef _LP64 2061 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2062 Register dst, Register src1, XMMRegister src2, 2063 XMMRegister vtmp1, XMMRegister vtmp2) { 2064 switch (vlen) { 2065 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2066 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2067 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2068 2069 default: assert(false, "wrong vector length"); 2070 } 2071 } 2072 #endif // _LP64 2073 2074 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2075 switch (vlen) { 2076 case 2: 2077 assert(vtmp2 == xnoreg, ""); 2078 reduce2F(opcode, dst, src, vtmp1); 2079 break; 2080 case 4: 2081 assert(vtmp2 == xnoreg, ""); 2082 reduce4F(opcode, dst, src, vtmp1); 2083 break; 2084 case 8: 2085 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2086 break; 2087 case 16: 2088 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2089 break; 2090 default: assert(false, "wrong vector length"); 2091 } 2092 } 2093 2094 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2095 switch (vlen) { 2096 case 2: 2097 assert(vtmp2 == xnoreg, ""); 2098 reduce2D(opcode, dst, src, vtmp1); 2099 break; 2100 case 4: 2101 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2102 break; 2103 case 8: 2104 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2105 break; 2106 default: assert(false, "wrong vector length"); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 switch (vlen) { 2112 case 2: 2113 assert(vtmp1 == xnoreg, ""); 2114 assert(vtmp2 == xnoreg, ""); 2115 unorderedReduce2F(opcode, dst, src); 2116 break; 2117 case 4: 2118 assert(vtmp2 == xnoreg, ""); 2119 unorderedReduce4F(opcode, dst, src, vtmp1); 2120 break; 2121 case 8: 2122 
unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2123 break; 2124 case 16: 2125 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2126 break; 2127 default: assert(false, "wrong vector length"); 2128 } 2129 } 2130 2131 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 switch (vlen) { 2133 case 2: 2134 assert(vtmp1 == xnoreg, ""); 2135 assert(vtmp2 == xnoreg, ""); 2136 unorderedReduce2D(opcode, dst, src); 2137 break; 2138 case 4: 2139 assert(vtmp2 == xnoreg, ""); 2140 unorderedReduce4D(opcode, dst, src, vtmp1); 2141 break; 2142 case 8: 2143 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2144 break; 2145 default: assert(false, "wrong vector length"); 2146 } 2147 } 2148 2149 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2150 if (opcode == Op_AddReductionVI) { 2151 if (vtmp1 != src2) { 2152 movdqu(vtmp1, src2); 2153 } 2154 phaddd(vtmp1, vtmp1); 2155 } else { 2156 pshufd(vtmp1, src2, 0x1); 2157 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2158 } 2159 movdl(vtmp2, src1); 2160 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2161 movdl(dst, vtmp1); 2162 } 2163 2164 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2165 if (opcode == Op_AddReductionVI) { 2166 if (vtmp1 != src2) { 2167 movdqu(vtmp1, src2); 2168 } 2169 phaddd(vtmp1, src2); 2170 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2171 } else { 2172 pshufd(vtmp2, src2, 0xE); 2173 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2174 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2175 } 2176 } 2177 2178 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2179 if (opcode == Op_AddReductionVI) { 2180 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2181 vextracti128_high(vtmp2, vtmp1); 2182 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2183 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2184 } else { 2185 vextracti128_high(vtmp1, src2); 2186 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2187 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2188 } 2189 } 2190 2191 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2192 vextracti64x4_high(vtmp2, src2); 2193 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2194 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2195 } 2196 2197 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2198 pshufd(vtmp2, src2, 0x1); 2199 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2200 movdqu(vtmp1, vtmp2); 2201 psrldq(vtmp1, 2); 2202 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2203 movdqu(vtmp2, vtmp1); 2204 psrldq(vtmp2, 1); 2205 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2206 movdl(vtmp2, src1); 2207 pmovsxbd(vtmp1, vtmp1); 2208 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2209 pextrb(dst, vtmp1, 0x0); 2210 movsbl(dst, dst); 2211 } 2212 2213 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2214 pshufd(vtmp1, src2, 0xE); 2215 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2216 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2217 } 2218 2219 void 
C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2220 vextracti128_high(vtmp2, src2); 2221 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2222 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2223 } 2224 2225 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2226 vextracti64x4_high(vtmp1, src2); 2227 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2228 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2229 } 2230 2231 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2232 pmovsxbw(vtmp2, src2); 2233 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2234 } 2235 2236 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2237 if (UseAVX > 1) { 2238 int vector_len = Assembler::AVX_256bit; 2239 vpmovsxbw(vtmp1, src2, vector_len); 2240 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2241 } else { 2242 pmovsxbw(vtmp2, src2); 2243 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2244 pshufd(vtmp2, src2, 0x1); 2245 pmovsxbw(vtmp2, src2); 2246 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2247 } 2248 } 2249 2250 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2251 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2252 int vector_len = Assembler::AVX_512bit; 2253 vpmovsxbw(vtmp1, src2, vector_len); 2254 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2255 } else { 2256 assert(UseAVX >= 2,"Should not reach here."); 2257 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2258 vextracti128_high(vtmp2, src2); 2259 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2260 } 2261 } 2262 2263 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2264 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2265 vextracti64x4_high(vtmp2, src2); 2266 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2267 } 2268 2269 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2270 if (opcode == Op_AddReductionVI) { 2271 if (vtmp1 != src2) { 2272 movdqu(vtmp1, src2); 2273 } 2274 phaddw(vtmp1, vtmp1); 2275 phaddw(vtmp1, vtmp1); 2276 } else { 2277 pshufd(vtmp2, src2, 0x1); 2278 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2279 movdqu(vtmp1, vtmp2); 2280 psrldq(vtmp1, 2); 2281 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2282 } 2283 movdl(vtmp2, src1); 2284 pmovsxwd(vtmp1, vtmp1); 2285 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2286 pextrw(dst, vtmp1, 0x0); 2287 movswl(dst, dst); 2288 } 2289 2290 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2291 if (opcode == Op_AddReductionVI) { 2292 if (vtmp1 != src2) { 2293 movdqu(vtmp1, src2); 2294 } 2295 phaddw(vtmp1, src2); 2296 } else { 2297 pshufd(vtmp1, src2, 0xE); 2298 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2299 } 2300 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2301 } 2302 2303 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2304 if (opcode == Op_AddReductionVI) { 2305 
int vector_len = Assembler::AVX_256bit; 2306 vphaddw(vtmp2, src2, src2, vector_len); 2307 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2308 } else { 2309 vextracti128_high(vtmp2, src2); 2310 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2311 } 2312 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2313 } 2314 2315 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2316 int vector_len = Assembler::AVX_256bit; 2317 vextracti64x4_high(vtmp1, src2); 2318 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2319 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2320 } 2321 2322 #ifdef _LP64 2323 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2324 pshufd(vtmp2, src2, 0xE); 2325 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2326 movdq(vtmp1, src1); 2327 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2328 movdq(dst, vtmp1); 2329 } 2330 2331 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2332 vextracti128_high(vtmp1, src2); 2333 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2334 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2335 } 2336 2337 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2338 vextracti64x4_high(vtmp2, src2); 2339 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2340 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2341 } 2342 2343 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2344 mov64(temp, -1L); 2345 bzhiq(temp, temp, len); 2346 kmovql(dst, temp); 2347 } 2348 #endif // _LP64 2349 2350 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2351 reduce_operation_128(T_FLOAT, opcode, dst, src); 2352 pshufd(vtmp, src, 0x1); 2353 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2354 } 2355 2356 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2357 reduce2F(opcode, dst, src, vtmp); 2358 pshufd(vtmp, src, 0x2); 2359 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2360 pshufd(vtmp, src, 0x3); 2361 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2362 } 2363 2364 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2365 reduce4F(opcode, dst, src, vtmp2); 2366 vextractf128_high(vtmp2, src); 2367 reduce4F(opcode, dst, vtmp2, vtmp1); 2368 } 2369 2370 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2371 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2372 vextracti64x4_high(vtmp1, src); 2373 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2374 } 2375 2376 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2377 pshufd(dst, src, 0x1); 2378 reduce_operation_128(T_FLOAT, opcode, dst, src); 2379 } 2380 2381 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2382 pshufd(vtmp, src, 0xE); 2383 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2384 unorderedReduce2F(opcode, dst, vtmp); 2385 } 2386 2387 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2388 vextractf128_high(vtmp1, src); 2389 
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2390 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2391 } 2392 2393 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2394 vextractf64x4_high(vtmp2, src); 2395 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2396 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2397 } 2398 2399 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2400 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2401 pshufd(vtmp, src, 0xE); 2402 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2403 } 2404 2405 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2406 reduce2D(opcode, dst, src, vtmp2); 2407 vextractf128_high(vtmp2, src); 2408 reduce2D(opcode, dst, vtmp2, vtmp1); 2409 } 2410 2411 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2412 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2413 vextracti64x4_high(vtmp1, src); 2414 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2415 } 2416 2417 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2418 pshufd(dst, src, 0xE); 2419 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2420 } 2421 2422 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2423 vextractf128_high(vtmp, src); 2424 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2425 unorderedReduce2D(opcode, dst, vtmp); 2426 } 2427 2428 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2429 vextractf64x4_high(vtmp2, src); 2430 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2431 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2432 } 2433 2434 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2435 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2436 } 2437 2438 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2439 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2440 } 2441 2442 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2443 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2444 } 2445 2446 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2447 int vec_enc) { 2448 switch(elem_bt) { 2449 case T_INT: 2450 case T_FLOAT: 2451 vmaskmovps(dst, src, mask, vec_enc); 2452 break; 2453 case T_LONG: 2454 case T_DOUBLE: 2455 vmaskmovpd(dst, src, mask, vec_enc); 2456 break; 2457 default: 2458 fatal("Unsupported type %s", type2name(elem_bt)); 2459 break; 2460 } 2461 } 2462 2463 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2464 int vec_enc) { 2465 switch(elem_bt) { 2466 case T_INT: 2467 case T_FLOAT: 2468 vmaskmovps(dst, src, mask, vec_enc); 2469 break; 2470 case T_LONG: 2471 case T_DOUBLE: 2472 vmaskmovpd(dst, src, mask, vec_enc); 2473 break; 2474 default: 2475 fatal("Unsupported type %s", type2name(elem_bt)); 2476 break; 2477 } 2478 } 2479 2480 void C2_MacroAssembler::reduceFloatMinMax(int opcode, 
int vlen, bool is_dst_valid, 2481 XMMRegister dst, XMMRegister src, 2482 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2483 XMMRegister xmm_0, XMMRegister xmm_1) { 2484 const int permconst[] = {1, 14}; 2485 XMMRegister wsrc = src; 2486 XMMRegister wdst = xmm_0; 2487 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2488 2489 int vlen_enc = Assembler::AVX_128bit; 2490 if (vlen == 16) { 2491 vlen_enc = Assembler::AVX_256bit; 2492 } 2493 2494 for (int i = log2(vlen) - 1; i >=0; i--) { 2495 if (i == 0 && !is_dst_valid) { 2496 wdst = dst; 2497 } 2498 if (i == 3) { 2499 vextracti64x4_high(wtmp, wsrc); 2500 } else if (i == 2) { 2501 vextracti128_high(wtmp, wsrc); 2502 } else { // i = [0,1] 2503 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2504 } 2505 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2506 wsrc = wdst; 2507 vlen_enc = Assembler::AVX_128bit; 2508 } 2509 if (is_dst_valid) { 2510 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2511 } 2512 } 2513 2514 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2515 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2516 XMMRegister xmm_0, XMMRegister xmm_1) { 2517 XMMRegister wsrc = src; 2518 XMMRegister wdst = xmm_0; 2519 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2520 int vlen_enc = Assembler::AVX_128bit; 2521 if (vlen == 8) { 2522 vlen_enc = Assembler::AVX_256bit; 2523 } 2524 for (int i = log2(vlen) - 1; i >=0; i--) { 2525 if (i == 0 && !is_dst_valid) { 2526 wdst = dst; 2527 } 2528 if (i == 1) { 2529 vextracti128_high(wtmp, wsrc); 2530 } else if (i == 2) { 2531 vextracti64x4_high(wtmp, wsrc); 2532 } else { 2533 assert(i == 0, "%d", i); 2534 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2535 } 2536 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2537 wsrc = wdst; 2538 vlen_enc = Assembler::AVX_128bit; 2539 } 2540 if (is_dst_valid) { 2541 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2542 } 2543 } 2544 2545 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2546 switch (bt) { 2547 case T_BYTE: pextrb(dst, src, idx); break; 2548 case T_SHORT: pextrw(dst, src, idx); break; 2549 case T_INT: pextrd(dst, src, idx); break; 2550 case T_LONG: pextrq(dst, src, idx); break; 2551 2552 default: 2553 assert(false,"Should not reach here."); 2554 break; 2555 } 2556 } 2557 2558 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2559 int esize = type2aelembytes(typ); 2560 int elem_per_lane = 16/esize; 2561 int lane = elemindex / elem_per_lane; 2562 int eindex = elemindex % elem_per_lane; 2563 2564 if (lane >= 2) { 2565 assert(UseAVX > 2, "required"); 2566 vextractf32x4(dst, src, lane & 3); 2567 return dst; 2568 } else if (lane > 0) { 2569 assert(UseAVX > 0, "required"); 2570 vextractf128(dst, src, lane); 2571 return dst; 2572 } else { 2573 return src; 2574 } 2575 } 2576 2577 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2578 if (typ == T_BYTE) { 2579 movsbl(dst, dst); 2580 } else if (typ == T_SHORT) { 2581 movswl(dst, dst); 2582 } 2583 } 2584 2585 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2586 int esize = type2aelembytes(typ); 2587 int elem_per_lane = 16/esize; 2588 int eindex = elemindex % elem_per_lane; 2589 assert(is_integral_type(typ),"required"); 2590 2591 if (eindex == 0) { 2592 if (typ == 
T_LONG) { 2593 movq(dst, src); 2594 } else { 2595 movdl(dst, src); 2596 movsxl(typ, dst); 2597 } 2598 } else { 2599 extract(typ, dst, src, eindex); 2600 movsxl(typ, dst); 2601 } 2602 } 2603 2604 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2605 int esize = type2aelembytes(typ); 2606 int elem_per_lane = 16/esize; 2607 int eindex = elemindex % elem_per_lane; 2608 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2609 2610 if (eindex == 0) { 2611 movq(dst, src); 2612 } else { 2613 if (typ == T_FLOAT) { 2614 if (UseAVX == 0) { 2615 movdqu(dst, src); 2616 shufps(dst, dst, eindex); 2617 } else { 2618 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2619 } 2620 } else { 2621 if (UseAVX == 0) { 2622 movdqu(dst, src); 2623 psrldq(dst, eindex*esize); 2624 } else { 2625 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2626 } 2627 movq(dst, dst); 2628 } 2629 } 2630 // Zero upper bits 2631 if (typ == T_FLOAT) { 2632 if (UseAVX == 0) { 2633 assert(vtmp != xnoreg, "required."); 2634 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2635 pand(dst, vtmp); 2636 } else { 2637 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2638 } 2639 } 2640 } 2641 2642 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2643 switch(typ) { 2644 case T_BYTE: 2645 case T_BOOLEAN: 2646 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2647 break; 2648 case T_SHORT: 2649 case T_CHAR: 2650 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2651 break; 2652 case T_INT: 2653 case T_FLOAT: 2654 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2655 break; 2656 case T_LONG: 2657 case T_DOUBLE: 2658 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2659 break; 2660 default: 2661 assert(false,"Should not reach here."); 2662 break; 2663 } 2664 } 2665 2666 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2667 assert(rscratch != noreg || always_reachable(src2), "missing"); 2668 2669 switch(typ) { 2670 case T_BOOLEAN: 2671 case T_BYTE: 2672 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2673 break; 2674 case T_CHAR: 2675 case T_SHORT: 2676 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2677 break; 2678 case T_INT: 2679 case T_FLOAT: 2680 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2681 break; 2682 case T_LONG: 2683 case T_DOUBLE: 2684 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2685 break; 2686 default: 2687 assert(false,"Should not reach here."); 2688 break; 2689 } 2690 } 2691 2692 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2693 switch(typ) { 2694 case T_BYTE: 2695 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2696 break; 2697 case T_SHORT: 2698 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2699 break; 2700 case T_INT: 2701 case T_FLOAT: 2702 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2703 break; 2704 case T_LONG: 2705 case T_DOUBLE: 
2706 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2707 break; 2708 default: 2709 assert(false,"Should not reach here."); 2710 break; 2711 } 2712 } 2713 2714 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2715 assert(vlen_in_bytes <= 32, ""); 2716 int esize = type2aelembytes(bt); 2717 if (vlen_in_bytes == 32) { 2718 assert(vtmp == xnoreg, "required."); 2719 if (esize >= 4) { 2720 vtestps(src1, src2, AVX_256bit); 2721 } else { 2722 vptest(src1, src2, AVX_256bit); 2723 } 2724 return; 2725 } 2726 if (vlen_in_bytes < 16) { 2727 // Duplicate the lower part to fill the whole register, 2728 // Don't need to do so for src2 2729 assert(vtmp != xnoreg, "required"); 2730 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2731 pshufd(vtmp, src1, shuffle_imm); 2732 } else { 2733 assert(vtmp == xnoreg, "required"); 2734 vtmp = src1; 2735 } 2736 if (esize >= 4 && VM_Version::supports_avx()) { 2737 vtestps(vtmp, src2, AVX_128bit); 2738 } else { 2739 ptest(vtmp, src2); 2740 } 2741 } 2742 2743 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2744 #ifdef ASSERT 2745 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2746 bool is_bw_supported = VM_Version::supports_avx512bw(); 2747 if (is_bw && !is_bw_supported) { 2748 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2749 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2750 "XMM register should be 0-15"); 2751 } 2752 #endif // ASSERT 2753 switch (elem_bt) { 2754 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2755 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2756 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2757 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2758 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2759 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2760 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2761 } 2762 } 2763 2764 #ifdef _LP64 2765 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2766 assert(UseAVX >= 2, "required"); 2767 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2768 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2769 if ((UseAVX > 2) && 2770 (!is_bw || VM_Version::supports_avx512bw()) && 2771 (!is_vl || VM_Version::supports_avx512vl())) { 2772 switch (elem_bt) { 2773 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2774 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2775 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2776 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2777 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2778 } 2779 } else { 2780 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2781 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2782 switch (elem_bt) { 2783 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2784 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2785 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2786 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2787 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2788 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2789 default: fatal("Unsupported type %s", 
type2name(elem_bt)); return; 2790 } 2791 } 2792 } 2793 #endif 2794 2795 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2796 switch (to_elem_bt) { 2797 case T_SHORT: 2798 vpmovsxbw(dst, src, vlen_enc); 2799 break; 2800 case T_INT: 2801 vpmovsxbd(dst, src, vlen_enc); 2802 break; 2803 case T_FLOAT: 2804 vpmovsxbd(dst, src, vlen_enc); 2805 vcvtdq2ps(dst, dst, vlen_enc); 2806 break; 2807 case T_LONG: 2808 vpmovsxbq(dst, src, vlen_enc); 2809 break; 2810 case T_DOUBLE: { 2811 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2812 vpmovsxbd(dst, src, mid_vlen_enc); 2813 vcvtdq2pd(dst, dst, vlen_enc); 2814 break; 2815 } 2816 default: 2817 fatal("Unsupported type %s", type2name(to_elem_bt)); 2818 break; 2819 } 2820 } 2821 2822 //------------------------------------------------------------------------------------------- 2823 2824 // IndexOf for constant substrings with size >= 8 chars 2825 // which don't need to be loaded through stack. 2826 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2827 Register cnt1, Register cnt2, 2828 int int_cnt2, Register result, 2829 XMMRegister vec, Register tmp, 2830 int ae) { 2831 ShortBranchVerifier sbv(this); 2832 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2833 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2834 2835 // This method uses the pcmpestri instruction with bound registers 2836 // inputs: 2837 // xmm - substring 2838 // rax - substring length (elements count) 2839 // mem - scanned string 2840 // rdx - string length (elements count) 2841 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2842 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2843 // outputs: 2844 // rcx - matched index in string 2845 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2846 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2847 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2848 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2849 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2850 2851 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2852 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2853 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2854 2855 // Note, inline_string_indexOf() generates checks: 2856 // if (substr.count > string.count) return -1; 2857 // if (substr.count == 0) return 0; 2858 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2859 2860 // Load substring. 2861 if (ae == StrIntrinsicNode::UL) { 2862 pmovzxbw(vec, Address(str2, 0)); 2863 } else { 2864 movdqu(vec, Address(str2, 0)); 2865 } 2866 movl(cnt2, int_cnt2); 2867 movptr(result, str1); // string addr 2868 2869 if (int_cnt2 > stride) { 2870 jmpb(SCAN_TO_SUBSTR); 2871 2872 // Reload substr for rescan, this code 2873 // is executed only for large substrings (> 8 chars) 2874 bind(RELOAD_SUBSTR); 2875 if (ae == StrIntrinsicNode::UL) { 2876 pmovzxbw(vec, Address(str2, 0)); 2877 } else { 2878 movdqu(vec, Address(str2, 0)); 2879 } 2880 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2881 2882 bind(RELOAD_STR); 2883 // We came here after the beginning of the substring was 2884 // matched but the rest of it was not so we need to search 2885 // again. Start from the next element after the previous match. 

      // cnt2 is the number of remaining substring elements and
      // cnt1 is the number of remaining string elements when the compare failed.
      // Restored cnt1 = cnt1 - cnt2 + int_cnt2
      subl(cnt1, cnt2);
      addl(cnt1, int_cnt2);
      movl(cnt2, int_cnt2); // Now restore cnt2

      decrementl(cnt1);     // Shift to next element
      cmpl(cnt1, cnt2);
      jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

      addptr(result, (1<<scale1));

    } // (int_cnt2 > 8)

    // Scan string for start of substr in 16-byte vectors
    bind(SCAN_TO_SUBSTR);
    pcmpestri(vec, Address(result, 0), mode);
    jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
    subl(cnt1, stride);
    jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
    addptr(result, 16);
    jmpb(SCAN_TO_SUBSTR);

    // Found a potential substr
    bind(FOUND_CANDIDATE);
    // Matched whole vector if first element matched (tmp(rcx) == 0).
    if (int_cnt2 == stride) {
      jccb(Assembler::overflow, RET_FOUND);    // OF == 1
    } else { // int_cnt2 > 8
      jccb(Assembler::overflow, FOUND_SUBSTR);
    }
    // After pcmpestri tmp(rcx) contains matched element index
    // Compute start addr of substr
    lea(result, Address(result, tmp, scale1));

    // Make sure string is still long enough
    subl(cnt1, tmp);
    cmpl(cnt1, cnt2);
    if (int_cnt2 == stride) {
      jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
    } else { // int_cnt2 > 8
      jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
    }
    // Left less than substring.

    bind(RET_NOT_FOUND);
    movl(result, -1);
    jmp(EXIT);

    if (int_cnt2 > stride) {
      // This code is optimized for the case when the whole substring
      // is matched if its head is matched.
      bind(MATCH_SUBSTR_HEAD);
      pcmpestri(vec, Address(result, 0), mode);
      // Reload only the string if it does not match
      jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

      Label CONT_SCAN_SUBSTR;
      // Compare the rest of the substring (> 8 chars).
      bind(FOUND_SUBSTR);
      // First 8 chars are already matched.
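      // Clarifying note on the index arithmetic below (added for illustration):
      // cnt2 currently holds the full substring length; the next two
      // instructions turn it into stride - cnt2, a negative value, so that
      // cnt2 + int_cnt2 indexes the first element that still has to be
      // compared. The tail is then scanned in stride-sized chunks until
      // cnt2 becomes non-negative.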
2951 negptr(cnt2); 2952 addptr(cnt2, stride); 2953 2954 bind(SCAN_SUBSTR); 2955 subl(cnt1, stride); 2956 cmpl(cnt2, -stride); // Do not read beyond substring 2957 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2958 // Back-up strings to avoid reading beyond substring: 2959 // cnt1 = cnt1 - cnt2 + 8 2960 addl(cnt1, cnt2); // cnt2 is negative 2961 addl(cnt1, stride); 2962 movl(cnt2, stride); negptr(cnt2); 2963 bind(CONT_SCAN_SUBSTR); 2964 if (int_cnt2 < (int)G) { 2965 int tail_off1 = int_cnt2<<scale1; 2966 int tail_off2 = int_cnt2<<scale2; 2967 if (ae == StrIntrinsicNode::UL) { 2968 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2969 } else { 2970 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2971 } 2972 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2973 } else { 2974 // calculate index in register to avoid integer overflow (int_cnt2*2) 2975 movl(tmp, int_cnt2); 2976 addptr(tmp, cnt2); 2977 if (ae == StrIntrinsicNode::UL) { 2978 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2979 } else { 2980 movdqu(vec, Address(str2, tmp, scale2, 0)); 2981 } 2982 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2983 } 2984 // Need to reload strings pointers if not matched whole vector 2985 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2986 addptr(cnt2, stride); 2987 jcc(Assembler::negative, SCAN_SUBSTR); 2988 // Fall through if found full substring 2989 2990 } // (int_cnt2 > 8) 2991 2992 bind(RET_FOUND); 2993 // Found result if we matched full small substring. 2994 // Compute substr offset 2995 subptr(result, str1); 2996 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2997 shrl(result, 1); // index 2998 } 2999 bind(EXIT); 3000 3001 } // string_indexofC8 3002 3003 // Small strings are loaded through stack if they cross page boundary. 3004 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3005 Register cnt1, Register cnt2, 3006 int int_cnt2, Register result, 3007 XMMRegister vec, Register tmp, 3008 int ae) { 3009 ShortBranchVerifier sbv(this); 3010 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3011 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3012 3013 // 3014 // int_cnt2 is length of small (< 8 chars) constant substring 3015 // or (-1) for non constant substring in which case its length 3016 // is in cnt2 register. 3017 // 3018 // Note, inline_string_indexOf() generates checks: 3019 // if (substr.count > string.count) return -1; 3020 // if (substr.count == 0) return 0; 3021 // 3022 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3023 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3024 // This method uses the pcmpestri instruction with bound registers 3025 // inputs: 3026 // xmm - substring 3027 // rax - substring length (elements count) 3028 // mem - scanned string 3029 // rdx - string length (elements count) 3030 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3031 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3032 // outputs: 3033 // rcx - matched index in string 3034 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3035 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3036 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3037 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3038 3039 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3040 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3041 FOUND_CANDIDATE; 3042 3043 { //======================================================== 3044 // We don't know where these strings are located 3045 // and we can't read beyond them. Load them through stack. 3046 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3047 3048 movptr(tmp, rsp); // save old SP 3049 3050 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3051 if (int_cnt2 == (1>>scale2)) { // One byte 3052 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3053 load_unsigned_byte(result, Address(str2, 0)); 3054 movdl(vec, result); // move 32 bits 3055 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3056 // Not enough header space in 32-bit VM: 12+3 = 15. 3057 movl(result, Address(str2, -1)); 3058 shrl(result, 8); 3059 movdl(vec, result); // move 32 bits 3060 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3061 load_unsigned_short(result, Address(str2, 0)); 3062 movdl(vec, result); // move 32 bits 3063 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3064 movdl(vec, Address(str2, 0)); // move 32 bits 3065 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3066 movq(vec, Address(str2, 0)); // move 64 bits 3067 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3068 // Array header size is 12 bytes in 32-bit VM 3069 // + 6 bytes for 3 chars == 18 bytes, 3070 // enough space to load vec and shift. 3071 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3072 if (ae == StrIntrinsicNode::UL) { 3073 int tail_off = int_cnt2-8; 3074 pmovzxbw(vec, Address(str2, tail_off)); 3075 psrldq(vec, -2*tail_off); 3076 } 3077 else { 3078 int tail_off = int_cnt2*(1<<scale2); 3079 movdqu(vec, Address(str2, tail_off-16)); 3080 psrldq(vec, 16-tail_off); 3081 } 3082 } 3083 } else { // not constant substring 3084 cmpl(cnt2, stride); 3085 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3086 3087 // We can read beyond string if srt+16 does not cross page boundary 3088 // since heaps are aligned and mapped by pages. 3089 assert(os::vm_page_size() < (int)G, "default page should be small"); 3090 movl(result, str2); // We need only low 32 bits 3091 andl(result, ((int)os::vm_page_size()-1)); 3092 cmpl(result, ((int)os::vm_page_size()-16)); 3093 jccb(Assembler::belowEqual, CHECK_STR); 3094 3095 // Move small strings to stack to allow load 16 bytes into vec. 3096 subptr(rsp, 16); 3097 int stk_offset = wordSize-(1<<scale2); 3098 push(cnt2); 3099 3100 bind(COPY_SUBSTR); 3101 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3102 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3103 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3104 } else if (ae == StrIntrinsicNode::UU) { 3105 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3106 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3107 } 3108 decrement(cnt2); 3109 jccb(Assembler::notZero, COPY_SUBSTR); 3110 3111 pop(cnt2); 3112 movptr(str2, rsp); // New substring address 3113 } // non constant 3114 3115 bind(CHECK_STR); 3116 cmpl(cnt1, stride); 3117 jccb(Assembler::aboveEqual, BIG_STRINGS); 3118 3119 // Check cross page boundary. 
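    // The next three instructions evaluate the same predicate for str1 that
    // was used for str2 above: a 16-byte load starting at address p stays
    // inside p's (aligned) page iff  (p & (page_size - 1)) <= page_size - 16.
    // A hedged C++ sketch of that predicate (names are illustrative only):
    //
    //   static bool can_read_16_bytes(const void* p, int page_size) {
    //     // the low bits of the address are the offset within the page
    //     return (((uintptr_t)p) & (page_size - 1)) <= (uintptr_t)(page_size - 16);
    //   }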
3120 movl(result, str1); // We need only low 32 bits 3121 andl(result, ((int)os::vm_page_size()-1)); 3122 cmpl(result, ((int)os::vm_page_size()-16)); 3123 jccb(Assembler::belowEqual, BIG_STRINGS); 3124 3125 subptr(rsp, 16); 3126 int stk_offset = -(1<<scale1); 3127 if (int_cnt2 < 0) { // not constant 3128 push(cnt2); 3129 stk_offset += wordSize; 3130 } 3131 movl(cnt2, cnt1); 3132 3133 bind(COPY_STR); 3134 if (ae == StrIntrinsicNode::LL) { 3135 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3136 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3137 } else { 3138 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3139 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3140 } 3141 decrement(cnt2); 3142 jccb(Assembler::notZero, COPY_STR); 3143 3144 if (int_cnt2 < 0) { // not constant 3145 pop(cnt2); 3146 } 3147 movptr(str1, rsp); // New string address 3148 3149 bind(BIG_STRINGS); 3150 // Load substring. 3151 if (int_cnt2 < 0) { // -1 3152 if (ae == StrIntrinsicNode::UL) { 3153 pmovzxbw(vec, Address(str2, 0)); 3154 } else { 3155 movdqu(vec, Address(str2, 0)); 3156 } 3157 push(cnt2); // substr count 3158 push(str2); // substr addr 3159 push(str1); // string addr 3160 } else { 3161 // Small (< 8 chars) constant substrings are loaded already. 3162 movl(cnt2, int_cnt2); 3163 } 3164 push(tmp); // original SP 3165 3166 } // Finished loading 3167 3168 //======================================================== 3169 // Start search 3170 // 3171 3172 movptr(result, str1); // string addr 3173 3174 if (int_cnt2 < 0) { // Only for non constant substring 3175 jmpb(SCAN_TO_SUBSTR); 3176 3177 // SP saved at sp+0 3178 // String saved at sp+1*wordSize 3179 // Substr saved at sp+2*wordSize 3180 // Substr count saved at sp+3*wordSize 3181 3182 // Reload substr for rescan, this code 3183 // is executed only for large substrings (> 8 chars) 3184 bind(RELOAD_SUBSTR); 3185 movptr(str2, Address(rsp, 2*wordSize)); 3186 movl(cnt2, Address(rsp, 3*wordSize)); 3187 if (ae == StrIntrinsicNode::UL) { 3188 pmovzxbw(vec, Address(str2, 0)); 3189 } else { 3190 movdqu(vec, Address(str2, 0)); 3191 } 3192 // We came here after the beginning of the substring was 3193 // matched but the rest of it was not so we need to search 3194 // again. Start from the next element after the previous match. 3195 subptr(str1, result); // Restore counter 3196 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3197 shrl(str1, 1); 3198 } 3199 addl(cnt1, str1); 3200 decrementl(cnt1); // Shift to next element 3201 cmpl(cnt1, cnt2); 3202 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3203 3204 addptr(result, (1<<scale1)); 3205 } // non constant 3206 3207 // Scan string for start of substr in 16-byte vectors 3208 bind(SCAN_TO_SUBSTR); 3209 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3210 pcmpestri(vec, Address(result, 0), mode); 3211 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3212 subl(cnt1, stride); 3213 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3214 cmpl(cnt1, cnt2); 3215 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3216 addptr(result, 16); 3217 3218 bind(ADJUST_STR); 3219 cmpl(cnt1, stride); // Do not read beyond string 3220 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3221 // Back-up string to avoid reading beyond string. 
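  // The two instructions below implement that back-up.  Roughly (sketch only):
  //
  //   //   result = result + cnt1*elem_size - 16;   // window now ends at the
  //   //   cnt1   = stride;                          // last remaining element
  //
  // so the 16-byte pcmpestri load ends exactly at the string end instead of
  // reading past it.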
3222 lea(result, Address(result, cnt1, scale1, -16)); 3223 movl(cnt1, stride); 3224 jmpb(SCAN_TO_SUBSTR); 3225 3226 // Found a potential substr 3227 bind(FOUND_CANDIDATE); 3228 // After pcmpestri tmp(rcx) contains matched element index 3229 3230 // Make sure string is still long enough 3231 subl(cnt1, tmp); 3232 cmpl(cnt1, cnt2); 3233 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3234 // Left less then substring. 3235 3236 bind(RET_NOT_FOUND); 3237 movl(result, -1); 3238 jmp(CLEANUP); 3239 3240 bind(FOUND_SUBSTR); 3241 // Compute start addr of substr 3242 lea(result, Address(result, tmp, scale1)); 3243 if (int_cnt2 > 0) { // Constant substring 3244 // Repeat search for small substring (< 8 chars) 3245 // from new point without reloading substring. 3246 // Have to check that we don't read beyond string. 3247 cmpl(tmp, stride-int_cnt2); 3248 jccb(Assembler::greater, ADJUST_STR); 3249 // Fall through if matched whole substring. 3250 } else { // non constant 3251 assert(int_cnt2 == -1, "should be != 0"); 3252 3253 addl(tmp, cnt2); 3254 // Found result if we matched whole substring. 3255 cmpl(tmp, stride); 3256 jcc(Assembler::lessEqual, RET_FOUND); 3257 3258 // Repeat search for small substring (<= 8 chars) 3259 // from new point 'str1' without reloading substring. 3260 cmpl(cnt2, stride); 3261 // Have to check that we don't read beyond string. 3262 jccb(Assembler::lessEqual, ADJUST_STR); 3263 3264 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3265 // Compare the rest of substring (> 8 chars). 3266 movptr(str1, result); 3267 3268 cmpl(tmp, cnt2); 3269 // First 8 chars are already matched. 3270 jccb(Assembler::equal, CHECK_NEXT); 3271 3272 bind(SCAN_SUBSTR); 3273 pcmpestri(vec, Address(str1, 0), mode); 3274 // Need to reload strings pointers if not matched whole vector 3275 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3276 3277 bind(CHECK_NEXT); 3278 subl(cnt2, stride); 3279 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3280 addptr(str1, 16); 3281 if (ae == StrIntrinsicNode::UL) { 3282 addptr(str2, 8); 3283 } else { 3284 addptr(str2, 16); 3285 } 3286 subl(cnt1, stride); 3287 cmpl(cnt2, stride); // Do not read beyond substring 3288 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3289 // Back-up strings to avoid reading beyond substring. 
3290 3291 if (ae == StrIntrinsicNode::UL) { 3292 lea(str2, Address(str2, cnt2, scale2, -8)); 3293 lea(str1, Address(str1, cnt2, scale1, -16)); 3294 } else { 3295 lea(str2, Address(str2, cnt2, scale2, -16)); 3296 lea(str1, Address(str1, cnt2, scale1, -16)); 3297 } 3298 subl(cnt1, cnt2); 3299 movl(cnt2, stride); 3300 addl(cnt1, stride); 3301 bind(CONT_SCAN_SUBSTR); 3302 if (ae == StrIntrinsicNode::UL) { 3303 pmovzxbw(vec, Address(str2, 0)); 3304 } else { 3305 movdqu(vec, Address(str2, 0)); 3306 } 3307 jmp(SCAN_SUBSTR); 3308 3309 bind(RET_FOUND_LONG); 3310 movptr(str1, Address(rsp, wordSize)); 3311 } // non constant 3312 3313 bind(RET_FOUND); 3314 // Compute substr offset 3315 subptr(result, str1); 3316 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3317 shrl(result, 1); // index 3318 } 3319 bind(CLEANUP); 3320 pop(rsp); // restore SP 3321 3322 } // string_indexof 3323 3324 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3325 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3326 ShortBranchVerifier sbv(this); 3327 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3328 3329 int stride = 8; 3330 3331 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3332 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3333 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3334 FOUND_SEQ_CHAR, DONE_LABEL; 3335 3336 movptr(result, str1); 3337 if (UseAVX >= 2) { 3338 cmpl(cnt1, stride); 3339 jcc(Assembler::less, SCAN_TO_CHAR); 3340 cmpl(cnt1, 2*stride); 3341 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3342 movdl(vec1, ch); 3343 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3344 vpxor(vec2, vec2); 3345 movl(tmp, cnt1); 3346 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3347 andl(cnt1,0x0000000F); //tail count (in chars) 3348 3349 bind(SCAN_TO_16_CHAR_LOOP); 3350 vmovdqu(vec3, Address(result, 0)); 3351 vpcmpeqw(vec3, vec3, vec1, 1); 3352 vptest(vec2, vec3); 3353 jcc(Assembler::carryClear, FOUND_CHAR); 3354 addptr(result, 32); 3355 subl(tmp, 2*stride); 3356 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3357 jmp(SCAN_TO_8_CHAR); 3358 bind(SCAN_TO_8_CHAR_INIT); 3359 movdl(vec1, ch); 3360 pshuflw(vec1, vec1, 0x00); 3361 pshufd(vec1, vec1, 0); 3362 pxor(vec2, vec2); 3363 } 3364 bind(SCAN_TO_8_CHAR); 3365 cmpl(cnt1, stride); 3366 jcc(Assembler::less, SCAN_TO_CHAR); 3367 if (UseAVX < 2) { 3368 movdl(vec1, ch); 3369 pshuflw(vec1, vec1, 0x00); 3370 pshufd(vec1, vec1, 0); 3371 pxor(vec2, vec2); 3372 } 3373 movl(tmp, cnt1); 3374 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3375 andl(cnt1,0x00000007); //tail count (in chars) 3376 3377 bind(SCAN_TO_8_CHAR_LOOP); 3378 movdqu(vec3, Address(result, 0)); 3379 pcmpeqw(vec3, vec1); 3380 ptest(vec2, vec3); 3381 jcc(Assembler::carryClear, FOUND_CHAR); 3382 addptr(result, 16); 3383 subl(tmp, stride); 3384 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3385 bind(SCAN_TO_CHAR); 3386 testl(cnt1, cnt1); 3387 jcc(Assembler::zero, RET_NOT_FOUND); 3388 bind(SCAN_TO_CHAR_LOOP); 3389 load_unsigned_short(tmp, Address(result, 0)); 3390 cmpl(ch, tmp); 3391 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3392 addptr(result, 2); 3393 subl(cnt1, 1); 3394 jccb(Assembler::zero, RET_NOT_FOUND); 3395 jmp(SCAN_TO_CHAR_LOOP); 3396 3397 bind(RET_NOT_FOUND); 3398 movl(result, -1); 3399 jmpb(DONE_LABEL); 3400 3401 bind(FOUND_CHAR); 3402 if (UseAVX >= 2) { 3403 vpmovmskb(tmp, vec3); 3404 } else { 3405 pmovmskb(tmp, vec3); 3406 } 3407 bsfl(ch, tmp); 3408 addptr(result, ch); 3409 3410 bind(FOUND_SEQ_CHAR); 3411 
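  // Here 'result' holds the byte address of the matching element; the
  // subtraction and shift that follow convert it into a char index,
  // (result - str1) / sizeof(jchar).  A hedged scalar reference of what the
  // whole routine computes (illustrative only, not the emitted code):
  //
  //   static int indexof_char_ref(const jchar* s, int len, jchar ch) {
  //     for (int i = 0; i < len; i++) {
  //       if (s[i] == ch) {
  //         return i;
  //       }
  //     }
  //     return -1;
  //   }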
subptr(result, str1); 3412 shrl(result, 1); 3413 3414 bind(DONE_LABEL); 3415 } // string_indexof_char 3416 3417 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3418 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3419 ShortBranchVerifier sbv(this); 3420 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3421 3422 int stride = 16; 3423 3424 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3425 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3426 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3427 FOUND_SEQ_CHAR, DONE_LABEL; 3428 3429 movptr(result, str1); 3430 if (UseAVX >= 2) { 3431 cmpl(cnt1, stride); 3432 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3433 cmpl(cnt1, stride*2); 3434 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3435 movdl(vec1, ch); 3436 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3437 vpxor(vec2, vec2); 3438 movl(tmp, cnt1); 3439 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3440 andl(cnt1,0x0000001F); //tail count (in chars) 3441 3442 bind(SCAN_TO_32_CHAR_LOOP); 3443 vmovdqu(vec3, Address(result, 0)); 3444 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3445 vptest(vec2, vec3); 3446 jcc(Assembler::carryClear, FOUND_CHAR); 3447 addptr(result, 32); 3448 subl(tmp, stride*2); 3449 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3450 jmp(SCAN_TO_16_CHAR); 3451 3452 bind(SCAN_TO_16_CHAR_INIT); 3453 movdl(vec1, ch); 3454 pxor(vec2, vec2); 3455 pshufb(vec1, vec2); 3456 } 3457 3458 bind(SCAN_TO_16_CHAR); 3459 cmpl(cnt1, stride); 3460 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3461 if (UseAVX < 2) { 3462 movdl(vec1, ch); 3463 pxor(vec2, vec2); 3464 pshufb(vec1, vec2); 3465 } 3466 movl(tmp, cnt1); 3467 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3468 andl(cnt1,0x0000000F); //tail count (in bytes) 3469 3470 bind(SCAN_TO_16_CHAR_LOOP); 3471 movdqu(vec3, Address(result, 0)); 3472 pcmpeqb(vec3, vec1); 3473 ptest(vec2, vec3); 3474 jcc(Assembler::carryClear, FOUND_CHAR); 3475 addptr(result, 16); 3476 subl(tmp, stride); 3477 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
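  // The vector loops above consume the length rounded down to a multiple of
  // 16 bytes (32 with AVX2); the leftover bytes are scanned one at a time
  // below.  Roughly (sketch of the split, not the emitted code):
  //
  //   //   int vec_bytes = len & ~0xF;   // consumed by the loop(s) above
  //   //   int tail      = len &  0xF;   // handled byte-by-byte below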
3478 3479 bind(SCAN_TO_CHAR_INIT); 3480 testl(cnt1, cnt1); 3481 jcc(Assembler::zero, RET_NOT_FOUND); 3482 bind(SCAN_TO_CHAR_LOOP); 3483 load_unsigned_byte(tmp, Address(result, 0)); 3484 cmpl(ch, tmp); 3485 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3486 addptr(result, 1); 3487 subl(cnt1, 1); 3488 jccb(Assembler::zero, RET_NOT_FOUND); 3489 jmp(SCAN_TO_CHAR_LOOP); 3490 3491 bind(RET_NOT_FOUND); 3492 movl(result, -1); 3493 jmpb(DONE_LABEL); 3494 3495 bind(FOUND_CHAR); 3496 if (UseAVX >= 2) { 3497 vpmovmskb(tmp, vec3); 3498 } else { 3499 pmovmskb(tmp, vec3); 3500 } 3501 bsfl(ch, tmp); 3502 addptr(result, ch); 3503 3504 bind(FOUND_SEQ_CHAR); 3505 subptr(result, str1); 3506 3507 bind(DONE_LABEL); 3508 } // stringL_indexof_char 3509 3510 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3511 switch (eltype) { 3512 case T_BOOLEAN: return sizeof(jboolean); 3513 case T_BYTE: return sizeof(jbyte); 3514 case T_SHORT: return sizeof(jshort); 3515 case T_CHAR: return sizeof(jchar); 3516 case T_INT: return sizeof(jint); 3517 default: 3518 ShouldNotReachHere(); 3519 return -1; 3520 } 3521 } 3522 3523 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3524 switch (eltype) { 3525 // T_BOOLEAN used as surrogate for unsigned byte 3526 case T_BOOLEAN: movzbl(dst, src); break; 3527 case T_BYTE: movsbl(dst, src); break; 3528 case T_SHORT: movswl(dst, src); break; 3529 case T_CHAR: movzwl(dst, src); break; 3530 case T_INT: movl(dst, src); break; 3531 default: 3532 ShouldNotReachHere(); 3533 } 3534 } 3535 3536 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3537 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3538 } 3539 3540 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3541 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3542 } 3543 3544 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3545 const int vlen = Assembler::AVX_256bit; 3546 switch (eltype) { 3547 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3548 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3549 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3550 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3551 case T_INT: 3552 // do nothing 3553 break; 3554 default: 3555 ShouldNotReachHere(); 3556 } 3557 } 3558 3559 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3560 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3561 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3562 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3563 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3564 BasicType eltype) { 3565 ShortBranchVerifier sbv(this); 3566 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3567 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3568 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3569 3570 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3571 SHORT_UNROLLED_LOOP_EXIT, 3572 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3573 UNROLLED_VECTOR_LOOP_BEGIN, 3574 END; 3575 switch (eltype) { 3576 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3577 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3578 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3579 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3580 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3581 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3582 } 3583 3584 // For "renaming" for readibility of the code 3585 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3586 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3587 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3588 3589 const int elsize = arrays_hashcode_elsize(eltype); 3590 3591 /* 3592 if (cnt1 >= 2) { 3593 if (cnt1 >= 32) { 3594 UNROLLED VECTOR LOOP 3595 } 3596 UNROLLED SCALAR LOOP 3597 } 3598 SINGLE SCALAR 3599 */ 3600 3601 cmpl(cnt1, 32); 3602 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3603 3604 // cnt1 >= 32 && generate_vectorized_loop 3605 xorl(index, index); 3606 3607 // vresult = IntVector.zero(I256); 3608 for (int idx = 0; idx < 4; idx++) { 3609 vpxor(vresult[idx], vresult[idx]); 3610 } 3611 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3612 Register bound = tmp2; 3613 Register next = tmp3; 3614 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3615 movl(next, Address(tmp2, 0)); 3616 movdl(vnext, next); 3617 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3618 3619 // index = 0; 3620 // bound = cnt1 & ~(32 - 1); 3621 movl(bound, cnt1); 3622 andl(bound, ~(32 - 1)); 3623 // for (; index < bound; index += 32) { 3624 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3625 // result *= next; 3626 imull(result, next); 3627 // loop fission to upfront the cost of fetching from memory, OOO execution 3628 // can then hopefully do a better job of prefetching 3629 for (int idx = 0; idx < 4; idx++) { 3630 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3631 } 3632 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3633 for (int idx = 0; idx < 4; idx++) { 3634 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3635 arrays_hashcode_elvcast(vtmp[idx], eltype); 3636 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3637 } 3638 // index += 32; 3639 addl(index, 32); 3640 // index < bound; 3641 cmpl(index, bound); 3642 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3643 // } 3644 3645 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3646 subl(cnt1, bound); 3647 // release bound 3648 3649 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3650 for (int idx = 0; idx < 4; idx++) { 3651 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3652 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3653 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3654 } 3655 // result += vresult.reduceLanes(ADD); 3656 for (int idx = 0; idx < 4; idx++) { 3657 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3658 } 3659 3660 // } else if (cnt1 < 32) { 3661 3662 bind(SHORT_UNROLLED_BEGIN); 3663 // int i = 1; 3664 movl(index, 1); 3665 cmpl(index, cnt1); 3666 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3667 3668 // for (; i < cnt1 ; i += 2) { 3669 bind(SHORT_UNROLLED_LOOP_BEGIN); 3670 movl(tmp3, 961); 3671 imull(result, tmp3); 3672 
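    // 961 == 31 * 31: each iteration of this unrolled loop folds two elements,
    // result = 31*31*result + 31*a[i-1] + a[i], with 31*x computed as
    // (x << 5) - x.  This matches the usual scalar hash, sketched here with
    // jint elements for illustration (the real element type follows 'eltype'):
    //
    //   static jint hashcode_ref(const jint* a, int cnt, jint h) {
    //     for (int i = 0; i < cnt; i++) {
    //       h = 31 * h + a[i];
    //     }
    //     return h;
    //   }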
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3673 movl(tmp3, tmp2); 3674 shll(tmp3, 5); 3675 subl(tmp3, tmp2); 3676 addl(result, tmp3); 3677 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3678 addl(result, tmp3); 3679 addl(index, 2); 3680 cmpl(index, cnt1); 3681 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3682 3683 // } 3684 // if (i >= cnt1) { 3685 bind(SHORT_UNROLLED_LOOP_EXIT); 3686 jccb(Assembler::greater, END); 3687 movl(tmp2, result); 3688 shll(result, 5); 3689 subl(result, tmp2); 3690 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3691 addl(result, tmp3); 3692 // } 3693 bind(END); 3694 3695 BLOCK_COMMENT("} // arrays_hashcode"); 3696 3697 } // arrays_hashcode 3698 3699 // helper function for string_compare 3700 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3701 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3702 Address::ScaleFactor scale2, Register index, int ae) { 3703 if (ae == StrIntrinsicNode::LL) { 3704 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3705 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3706 } else if (ae == StrIntrinsicNode::UU) { 3707 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3708 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3709 } else { 3710 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3711 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3712 } 3713 } 3714 3715 // Compare strings, used for char[] and byte[]. 3716 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3717 Register cnt1, Register cnt2, Register result, 3718 XMMRegister vec1, int ae, KRegister mask) { 3719 ShortBranchVerifier sbv(this); 3720 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3721 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3722 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3723 int stride2x2 = 0x40; 3724 Address::ScaleFactor scale = Address::no_scale; 3725 Address::ScaleFactor scale1 = Address::no_scale; 3726 Address::ScaleFactor scale2 = Address::no_scale; 3727 3728 if (ae != StrIntrinsicNode::LL) { 3729 stride2x2 = 0x20; 3730 } 3731 3732 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3733 shrl(cnt2, 1); 3734 } 3735 // Compute the minimum of the string lengths and the 3736 // difference of the string lengths (stack). 3737 // Do the conditional move stuff 3738 movl(result, cnt1); 3739 subl(cnt1, cnt2); 3740 push(cnt1); 3741 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3742 3743 // Is the minimum length zero? 
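  // What follows computes, roughly, the scalar reference below: compare
  // element-wise up to the shorter length and return the difference of the
  // first mismatching pair, otherwise the difference of the lengths (the
  // value pushed just above and popped at LENGTH_DIFF_LABEL).  Sketched for
  // the same-encoding cases; the mixed-encoding cases additionally widen the
  // Latin-1 bytes to chars, and the UL case negates the result at the end:
  //
  //   static int string_compare_ref(const jchar* a, int la,
  //                                 const jchar* b, int lb) {
  //     int lmin = la < lb ? la : lb;
  //     for (int i = 0; i < lmin; i++) {
  //       if (a[i] != b[i]) {
  //         return a[i] - b[i];
  //       }
  //     }
  //     return la - lb;
  //   }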
3744 testl(cnt2, cnt2); 3745 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3746 if (ae == StrIntrinsicNode::LL) { 3747 // Load first bytes 3748 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3749 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3750 } else if (ae == StrIntrinsicNode::UU) { 3751 // Load first characters 3752 load_unsigned_short(result, Address(str1, 0)); 3753 load_unsigned_short(cnt1, Address(str2, 0)); 3754 } else { 3755 load_unsigned_byte(result, Address(str1, 0)); 3756 load_unsigned_short(cnt1, Address(str2, 0)); 3757 } 3758 subl(result, cnt1); 3759 jcc(Assembler::notZero, POP_LABEL); 3760 3761 if (ae == StrIntrinsicNode::UU) { 3762 // Divide length by 2 to get number of chars 3763 shrl(cnt2, 1); 3764 } 3765 cmpl(cnt2, 1); 3766 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3767 3768 // Check if the strings start at the same location and setup scale and stride 3769 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3770 cmpptr(str1, str2); 3771 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3772 if (ae == StrIntrinsicNode::LL) { 3773 scale = Address::times_1; 3774 stride = 16; 3775 } else { 3776 scale = Address::times_2; 3777 stride = 8; 3778 } 3779 } else { 3780 scale1 = Address::times_1; 3781 scale2 = Address::times_2; 3782 // scale not used 3783 stride = 8; 3784 } 3785 3786 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3787 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3788 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3789 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3790 Label COMPARE_TAIL_LONG; 3791 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3792 3793 int pcmpmask = 0x19; 3794 if (ae == StrIntrinsicNode::LL) { 3795 pcmpmask &= ~0x01; 3796 } 3797 3798 // Setup to compare 16-chars (32-bytes) vectors, 3799 // start from first character again because it has aligned address. 3800 if (ae == StrIntrinsicNode::LL) { 3801 stride2 = 32; 3802 } else { 3803 stride2 = 16; 3804 } 3805 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3806 adr_stride = stride << scale; 3807 } else { 3808 adr_stride1 = 8; //stride << scale1; 3809 adr_stride2 = 16; //stride << scale2; 3810 } 3811 3812 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3813 // rax and rdx are used by pcmpestri as elements counters 3814 movl(result, cnt2); 3815 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3816 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3817 3818 // fast path : compare first 2 8-char vectors. 
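    // pcmpmask, summarized here per the SSE4.2 PCMPESTRI immediate layout:
    // bits[1:0] = 01 unsigned words (cleared above to 00 = unsigned bytes for
    // LL), bits[3:2] = 10 "equal each" (element-wise compare), bits[5:4] = 01
    // negative polarity so set result bits mark positions that differ, and
    // bit 6 = 0 reports the least significant such index in rcx.  For the
    // full vectors compared here, CF == 1 therefore means "some element
    // differed" and rcx holds the index of the first difference.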
3819 bind(COMPARE_16_CHARS); 3820 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3821 movdqu(vec1, Address(str1, 0)); 3822 } else { 3823 pmovzxbw(vec1, Address(str1, 0)); 3824 } 3825 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3826 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3827 3828 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3829 movdqu(vec1, Address(str1, adr_stride)); 3830 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3831 } else { 3832 pmovzxbw(vec1, Address(str1, adr_stride1)); 3833 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3834 } 3835 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3836 addl(cnt1, stride); 3837 3838 // Compare the characters at index in cnt1 3839 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3840 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3841 subl(result, cnt2); 3842 jmp(POP_LABEL); 3843 3844 // Setup the registers to start vector comparison loop 3845 bind(COMPARE_WIDE_VECTORS); 3846 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3847 lea(str1, Address(str1, result, scale)); 3848 lea(str2, Address(str2, result, scale)); 3849 } else { 3850 lea(str1, Address(str1, result, scale1)); 3851 lea(str2, Address(str2, result, scale2)); 3852 } 3853 subl(result, stride2); 3854 subl(cnt2, stride2); 3855 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3856 negptr(result); 3857 3858 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3859 bind(COMPARE_WIDE_VECTORS_LOOP); 3860 3861 #ifdef _LP64 3862 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3863 cmpl(cnt2, stride2x2); 3864 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3865 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3866 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3867 3868 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3869 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3870 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3871 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3872 } else { 3873 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3874 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3875 } 3876 kortestql(mask, mask); 3877 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3878 addptr(result, stride2x2); // update since we already compared at this addr 3879 subl(cnt2, stride2x2); // and sub the size too 3880 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3881 3882 vpxor(vec1, vec1); 3883 jmpb(COMPARE_WIDE_TAIL); 3884 }//if (VM_Version::supports_avx512vlbw()) 3885 #endif // _LP64 3886 3887 3888 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3889 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3890 vmovdqu(vec1, Address(str1, result, scale)); 3891 vpxor(vec1, Address(str2, result, scale)); 3892 } else { 3893 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3894 vpxor(vec1, Address(str2, result, scale2)); 3895 } 3896 vptest(vec1, vec1); 3897 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3898 addptr(result, stride2); 3899 subl(cnt2, stride2); 3900 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3901 // clean upper bits of YMM registers 
3902 vpxor(vec1, vec1); 3903 3904 // compare wide vectors tail 3905 bind(COMPARE_WIDE_TAIL); 3906 testptr(result, result); 3907 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3908 3909 movl(result, stride2); 3910 movl(cnt2, result); 3911 negptr(result); 3912 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3913 3914 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3915 bind(VECTOR_NOT_EQUAL); 3916 // clean upper bits of YMM registers 3917 vpxor(vec1, vec1); 3918 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3919 lea(str1, Address(str1, result, scale)); 3920 lea(str2, Address(str2, result, scale)); 3921 } else { 3922 lea(str1, Address(str1, result, scale1)); 3923 lea(str2, Address(str2, result, scale2)); 3924 } 3925 jmp(COMPARE_16_CHARS); 3926 3927 // Compare tail chars, length between 1 to 15 chars 3928 bind(COMPARE_TAIL_LONG); 3929 movl(cnt2, result); 3930 cmpl(cnt2, stride); 3931 jcc(Assembler::less, COMPARE_SMALL_STR); 3932 3933 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3934 movdqu(vec1, Address(str1, 0)); 3935 } else { 3936 pmovzxbw(vec1, Address(str1, 0)); 3937 } 3938 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3939 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3940 subptr(cnt2, stride); 3941 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3942 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3943 lea(str1, Address(str1, result, scale)); 3944 lea(str2, Address(str2, result, scale)); 3945 } else { 3946 lea(str1, Address(str1, result, scale1)); 3947 lea(str2, Address(str2, result, scale2)); 3948 } 3949 negptr(cnt2); 3950 jmpb(WHILE_HEAD_LABEL); 3951 3952 bind(COMPARE_SMALL_STR); 3953 } else if (UseSSE42Intrinsics) { 3954 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3955 int pcmpmask = 0x19; 3956 // Setup to compare 8-char (16-byte) vectors, 3957 // start from first character again because it has aligned address. 
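    // The SSE4.2 loop below uses the same negative-index idiom as the AVX2
    // path above: both base pointers are advanced past the region to be
    // compared and the index starts out negative, counting up toward zero.
    // Roughly (sketch only, not the emitted code):
    //
    //   //   p1 += n * esize;  p2 += n * esize;  i = -n;
    //   //   do {
    //   //     compare_16_bytes(p1 + i*esize, p2 + i*esize);
    //   //     i += stride;
    //   //   } while (i != 0);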
3958 movl(result, cnt2); 3959 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3960 if (ae == StrIntrinsicNode::LL) { 3961 pcmpmask &= ~0x01; 3962 } 3963 jcc(Assembler::zero, COMPARE_TAIL); 3964 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3965 lea(str1, Address(str1, result, scale)); 3966 lea(str2, Address(str2, result, scale)); 3967 } else { 3968 lea(str1, Address(str1, result, scale1)); 3969 lea(str2, Address(str2, result, scale2)); 3970 } 3971 negptr(result); 3972 3973 // pcmpestri 3974 // inputs: 3975 // vec1- substring 3976 // rax - negative string length (elements count) 3977 // mem - scanned string 3978 // rdx - string length (elements count) 3979 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3980 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3981 // outputs: 3982 // rcx - first mismatched element index 3983 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3984 3985 bind(COMPARE_WIDE_VECTORS); 3986 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3987 movdqu(vec1, Address(str1, result, scale)); 3988 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3989 } else { 3990 pmovzxbw(vec1, Address(str1, result, scale1)); 3991 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3992 } 3993 // After pcmpestri cnt1(rcx) contains mismatched element index 3994 3995 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3996 addptr(result, stride); 3997 subptr(cnt2, stride); 3998 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3999 4000 // compare wide vectors tail 4001 testptr(result, result); 4002 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4003 4004 movl(cnt2, stride); 4005 movl(result, stride); 4006 negptr(result); 4007 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4008 movdqu(vec1, Address(str1, result, scale)); 4009 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4010 } else { 4011 pmovzxbw(vec1, Address(str1, result, scale1)); 4012 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4013 } 4014 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4015 4016 // Mismatched characters in the vectors 4017 bind(VECTOR_NOT_EQUAL); 4018 addptr(cnt1, result); 4019 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4020 subl(result, cnt2); 4021 jmpb(POP_LABEL); 4022 4023 bind(COMPARE_TAIL); // limit is zero 4024 movl(cnt2, result); 4025 // Fallthru to tail compare 4026 } 4027 // Shift str2 and str1 to the end of the arrays, negate min 4028 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4029 lea(str1, Address(str1, cnt2, scale)); 4030 lea(str2, Address(str2, cnt2, scale)); 4031 } else { 4032 lea(str1, Address(str1, cnt2, scale1)); 4033 lea(str2, Address(str2, cnt2, scale2)); 4034 } 4035 decrementl(cnt2); // first character was compared already 4036 negptr(cnt2); 4037 4038 // Compare the rest of the elements 4039 bind(WHILE_HEAD_LABEL); 4040 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4041 subl(result, cnt1); 4042 jccb(Assembler::notZero, POP_LABEL); 4043 increment(cnt2); 4044 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4045 4046 // Strings are equal up to min length. Return the length difference. 
4047 bind(LENGTH_DIFF_LABEL); 4048 pop(result); 4049 if (ae == StrIntrinsicNode::UU) { 4050 // Divide diff by 2 to get number of chars 4051 sarl(result, 1); 4052 } 4053 jmpb(DONE_LABEL); 4054 4055 #ifdef _LP64 4056 if (VM_Version::supports_avx512vlbw()) { 4057 4058 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4059 4060 kmovql(cnt1, mask); 4061 notq(cnt1); 4062 bsfq(cnt2, cnt1); 4063 if (ae != StrIntrinsicNode::LL) { 4064 // Divide diff by 2 to get number of chars 4065 sarl(cnt2, 1); 4066 } 4067 addq(result, cnt2); 4068 if (ae == StrIntrinsicNode::LL) { 4069 load_unsigned_byte(cnt1, Address(str2, result)); 4070 load_unsigned_byte(result, Address(str1, result)); 4071 } else if (ae == StrIntrinsicNode::UU) { 4072 load_unsigned_short(cnt1, Address(str2, result, scale)); 4073 load_unsigned_short(result, Address(str1, result, scale)); 4074 } else { 4075 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4076 load_unsigned_byte(result, Address(str1, result, scale1)); 4077 } 4078 subl(result, cnt1); 4079 jmpb(POP_LABEL); 4080 }//if (VM_Version::supports_avx512vlbw()) 4081 #endif // _LP64 4082 4083 // Discard the stored length difference 4084 bind(POP_LABEL); 4085 pop(cnt1); 4086 4087 // That's it 4088 bind(DONE_LABEL); 4089 if(ae == StrIntrinsicNode::UL) { 4090 negl(result); 4091 } 4092 4093 } 4094 4095 // Search for Non-ASCII character (Negative byte value) in a byte array, 4096 // return the index of the first such character, otherwise the length 4097 // of the array segment searched. 4098 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4099 // @IntrinsicCandidate 4100 // public static int countPositives(byte[] ba, int off, int len) { 4101 // for (int i = off; i < off + len; i++) { 4102 // if (ba[i] < 0) { 4103 // return i - off; 4104 // } 4105 // } 4106 // return len; 4107 // } 4108 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4109 Register result, Register tmp1, 4110 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4111 // rsi: byte array 4112 // rcx: len 4113 // rax: result 4114 ShortBranchVerifier sbv(this); 4115 assert_different_registers(ary1, len, result, tmp1); 4116 assert_different_registers(vec1, vec2); 4117 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4118 4119 movl(result, len); // copy 4120 // len == 0 4121 testl(len, len); 4122 jcc(Assembler::zero, DONE); 4123 4124 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4125 VM_Version::supports_avx512vlbw() && 4126 VM_Version::supports_bmi2()) { 4127 4128 Label test_64_loop, test_tail, BREAK_LOOP; 4129 movl(tmp1, len); 4130 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4131 4132 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4133 andl(len, 0xffffffc0); // vector count (in chars) 4134 jccb(Assembler::zero, test_tail); 4135 4136 lea(ary1, Address(ary1, len, Address::times_1)); 4137 negptr(len); 4138 4139 bind(test_64_loop); 4140 // Check whether our 64 elements of size byte contain negatives 4141 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4142 kortestql(mask1, mask1); 4143 jcc(Assembler::notZero, BREAK_LOOP); 4144 4145 addptr(len, 64); 4146 jccb(Assembler::notZero, test_64_loop); 4147 4148 bind(test_tail); 4149 // bail out when there is nothing to be done 4150 testl(tmp1, -1); 4151 jcc(Assembler::zero, DONE); 4152 4153 4154 // check the tail for absense of negatives 4155 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4156 #ifdef _LP64 4157 { 4158 
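  // The shlx/not sequence below builds a 64-bit mask with the low 'tmp1'
  // bits set (tmp1 is the tail count, 1..63 here), which is then moved into
  // a mask register to restrict the final compare to the tail bytes.  A small
  // C++ sketch of the same computation (the name is illustrative only):
  //
  //   static uint64_t low_bits_mask(unsigned n) {   // requires 0 < n < 64
  //     return ~(~0ULL << n);                       // n lowest bits set
  //   }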
Register tmp3_aliased = len; 4159 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4160 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4161 notq(tmp3_aliased); 4162 kmovql(mask2, tmp3_aliased); 4163 } 4164 #else 4165 Label k_init; 4166 jmp(k_init); 4167 4168 // We could not read 64-bits from a general purpose register thus we move 4169 // data required to compose 64 1's to the instruction stream 4170 // We emit 64 byte wide series of elements from 0..63 which later on would 4171 // be used as a compare targets with tail count contained in tmp1 register. 4172 // Result would be a k register having tmp1 consecutive number or 1 4173 // counting from least significant bit. 4174 address tmp = pc(); 4175 emit_int64(0x0706050403020100); 4176 emit_int64(0x0F0E0D0C0B0A0908); 4177 emit_int64(0x1716151413121110); 4178 emit_int64(0x1F1E1D1C1B1A1918); 4179 emit_int64(0x2726252423222120); 4180 emit_int64(0x2F2E2D2C2B2A2928); 4181 emit_int64(0x3736353433323130); 4182 emit_int64(0x3F3E3D3C3B3A3938); 4183 4184 bind(k_init); 4185 lea(len, InternalAddress(tmp)); 4186 // create mask to test for negative byte inside a vector 4187 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4188 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4189 4190 #endif 4191 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4192 ktestq(mask1, mask2); 4193 jcc(Assembler::zero, DONE); 4194 4195 // do a full check for negative registers in the tail 4196 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4197 // ary1 already pointing to the right place 4198 jmpb(TAIL_START); 4199 4200 bind(BREAK_LOOP); 4201 // At least one byte in the last 64 byte block was negative. 4202 // Set up to look at the last 64 bytes as if they were a tail 4203 lea(ary1, Address(ary1, len, Address::times_1)); 4204 addptr(result, len); 4205 // Ignore the very last byte: if all others are positive, 4206 // it must be negative, so we can skip right to the 2+1 byte 4207 // end comparison at this point 4208 orl(result, 63); 4209 movl(len, 63); 4210 // Fallthru to tail compare 4211 } else { 4212 4213 if (UseAVX >= 2 && UseSSE >= 2) { 4214 // With AVX2, use 32-byte vector compare 4215 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4216 4217 // Compare 32-byte vectors 4218 testl(len, 0xffffffe0); // vector count (in bytes) 4219 jccb(Assembler::zero, TAIL_START); 4220 4221 andl(len, 0xffffffe0); 4222 lea(ary1, Address(ary1, len, Address::times_1)); 4223 negptr(len); 4224 4225 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4226 movdl(vec2, tmp1); 4227 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4228 4229 bind(COMPARE_WIDE_VECTORS); 4230 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4231 vptest(vec1, vec2); 4232 jccb(Assembler::notZero, BREAK_LOOP); 4233 addptr(len, 32); 4234 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4235 4236 testl(result, 0x0000001f); // any bytes remaining? 4237 jcc(Assembler::zero, DONE); 4238 4239 // Quick test using the already prepared vector mask 4240 movl(len, result); 4241 andl(len, 0x0000001f); 4242 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4243 vptest(vec1, vec2); 4244 jcc(Assembler::zero, DONE); 4245 // There are zeros, jump to the tail to determine exactly where 4246 jmpb(TAIL_START); 4247 4248 bind(BREAK_LOOP); 4249 // At least one byte in the last 32-byte vector is negative. 
4250 // Set up to look at the last 32 bytes as if they were a tail 4251 lea(ary1, Address(ary1, len, Address::times_1)); 4252 addptr(result, len); 4253 // Ignore the very last byte: if all others are positive, 4254 // it must be negative, so we can skip right to the 2+1 byte 4255 // end comparison at this point 4256 orl(result, 31); 4257 movl(len, 31); 4258 // Fallthru to tail compare 4259 } else if (UseSSE42Intrinsics) { 4260 // With SSE4.2, use double quad vector compare 4261 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4262 4263 // Compare 16-byte vectors 4264 testl(len, 0xfffffff0); // vector count (in bytes) 4265 jcc(Assembler::zero, TAIL_START); 4266 4267 andl(len, 0xfffffff0); 4268 lea(ary1, Address(ary1, len, Address::times_1)); 4269 negptr(len); 4270 4271 movl(tmp1, 0x80808080); 4272 movdl(vec2, tmp1); 4273 pshufd(vec2, vec2, 0); 4274 4275 bind(COMPARE_WIDE_VECTORS); 4276 movdqu(vec1, Address(ary1, len, Address::times_1)); 4277 ptest(vec1, vec2); 4278 jccb(Assembler::notZero, BREAK_LOOP); 4279 addptr(len, 16); 4280 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4281 4282 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4283 jcc(Assembler::zero, DONE); 4284 4285 // Quick test using the already prepared vector mask 4286 movl(len, result); 4287 andl(len, 0x0000000f); // tail count (in bytes) 4288 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4289 ptest(vec1, vec2); 4290 jcc(Assembler::zero, DONE); 4291 jmpb(TAIL_START); 4292 4293 bind(BREAK_LOOP); 4294 // At least one byte in the last 16-byte vector is negative. 4295 // Set up and look at the last 16 bytes as if they were a tail 4296 lea(ary1, Address(ary1, len, Address::times_1)); 4297 addptr(result, len); 4298 // Ignore the very last byte: if all others are positive, 4299 // it must be negative, so we can skip right to the 2+1 byte 4300 // end comparison at this point 4301 orl(result, 15); 4302 movl(len, 15); 4303 // Fallthru to tail compare 4304 } 4305 } 4306 4307 bind(TAIL_START); 4308 // Compare 4-byte vectors 4309 andl(len, 0xfffffffc); // vector count (in bytes) 4310 jccb(Assembler::zero, COMPARE_CHAR); 4311 4312 lea(ary1, Address(ary1, len, Address::times_1)); 4313 negptr(len); 4314 4315 bind(COMPARE_VECTORS); 4316 movl(tmp1, Address(ary1, len, Address::times_1)); 4317 andl(tmp1, 0x80808080); 4318 jccb(Assembler::notZero, TAIL_ADJUST); 4319 addptr(len, 4); 4320 jccb(Assembler::notZero, COMPARE_VECTORS); 4321 4322 // Compare trailing char (final 2-3 bytes), if any 4323 bind(COMPARE_CHAR); 4324 4325 testl(result, 0x2); // tail char 4326 jccb(Assembler::zero, COMPARE_BYTE); 4327 load_unsigned_short(tmp1, Address(ary1, 0)); 4328 andl(tmp1, 0x00008080); 4329 jccb(Assembler::notZero, CHAR_ADJUST); 4330 lea(ary1, Address(ary1, 2)); 4331 4332 bind(COMPARE_BYTE); 4333 testl(result, 0x1); // tail byte 4334 jccb(Assembler::zero, DONE); 4335 load_unsigned_byte(tmp1, Address(ary1, 0)); 4336 testl(tmp1, 0x00000080); 4337 jccb(Assembler::zero, DONE); 4338 subptr(result, 1); 4339 jmpb(DONE); 4340 4341 bind(TAIL_ADJUST); 4342 // there are negative bits in the last 4 byte block. 4343 // Adjust result and check the next three bytes 4344 addptr(result, len); 4345 orl(result, 3); 4346 lea(ary1, Address(ary1, len, Address::times_1)); 4347 jmpb(COMPARE_CHAR); 4348 4349 bind(CHAR_ADJUST); 4350 // We are looking at a char + optional byte tail, and found that one 4351 // of the bytes in the char is negative. Adjust the result, check the 4352 // first byte and readjust if needed. 
4353 andl(result, 0xfffffffc); 4354 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4355 jccb(Assembler::notZero, DONE); 4356 addptr(result, 1); 4357 4358 // That's it 4359 bind(DONE); 4360 if (UseAVX >= 2 && UseSSE >= 2) { 4361 // clean upper bits of YMM registers 4362 vpxor(vec1, vec1); 4363 vpxor(vec2, vec2); 4364 } 4365 } 4366 4367 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4368 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4369 Register limit, Register result, Register chr, 4370 XMMRegister vec1, XMMRegister vec2, bool is_char, 4371 KRegister mask, bool expand_ary2) { 4372 // for expand_ary2, limit is the (smaller) size of the second array. 4373 ShortBranchVerifier sbv(this); 4374 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4375 4376 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4377 "Expansion only implemented for AVX2"); 4378 4379 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4380 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4381 4382 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4383 int scaleIncr = expand_ary2 ? 8 : 16; 4384 4385 if (is_array_equ) { 4386 // Check the input args 4387 cmpoop(ary1, ary2); 4388 jcc(Assembler::equal, TRUE_LABEL); 4389 4390 // Need additional checks for arrays_equals. 4391 testptr(ary1, ary1); 4392 jcc(Assembler::zero, FALSE_LABEL); 4393 testptr(ary2, ary2); 4394 jcc(Assembler::zero, FALSE_LABEL); 4395 4396 // Check the lengths 4397 movl(limit, Address(ary1, length_offset)); 4398 cmpl(limit, Address(ary2, length_offset)); 4399 jcc(Assembler::notEqual, FALSE_LABEL); 4400 } 4401 4402 // count == 0 4403 testl(limit, limit); 4404 jcc(Assembler::zero, TRUE_LABEL); 4405 4406 if (is_array_equ) { 4407 // Load array address 4408 lea(ary1, Address(ary1, base_offset)); 4409 lea(ary2, Address(ary2, base_offset)); 4410 } 4411 4412 if (is_array_equ && is_char) { 4413 // arrays_equals when used for char[]. 
4414 shll(limit, 1); // byte count != 0 4415 } 4416 movl(result, limit); // copy 4417 4418 if (UseAVX >= 2) { 4419 // With AVX2, use 32-byte vector compare 4420 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4421 4422 // Compare 32-byte vectors 4423 if (expand_ary2) { 4424 andl(result, 0x0000000f); // tail count (in bytes) 4425 andl(limit, 0xfffffff0); // vector count (in bytes) 4426 jcc(Assembler::zero, COMPARE_TAIL); 4427 } else { 4428 andl(result, 0x0000001f); // tail count (in bytes) 4429 andl(limit, 0xffffffe0); // vector count (in bytes) 4430 jcc(Assembler::zero, COMPARE_TAIL_16); 4431 } 4432 4433 lea(ary1, Address(ary1, limit, scaleFactor)); 4434 lea(ary2, Address(ary2, limit, Address::times_1)); 4435 negptr(limit); 4436 4437 #ifdef _LP64 4438 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4439 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4440 4441 cmpl(limit, -64); 4442 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4443 4444 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4445 4446 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4447 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4448 kortestql(mask, mask); 4449 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4450 addptr(limit, 64); // update since we already compared at this addr 4451 cmpl(limit, -64); 4452 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4453 4454 // At this point we may still need to compare -limit+result bytes. 4455 // We could execute the next two instruction and just continue via non-wide path: 4456 // cmpl(limit, 0); 4457 // jcc(Assembler::equal, COMPARE_TAIL); // true 4458 // But since we stopped at the points ary{1,2}+limit which are 4459 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4460 // (|limit| <= 32 and result < 32), 4461 // we may just compare the last 64 bytes. 
4462 // 4463 addptr(result, -64); // it is safe, bc we just came from this area 4464 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4465 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4466 kortestql(mask, mask); 4467 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4468 4469 jmp(TRUE_LABEL); 4470 4471 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4472 4473 }//if (VM_Version::supports_avx512vlbw()) 4474 #endif //_LP64 4475 bind(COMPARE_WIDE_VECTORS); 4476 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4477 if (expand_ary2) { 4478 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4479 } else { 4480 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4481 } 4482 vpxor(vec1, vec2); 4483 4484 vptest(vec1, vec1); 4485 jcc(Assembler::notZero, FALSE_LABEL); 4486 addptr(limit, scaleIncr * 2); 4487 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4488 4489 testl(result, result); 4490 jcc(Assembler::zero, TRUE_LABEL); 4491 4492 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4493 if (expand_ary2) { 4494 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4495 } else { 4496 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4497 } 4498 vpxor(vec1, vec2); 4499 4500 vptest(vec1, vec1); 4501 jcc(Assembler::notZero, FALSE_LABEL); 4502 jmp(TRUE_LABEL); 4503 4504 bind(COMPARE_TAIL_16); // limit is zero 4505 movl(limit, result); 4506 4507 // Compare 16-byte chunks 4508 andl(result, 0x0000000f); // tail count (in bytes) 4509 andl(limit, 0xfffffff0); // vector count (in bytes) 4510 jcc(Assembler::zero, COMPARE_TAIL); 4511 4512 lea(ary1, Address(ary1, limit, scaleFactor)); 4513 lea(ary2, Address(ary2, limit, Address::times_1)); 4514 negptr(limit); 4515 4516 bind(COMPARE_WIDE_VECTORS_16); 4517 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4518 if (expand_ary2) { 4519 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4520 } else { 4521 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4522 } 4523 pxor(vec1, vec2); 4524 4525 ptest(vec1, vec1); 4526 jcc(Assembler::notZero, FALSE_LABEL); 4527 addptr(limit, scaleIncr); 4528 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4529 4530 bind(COMPARE_TAIL); // limit is zero 4531 movl(limit, result); 4532 // Fallthru to tail compare 4533 } else if (UseSSE42Intrinsics) { 4534 // With SSE4.2, use double quad vector compare 4535 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4536 4537 // Compare 16-byte vectors 4538 andl(result, 0x0000000f); // tail count (in bytes) 4539 andl(limit, 0xfffffff0); // vector count (in bytes) 4540 jcc(Assembler::zero, COMPARE_TAIL); 4541 4542 lea(ary1, Address(ary1, limit, Address::times_1)); 4543 lea(ary2, Address(ary2, limit, Address::times_1)); 4544 negptr(limit); 4545 4546 bind(COMPARE_WIDE_VECTORS); 4547 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4548 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4549 pxor(vec1, vec2); 4550 4551 ptest(vec1, vec1); 4552 jcc(Assembler::notZero, FALSE_LABEL); 4553 addptr(limit, 16); 4554 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4555 4556 testl(result, result); 4557 jcc(Assembler::zero, TRUE_LABEL); 4558 4559 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4560 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4561 pxor(vec1, vec2); 4562 4563 ptest(vec1, vec1); 4564 jccb(Assembler::notZero, FALSE_LABEL); 4565 jmpb(TRUE_LABEL); 4566 4567 bind(COMPARE_TAIL); // limit is zero 4568 
movl(limit, result); 4569 // Fallthru to tail compare 4570 } 4571 4572 // Compare 4-byte vectors 4573 if (expand_ary2) { 4574 testl(result, result); 4575 jccb(Assembler::zero, TRUE_LABEL); 4576 } else { 4577 andl(limit, 0xfffffffc); // vector count (in bytes) 4578 jccb(Assembler::zero, COMPARE_CHAR); 4579 } 4580 4581 lea(ary1, Address(ary1, limit, scaleFactor)); 4582 lea(ary2, Address(ary2, limit, Address::times_1)); 4583 negptr(limit); 4584 4585 bind(COMPARE_VECTORS); 4586 if (expand_ary2) { 4587 // There are no "vector" operations for bytes to shorts 4588 movzbl(chr, Address(ary2, limit, Address::times_1)); 4589 cmpw(Address(ary1, limit, Address::times_2), chr); 4590 jccb(Assembler::notEqual, FALSE_LABEL); 4591 addptr(limit, 1); 4592 jcc(Assembler::notZero, COMPARE_VECTORS); 4593 jmp(TRUE_LABEL); 4594 } else { 4595 movl(chr, Address(ary1, limit, Address::times_1)); 4596 cmpl(chr, Address(ary2, limit, Address::times_1)); 4597 jccb(Assembler::notEqual, FALSE_LABEL); 4598 addptr(limit, 4); 4599 jcc(Assembler::notZero, COMPARE_VECTORS); 4600 } 4601 4602 // Compare trailing char (final 2 bytes), if any 4603 bind(COMPARE_CHAR); 4604 testl(result, 0x2); // tail char 4605 jccb(Assembler::zero, COMPARE_BYTE); 4606 load_unsigned_short(chr, Address(ary1, 0)); 4607 load_unsigned_short(limit, Address(ary2, 0)); 4608 cmpl(chr, limit); 4609 jccb(Assembler::notEqual, FALSE_LABEL); 4610 4611 if (is_array_equ && is_char) { 4612 bind(COMPARE_BYTE); 4613 } else { 4614 lea(ary1, Address(ary1, 2)); 4615 lea(ary2, Address(ary2, 2)); 4616 4617 bind(COMPARE_BYTE); 4618 testl(result, 0x1); // tail byte 4619 jccb(Assembler::zero, TRUE_LABEL); 4620 load_unsigned_byte(chr, Address(ary1, 0)); 4621 load_unsigned_byte(limit, Address(ary2, 0)); 4622 cmpl(chr, limit); 4623 jccb(Assembler::notEqual, FALSE_LABEL); 4624 } 4625 bind(TRUE_LABEL); 4626 movl(result, 1); // return true 4627 jmpb(DONE); 4628 4629 bind(FALSE_LABEL); 4630 xorl(result, result); // return false 4631 4632 // That's it 4633 bind(DONE); 4634 if (UseAVX >= 2) { 4635 // clean upper bits of YMM registers 4636 vpxor(vec1, vec1); 4637 vpxor(vec2, vec2); 4638 } 4639 } 4640 4641 #ifdef _LP64 4642 4643 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4644 #define __ masm. 
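  // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 or
  // 0x8000000000000000) for NaN and out-of-range inputs, which is also the
  // legitimate result for exactly MIN_VALUE.  convertF2I() below compares the
  // converted value against that sentinel and, on a hit, jumps to this stub,
  // which hands the original XMM value to the fixup routine.  Roughly, the
  // fixup produces the Java-specified result (a sketch, not the stub's code):
  //
  //   //   NaN          -> 0
  //   //   < MIN_VALUE  -> MIN_VALUE   (MIN_VALUE itself stays MIN_VALUE)
  //   //   > MAX_VALUE  -> MAX_VALUE
  //   //   otherwise    -> truncate toward zero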
4645 Register dst = stub.data<0>(); 4646 XMMRegister src = stub.data<1>(); 4647 address target = stub.data<2>(); 4648 __ bind(stub.entry()); 4649 __ subptr(rsp, 8); 4650 __ movdbl(Address(rsp), src); 4651 __ call(RuntimeAddress(target)); 4652 __ pop(dst); 4653 __ jmp(stub.continuation()); 4654 #undef __ 4655 } 4656 4657 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4658 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4659 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4660 4661 address slowpath_target; 4662 if (dst_bt == T_INT) { 4663 if (src_bt == T_FLOAT) { 4664 cvttss2sil(dst, src); 4665 cmpl(dst, 0x80000000); 4666 slowpath_target = StubRoutines::x86::f2i_fixup(); 4667 } else { 4668 cvttsd2sil(dst, src); 4669 cmpl(dst, 0x80000000); 4670 slowpath_target = StubRoutines::x86::d2i_fixup(); 4671 } 4672 } else { 4673 if (src_bt == T_FLOAT) { 4674 cvttss2siq(dst, src); 4675 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4676 slowpath_target = StubRoutines::x86::f2l_fixup(); 4677 } else { 4678 cvttsd2siq(dst, src); 4679 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4680 slowpath_target = StubRoutines::x86::d2l_fixup(); 4681 } 4682 } 4683 4684 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4685 jcc(Assembler::equal, stub->entry()); 4686 bind(stub->continuation()); 4687 } 4688 4689 #endif // _LP64 4690 4691 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4692 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4693 switch(ideal_opc) { 4694 case Op_LShiftVS: 4695 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4696 case Op_LShiftVI: 4697 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4698 case Op_LShiftVL: 4699 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4700 case Op_RShiftVS: 4701 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4702 case Op_RShiftVI: 4703 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4704 case Op_RShiftVL: 4705 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4706 case Op_URShiftVS: 4707 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4708 case Op_URShiftVI: 4709 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4710 case Op_URShiftVL: 4711 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4712 case Op_RotateRightV: 4713 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4714 case Op_RotateLeftV: 4715 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4716 default: 4717 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4718 break; 4719 } 4720 } 4721 4722 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4723 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4724 if (is_unsigned) { 4725 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4726 } else { 4727 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4728 } 4729 } 4730 4731 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4732 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4733 switch (elem_bt) { 4734 case T_BYTE: 4735 if (ideal_opc 
== Op_SaturatingAddV) { 4736 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4737 } else { 4738 assert(ideal_opc == Op_SaturatingSubV, ""); 4739 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4740 } 4741 break; 4742 case T_SHORT: 4743 if (ideal_opc == Op_SaturatingAddV) { 4744 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4745 } else { 4746 assert(ideal_opc == Op_SaturatingSubV, ""); 4747 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4748 } 4749 break; 4750 default: 4751 fatal("Unsupported type %s", type2name(elem_bt)); 4752 break; 4753 } 4754 } 4755 4756 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4757 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4758 switch (elem_bt) { 4759 case T_BYTE: 4760 if (ideal_opc == Op_SaturatingAddV) { 4761 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4762 } else { 4763 assert(ideal_opc == Op_SaturatingSubV, ""); 4764 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4765 } 4766 break; 4767 case T_SHORT: 4768 if (ideal_opc == Op_SaturatingAddV) { 4769 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4770 } else { 4771 assert(ideal_opc == Op_SaturatingSubV, ""); 4772 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4773 } 4774 break; 4775 default: 4776 fatal("Unsupported type %s", type2name(elem_bt)); 4777 break; 4778 } 4779 } 4780 4781 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4782 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4783 if (is_unsigned) { 4784 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4785 } else { 4786 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4787 } 4788 } 4789 4790 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4791 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4792 switch (elem_bt) { 4793 case T_BYTE: 4794 if (ideal_opc == Op_SaturatingAddV) { 4795 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4796 } else { 4797 assert(ideal_opc == Op_SaturatingSubV, ""); 4798 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4799 } 4800 break; 4801 case T_SHORT: 4802 if (ideal_opc == Op_SaturatingAddV) { 4803 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4804 } else { 4805 assert(ideal_opc == Op_SaturatingSubV, ""); 4806 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4807 } 4808 break; 4809 default: 4810 fatal("Unsupported type %s", type2name(elem_bt)); 4811 break; 4812 } 4813 } 4814 4815 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4816 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4817 switch (elem_bt) { 4818 case T_BYTE: 4819 if (ideal_opc == Op_SaturatingAddV) { 4820 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4821 } else { 4822 assert(ideal_opc == Op_SaturatingSubV, ""); 4823 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4824 } 4825 break; 4826 case T_SHORT: 4827 if (ideal_opc == Op_SaturatingAddV) { 4828 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4829 } else { 4830 assert(ideal_opc == Op_SaturatingSubV, ""); 4831 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4832 } 4833 break; 4834 default: 4835 fatal("Unsupported type %s", type2name(elem_bt)); 4836 break; 4837 } 4838 } 4839 4840 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4841 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4842 bool is_varshift) { 4843 switch (ideal_opc) { 4844 case Op_AddVB: 4845 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_AddVS: 4847 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_AddVI: 4849 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_AddVL: 4851 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_AddVF: 4853 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_AddVD: 4855 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_SubVB: 4857 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_SubVS: 4859 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_SubVI: 4861 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_SubVL: 4863 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_SubVF: 4865 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_SubVD: 4867 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_MulVS: 4869 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_MulVI: 4871 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_MulVL: 4873 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4874 case Op_MulVF: 4875 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_MulVD: 4877 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_DivVF: 4879 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_DivVD: 4881 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_SqrtVF: 4883 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_SqrtVD: 4885 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_AbsVB: 4887 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4888 case Op_AbsVS: 4889 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4890 case Op_AbsVI: 4891 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4892 case Op_AbsVL: 4893 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4894 case Op_FmaVF: 4895 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4896 case Op_FmaVD: 4897 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4898 case Op_VectorRearrange: 4899 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4900 case Op_LShiftVS: 4901 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4902 case Op_LShiftVI: 4903 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4904 case Op_LShiftVL: 4905 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4906 case Op_RShiftVS: 4907 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4908 case Op_RShiftVI: 4909 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4910 case Op_RShiftVL: 4911 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4912 case Op_URShiftVS: 4913 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4914 case Op_URShiftVI: 4915 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4916 case Op_URShiftVL: 4917 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4918 case Op_RotateLeftV: 4919 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4920 case Op_RotateRightV: 4921 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4922 case Op_MaxV: 4923 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4924 case Op_MinV: 4925 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4926 case Op_UMinV: 4927 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4928 case Op_UMaxV: 4929 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4930 case Op_XorV: 4931 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4932 case Op_OrV: 4933 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4934 case Op_AndV: 4935 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4936 default: 4937 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4938 break; 4939 } 4940 } 4941 4942 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4943 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4944 switch (ideal_opc) { 4945 case Op_AddVB: 4946 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4947 case Op_AddVS: 4948 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4949 case Op_AddVI: 4950 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4951 case Op_AddVL: 4952 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_AddVF: 4954 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_AddVD: 4956 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4957 case Op_SubVB: 4958 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_SubVS: 4960 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_SubVI: 4962 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_SubVL: 4964 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_SubVF: 4966 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_SubVD: 4968 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4969 case Op_MulVS: 4970 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4971 case Op_MulVI: 4972 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4973 case Op_MulVL: 4974 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4975 case Op_MulVF: 4976 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4977 case Op_MulVD: 4978 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4979 case Op_DivVF: 4980 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4981 case Op_DivVD: 4982 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4983 case Op_FmaVF: 4984 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4985 case Op_FmaVD: 4986 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4987 case Op_MaxV: 4988 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4989 case Op_MinV: 4990 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4991 case Op_UMaxV: 4992 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4993 case Op_UMinV: 4994 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4995 case Op_XorV: 4996 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4997 case Op_OrV: 4998 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4999 case Op_AndV: 5000 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5001 default: 5002 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5003 break; 5004 } 5005 } 5006 5007 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5008 KRegister src1, KRegister src2) { 5009 BasicType etype = T_ILLEGAL; 5010 switch(mask_len) { 5011 case 2: 5012 case 4: 5013 case 8: etype = T_BYTE; 
break; 5014 case 16: etype = T_SHORT; break; 5015 case 32: etype = T_INT; break; 5016 case 64: etype = T_LONG; break; 5017 default: fatal("Unsupported type"); break; 5018 } 5019 assert(etype != T_ILLEGAL, ""); 5020 switch(ideal_opc) { 5021 case Op_AndVMask: 5022 kand(etype, dst, src1, src2); break; 5023 case Op_OrVMask: 5024 kor(etype, dst, src1, src2); break; 5025 case Op_XorVMask: 5026 kxor(etype, dst, src1, src2); break; 5027 default: 5028 fatal("Unsupported masked operation"); break; 5029 } 5030 } 5031 5032 /* 5033 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5034 * If src is NaN, the result is 0. 5035 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5036 * the result is equal to the value of Integer.MIN_VALUE. 5037 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5038 * the result is equal to the value of Integer.MAX_VALUE. 5039 */ 5040 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5041 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5042 Register rscratch, AddressLiteral float_sign_flip, 5043 int vec_enc) { 5044 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5045 Label done; 5046 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5047 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5048 vptest(xtmp2, xtmp2, vec_enc); 5049 jccb(Assembler::equal, done); 5050 5051 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5052 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5053 5054 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5055 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5056 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5057 5058 // Recompute the mask for remaining special value. 5059 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5060 // Extract SRC values corresponding to TRUE mask lanes. 5061 vpand(xtmp4, xtmp2, src, vec_enc); 5062 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5063 // values are set. 
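// (Illustration: for a lane where src holds +1.0e10f the cvttps2dq result above was
// 0x80000000, so that lane is set in xtmp2 while its sign bit in xtmp4 is clear; the
// xor below therefore leaves the lane's MSB set, and the final vblendvps rewrites it
// with xtmp1 == 0x7FFFFFFF, i.e. Integer.MAX_VALUE.)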
5064 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5065 5066 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5067 bind(done); 5068 } 5069 5070 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5071 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5072 Register rscratch, AddressLiteral float_sign_flip, 5073 int vec_enc) { 5074 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5075 Label done; 5076 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5077 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5078 kortestwl(ktmp1, ktmp1); 5079 jccb(Assembler::equal, done); 5080 5081 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5082 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5083 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5084 5085 kxorwl(ktmp1, ktmp1, ktmp2); 5086 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5087 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5088 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5089 bind(done); 5090 } 5091 5092 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5093 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5094 Register rscratch, AddressLiteral double_sign_flip, 5095 int vec_enc) { 5096 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5097 5098 Label done; 5099 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5100 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5101 kortestwl(ktmp1, ktmp1); 5102 jccb(Assembler::equal, done); 5103 5104 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5105 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5106 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5107 5108 kxorwl(ktmp1, ktmp1, ktmp2); 5109 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5110 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5111 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5112 bind(done); 5113 } 5114 5115 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5116 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5117 Register rscratch, AddressLiteral float_sign_flip, 5118 int vec_enc) { 5119 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5120 Label done; 5121 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5122 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5123 kortestwl(ktmp1, ktmp1); 5124 jccb(Assembler::equal, done); 5125 5126 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5127 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5128 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5129 5130 kxorwl(ktmp1, ktmp1, ktmp2); 5131 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5132 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5133 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5134 bind(done); 5135 } 5136 5137 /* 5138 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5139 * If src is NaN, the result is 0. 5140 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5141 * the result is equal to the value of Long.MIN_VALUE. 5142 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5143 * the result is equal to the value of Long.MAX_VALUE. 
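 * Worked examples of the Java narrowing semantics implemented here:
 *   (long) Double.NaN               == 0L
 *   (long) -1.0e300                 == Long.MIN_VALUE
 *   (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE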
5144 */ 5145 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5146 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5147 Register rscratch, AddressLiteral double_sign_flip, 5148 int vec_enc) { 5149 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5150 5151 Label done; 5152 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5153 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5154 kortestwl(ktmp1, ktmp1); 5155 jccb(Assembler::equal, done); 5156 5157 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5158 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5159 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5160 5161 kxorwl(ktmp1, ktmp1, ktmp2); 5162 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5163 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5164 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5165 bind(done); 5166 } 5167 5168 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5169 XMMRegister xtmp, int index, int vec_enc) { 5170 assert(vec_enc < Assembler::AVX_512bit, ""); 5171 if (vec_enc == Assembler::AVX_256bit) { 5172 vextractf128_high(xtmp, src); 5173 vshufps(dst, src, xtmp, index, vec_enc); 5174 } else { 5175 vshufps(dst, src, zero, index, vec_enc); 5176 } 5177 } 5178 5179 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5180 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5181 AddressLiteral float_sign_flip, int src_vec_enc) { 5182 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5183 5184 Label done; 5185 // Compare the destination lanes with float_sign_flip 5186 // value to get mask for all special values. 5187 movdqu(xtmp1, float_sign_flip, rscratch); 5188 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5189 ptest(xtmp2, xtmp2); 5190 jccb(Assembler::equal, done); 5191 5192 // Flip float_sign_flip to get max integer value. 5193 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5194 pxor(xtmp1, xtmp4); 5195 5196 // Set destination lanes corresponding to unordered source lanes as zero. 5197 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5198 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5199 5200 // Shuffle mask vector and pack the lower double word from each quadword lane. 5201 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5202 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5203 5204 // Recompute the mask for remaining special value. 5205 pxor(xtmp2, xtmp3); 5206 // Extract mask corresponding to non-negative source lanes. 5207 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5208 5209 // Shuffle mask vector and pack the lower double word from each quadword lane. 5210 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5211 pand(xtmp3, xtmp2); 5212 5213 // Replace destination lanes holding the special value (0x80000000) with max int 5214 // if the corresponding source lane holds a +ve value.
5215 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5216 bind(done); 5217 } 5218 5219 5220 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5221 XMMRegister xtmp, Register rscratch, int vec_enc) { 5222 switch(to_elem_bt) { 5223 case T_SHORT: 5224 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5225 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5226 vpackusdw(dst, dst, zero, vec_enc); 5227 if (vec_enc == Assembler::AVX_256bit) { 5228 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5229 } 5230 break; 5231 case T_BYTE: 5232 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5233 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5234 vpackusdw(dst, dst, zero, vec_enc); 5235 if (vec_enc == Assembler::AVX_256bit) { 5236 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5237 } 5238 vpackuswb(dst, dst, zero, vec_enc); 5239 break; 5240 default: assert(false, "%s", type2name(to_elem_bt)); 5241 } 5242 } 5243 5244 /* 5245 * Algorithm for vector D2L and F2I conversions:- 5246 * a) Perform vector D2L/F2I cast. 5247 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5248 * It signifies that source value could be any of the special floating point 5249 * values(NaN,-Inf,Inf,Max,-Min). 5250 * c) Set destination to zero if source is NaN value. 5251 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5252 */ 5253 5254 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5255 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5256 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5257 int to_elem_sz = type2aelembytes(to_elem_bt); 5258 assert(to_elem_sz <= 4, ""); 5259 vcvttps2dq(dst, src, vec_enc); 5260 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5261 if (to_elem_sz < 4) { 5262 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5263 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5264 } 5265 } 5266 5267 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5268 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5269 Register rscratch, int vec_enc) { 5270 int to_elem_sz = type2aelembytes(to_elem_bt); 5271 assert(to_elem_sz <= 4, ""); 5272 vcvttps2dq(dst, src, vec_enc); 5273 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5274 switch(to_elem_bt) { 5275 case T_INT: 5276 break; 5277 case T_SHORT: 5278 evpmovdw(dst, dst, vec_enc); 5279 break; 5280 case T_BYTE: 5281 evpmovdb(dst, dst, vec_enc); 5282 break; 5283 default: assert(false, "%s", type2name(to_elem_bt)); 5284 } 5285 } 5286 5287 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5288 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5289 Register rscratch, int vec_enc) { 5290 evcvttps2qq(dst, src, vec_enc); 5291 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5292 } 5293 5294 // Handling for downcasting from double to integer or sub-word types on AVX2. 5295 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5296 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5297 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5298 int to_elem_sz = type2aelembytes(to_elem_bt); 5299 assert(to_elem_sz < 8, ""); 5300 vcvttpd2dq(dst, src, vec_enc); 5301 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5302 float_sign_flip, vec_enc); 5303 if (to_elem_sz < 4) { 5304 // xtmp4 holds all zero lanes. 5305 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5306 } 5307 } 5308 5309 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5310 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5311 KRegister ktmp2, AddressLiteral sign_flip, 5312 Register rscratch, int vec_enc) { 5313 if (VM_Version::supports_avx512dq()) { 5314 evcvttpd2qq(dst, src, vec_enc); 5315 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5316 switch(to_elem_bt) { 5317 case T_LONG: 5318 break; 5319 case T_INT: 5320 evpmovsqd(dst, dst, vec_enc); 5321 break; 5322 case T_SHORT: 5323 evpmovsqd(dst, dst, vec_enc); 5324 evpmovdw(dst, dst, vec_enc); 5325 break; 5326 case T_BYTE: 5327 evpmovsqd(dst, dst, vec_enc); 5328 evpmovdb(dst, dst, vec_enc); 5329 break; 5330 default: assert(false, "%s", type2name(to_elem_bt)); 5331 } 5332 } else { 5333 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5334 vcvttpd2dq(dst, src, vec_enc); 5335 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5336 switch(to_elem_bt) { 5337 case T_INT: 5338 break; 5339 case T_SHORT: 5340 evpmovdw(dst, dst, vec_enc); 5341 break; 5342 case T_BYTE: 5343 evpmovdb(dst, dst, vec_enc); 5344 break; 5345 default: assert(false, "%s", type2name(to_elem_bt)); 5346 } 5347 } 5348 } 5349 5350 #ifdef _LP64 5351 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5352 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5353 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5354 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5355 // and re-instantiate original MXCSR.RC mode after that. 5356 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5357 5358 mov64(tmp, julong_cast(0.5L)); 5359 evpbroadcastq(xtmp1, tmp, vec_enc); 5360 vaddpd(xtmp1, src , xtmp1, vec_enc); 5361 evcvtpd2qq(dst, xtmp1, vec_enc); 5362 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5363 double_sign_flip, vec_enc);; 5364 5365 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5366 } 5367 5368 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5369 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5370 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5371 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5372 // and re-instantiate original MXCSR.RC mode after that. 
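// (This mirrors Java Math.round semantics, which a scalar sketch would express as
// (int) Math.floor(x + 0.5f): e.g. round(2.5f) == 3 and round(-2.5f) == -2.
// With MXCSR.RC forced to round-down, the vaddps and vcvtps2dq below together
// evaluate floor(x + 0.5).)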
5373 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5374 5375 movl(tmp, jint_cast(0.5)); 5376 movq(xtmp1, tmp); 5377 vbroadcastss(xtmp1, xtmp1, vec_enc); 5378 vaddps(xtmp1, src , xtmp1, vec_enc); 5379 vcvtps2dq(dst, xtmp1, vec_enc); 5380 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5381 float_sign_flip, vec_enc); 5382 5383 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5384 } 5385 5386 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5387 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5388 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5389 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5390 // and re-instantiate original MXCSR.RC mode after that. 5391 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5392 5393 movl(tmp, jint_cast(0.5)); 5394 movq(xtmp1, tmp); 5395 vbroadcastss(xtmp1, xtmp1, vec_enc); 5396 vaddps(xtmp1, src , xtmp1, vec_enc); 5397 vcvtps2dq(dst, xtmp1, vec_enc); 5398 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5399 5400 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5401 } 5402 #endif // _LP64 5403 5404 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5405 BasicType from_elem_bt, BasicType to_elem_bt) { 5406 switch (from_elem_bt) { 5407 case T_BYTE: 5408 switch (to_elem_bt) { 5409 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5410 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5411 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5412 default: ShouldNotReachHere(); 5413 } 5414 break; 5415 case T_SHORT: 5416 switch (to_elem_bt) { 5417 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5418 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5419 default: ShouldNotReachHere(); 5420 } 5421 break; 5422 case T_INT: 5423 assert(to_elem_bt == T_LONG, ""); 5424 vpmovzxdq(dst, src, vlen_enc); 5425 break; 5426 default: 5427 ShouldNotReachHere(); 5428 } 5429 } 5430 5431 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5432 BasicType from_elem_bt, BasicType to_elem_bt) { 5433 switch (from_elem_bt) { 5434 case T_BYTE: 5435 switch (to_elem_bt) { 5436 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5437 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5438 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5439 default: ShouldNotReachHere(); 5440 } 5441 break; 5442 case T_SHORT: 5443 switch (to_elem_bt) { 5444 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5445 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5446 default: ShouldNotReachHere(); 5447 } 5448 break; 5449 case T_INT: 5450 assert(to_elem_bt == T_LONG, ""); 5451 vpmovsxdq(dst, src, vlen_enc); 5452 break; 5453 default: 5454 ShouldNotReachHere(); 5455 } 5456 } 5457 5458 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5459 BasicType dst_bt, BasicType src_bt, int vlen) { 5460 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5461 assert(vlen_enc != AVX_512bit, ""); 5462 5463 int dst_bt_size = type2aelembytes(dst_bt); 5464 int src_bt_size = type2aelembytes(src_bt); 5465 if (dst_bt_size > src_bt_size) { 5466 switch (dst_bt_size / src_bt_size) { 5467 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5468 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5469 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5470 default: ShouldNotReachHere(); 5471 } 5472 } else { 5473 assert(dst_bt_size < src_bt_size, ""); 5474 switch (src_bt_size / dst_bt_size) { 5475 case 2: { 5476 if (vlen_enc == AVX_128bit) { 5477 vpacksswb(dst, src, src, vlen_enc); 5478 } else { 5479 vpacksswb(dst, src, src, vlen_enc); 5480 vpermq(dst, dst, 0x08, vlen_enc); 5481 } 5482 break; 5483 } 5484 case 4: { 5485 if (vlen_enc == AVX_128bit) { 5486 vpackssdw(dst, src, src, vlen_enc); 5487 vpacksswb(dst, dst, dst, vlen_enc); 5488 } else { 5489 vpackssdw(dst, src, src, vlen_enc); 5490 vpermq(dst, dst, 0x08, vlen_enc); 5491 vpacksswb(dst, dst, dst, AVX_128bit); 5492 } 5493 break; 5494 } 5495 case 8: { 5496 if (vlen_enc == AVX_128bit) { 5497 vpshufd(dst, src, 0x08, vlen_enc); 5498 vpackssdw(dst, dst, dst, vlen_enc); 5499 vpacksswb(dst, dst, dst, vlen_enc); 5500 } else { 5501 vpshufd(dst, src, 0x08, vlen_enc); 5502 vpermq(dst, dst, 0x08, vlen_enc); 5503 vpackssdw(dst, dst, dst, AVX_128bit); 5504 vpacksswb(dst, dst, dst, AVX_128bit); 5505 } 5506 break; 5507 } 5508 default: ShouldNotReachHere(); 5509 } 5510 } 5511 } 5512 5513 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5514 bool merge, BasicType bt, int vlen_enc) { 5515 if (bt == T_INT) { 5516 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5517 } else { 5518 assert(bt == T_LONG, ""); 5519 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5520 } 5521 } 5522 5523 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5524 bool merge, BasicType bt, int vlen_enc) { 5525 if (bt == T_INT) { 5526 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5527 } else { 5528 assert(bt == T_LONG, ""); 5529 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5530 } 5531 } 5532 5533 #ifdef _LP64 5534 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5535 Register rtmp2, XMMRegister xtmp, int mask_len, 5536 int vec_enc) { 5537 int index = 0; 5538 int vindex = 0; 5539 mov64(rtmp1, 0x0101010101010101L); 5540 pdepq(rtmp1, src, rtmp1); 5541 if (mask_len > 8) { 5542 movq(rtmp2, src); 5543 vpxor(xtmp, xtmp, xtmp, vec_enc); 5544 movq(xtmp, rtmp1); 5545 } 5546 movq(dst, rtmp1); 5547 5548 mask_len -= 8; 5549 while (mask_len > 0) { 5550 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5551 index++; 5552 if ((index % 2) == 0) { 5553 pxor(xtmp, xtmp); 5554 } 5555 mov64(rtmp1, 0x0101010101010101L); 5556 shrq(rtmp2, 8); 5557 pdepq(rtmp1, rtmp2, rtmp1); 5558 pinsrq(xtmp, rtmp1, index % 2); 5559 vindex = index / 2; 5560 if (vindex) { 5561 // Write the entire 16 byte vector when both 64 bit 5562 // lanes are updated, to save redundant instructions.
5563 if (index % 2) { 5564 vinsertf128(dst, dst, xtmp, vindex); 5565 } 5566 } else { 5567 vmovdqu(dst, xtmp); 5568 } 5569 mask_len -= 8; 5570 } 5571 } 5572 5573 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5574 switch(opc) { 5575 case Op_VectorMaskTrueCount: 5576 popcntq(dst, tmp); 5577 break; 5578 case Op_VectorMaskLastTrue: 5579 if (VM_Version::supports_lzcnt()) { 5580 lzcntq(tmp, tmp); 5581 movl(dst, 63); 5582 subl(dst, tmp); 5583 } else { 5584 movl(dst, -1); 5585 bsrq(tmp, tmp); 5586 cmov32(Assembler::notZero, dst, tmp); 5587 } 5588 break; 5589 case Op_VectorMaskFirstTrue: 5590 if (VM_Version::supports_bmi1()) { 5591 if (masklen < 32) { 5592 orl(tmp, 1 << masklen); 5593 tzcntl(dst, tmp); 5594 } else if (masklen == 32) { 5595 tzcntl(dst, tmp); 5596 } else { 5597 assert(masklen == 64, ""); 5598 tzcntq(dst, tmp); 5599 } 5600 } else { 5601 if (masklen < 32) { 5602 orl(tmp, 1 << masklen); 5603 bsfl(dst, tmp); 5604 } else { 5605 assert(masklen == 32 || masklen == 64, ""); 5606 movl(dst, masklen); 5607 if (masklen == 32) { 5608 bsfl(tmp, tmp); 5609 } else { 5610 bsfq(tmp, tmp); 5611 } 5612 cmov32(Assembler::notZero, dst, tmp); 5613 } 5614 } 5615 break; 5616 case Op_VectorMaskToLong: 5617 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5618 break; 5619 default: assert(false, "Unhandled mask operation"); 5620 } 5621 } 5622 5623 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5624 int masklen, int masksize, int vec_enc) { 5625 assert(VM_Version::supports_popcnt(), ""); 5626 5627 if(VM_Version::supports_avx512bw()) { 5628 kmovql(tmp, mask); 5629 } else { 5630 assert(masklen <= 16, ""); 5631 kmovwl(tmp, mask); 5632 } 5633 5634 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5635 // operations needs to be clipped. 5636 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5637 andq(tmp, (1 << masklen) - 1); 5638 } 5639 5640 vector_mask_operation_helper(opc, dst, tmp, masklen); 5641 } 5642 5643 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5644 Register tmp, int masklen, BasicType bt, int vec_enc) { 5645 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5646 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5647 assert(VM_Version::supports_popcnt(), ""); 5648 5649 bool need_clip = false; 5650 switch(bt) { 5651 case T_BOOLEAN: 5652 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5653 vpxor(xtmp, xtmp, xtmp, vec_enc); 5654 vpsubb(xtmp, xtmp, mask, vec_enc); 5655 vpmovmskb(tmp, xtmp, vec_enc); 5656 need_clip = masklen < 16; 5657 break; 5658 case T_BYTE: 5659 vpmovmskb(tmp, mask, vec_enc); 5660 need_clip = masklen < 16; 5661 break; 5662 case T_SHORT: 5663 vpacksswb(xtmp, mask, mask, vec_enc); 5664 if (masklen >= 16) { 5665 vpermpd(xtmp, xtmp, 8, vec_enc); 5666 } 5667 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5668 need_clip = masklen < 16; 5669 break; 5670 case T_INT: 5671 case T_FLOAT: 5672 vmovmskps(tmp, mask, vec_enc); 5673 need_clip = masklen < 4; 5674 break; 5675 case T_LONG: 5676 case T_DOUBLE: 5677 vmovmskpd(tmp, mask, vec_enc); 5678 need_clip = masklen < 2; 5679 break; 5680 default: assert(false, "Unhandled type, %s", type2name(bt)); 5681 } 5682 5683 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5684 // operations needs to be clipped. 
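// (For example, T_SHORT with masklen == 8: the 128-bit vpmovmskb above produced 16
// bits of which only bits 7:0 correspond to real lanes, so the andq below keeps
// (1 << 8) - 1 == 0xFF and drops the duplicated upper bits.)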
5685 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5686 // need_clip implies masklen < 32 5687 andq(tmp, (1 << masklen) - 1); 5688 } 5689 5690 vector_mask_operation_helper(opc, dst, tmp, masklen); 5691 } 5692 5693 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5694 Register rtmp2, int mask_len) { 5695 kmov(rtmp1, src); 5696 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5697 mov64(rtmp2, -1L); 5698 pextq(rtmp2, rtmp2, rtmp1); 5699 kmov(dst, rtmp2); 5700 } 5701 5702 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5703 XMMRegister mask, Register rtmp, Register rscratch, 5704 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5705 int vec_enc) { 5706 assert(type2aelembytes(bt) >= 4, ""); 5707 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5708 address compress_perm_table = nullptr; 5709 address expand_perm_table = nullptr; 5710 if (type2aelembytes(bt) == 8) { 5711 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5712 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5713 vmovmskpd(rtmp, mask, vec_enc); 5714 } else { 5715 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5716 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5717 vmovmskps(rtmp, mask, vec_enc); 5718 } 5719 shlq(rtmp, 5); // for 32 byte permute row. 5720 if (opcode == Op_CompressV) { 5721 lea(rscratch, ExternalAddress(compress_perm_table)); 5722 } else { 5723 lea(rscratch, ExternalAddress(expand_perm_table)); 5724 } 5725 addptr(rtmp, rscratch); 5726 vmovdqu(permv, Address(rtmp)); 5727 vpermps(dst, permv, src, Assembler::AVX_256bit); 5728 vpxor(xtmp, xtmp, xtmp, vec_enc); 5729 // Blend the result with zero vector using permute mask, each column entry 5730 // in a permute table row contains either a valid permute index or a -1 (default) 5731 // value, this can potentially be used as a blending mask after 5732 // compressing/expanding the source vector lanes. 
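// (A plausible example, assuming the stub table layout: for Op_CompressV on 32-bit
// lanes with mask bits 0b00000101, the selected 32-byte row would read
// { 0, 2, -1, -1, -1, -1, -1, -1 }, so vpermps gathered src lanes 0 and 2 to the
// front and the vblendvps below zeroes every lane whose permute entry is -1.)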
5733 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5734 } 5735 5736 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5737 bool merge, BasicType bt, int vec_enc) { 5738 if (opcode == Op_CompressV) { 5739 switch(bt) { 5740 case T_BYTE: 5741 evpcompressb(dst, mask, src, merge, vec_enc); 5742 break; 5743 case T_CHAR: 5744 case T_SHORT: 5745 evpcompressw(dst, mask, src, merge, vec_enc); 5746 break; 5747 case T_INT: 5748 evpcompressd(dst, mask, src, merge, vec_enc); 5749 break; 5750 case T_FLOAT: 5751 evcompressps(dst, mask, src, merge, vec_enc); 5752 break; 5753 case T_LONG: 5754 evpcompressq(dst, mask, src, merge, vec_enc); 5755 break; 5756 case T_DOUBLE: 5757 evcompresspd(dst, mask, src, merge, vec_enc); 5758 break; 5759 default: 5760 fatal("Unsupported type %s", type2name(bt)); 5761 break; 5762 } 5763 } else { 5764 assert(opcode == Op_ExpandV, ""); 5765 switch(bt) { 5766 case T_BYTE: 5767 evpexpandb(dst, mask, src, merge, vec_enc); 5768 break; 5769 case T_CHAR: 5770 case T_SHORT: 5771 evpexpandw(dst, mask, src, merge, vec_enc); 5772 break; 5773 case T_INT: 5774 evpexpandd(dst, mask, src, merge, vec_enc); 5775 break; 5776 case T_FLOAT: 5777 evexpandps(dst, mask, src, merge, vec_enc); 5778 break; 5779 case T_LONG: 5780 evpexpandq(dst, mask, src, merge, vec_enc); 5781 break; 5782 case T_DOUBLE: 5783 evexpandpd(dst, mask, src, merge, vec_enc); 5784 break; 5785 default: 5786 fatal("Unsupported type %s", type2name(bt)); 5787 break; 5788 } 5789 } 5790 } 5791 #endif 5792 5793 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5794 KRegister ktmp1, int vec_enc) { 5795 if (opcode == Op_SignumVD) { 5796 vsubpd(dst, zero, one, vec_enc); 5797 // if src < 0 ? -1 : 1 5798 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5799 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5800 // if src == NaN, -0.0 or 0.0 return src. 5801 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5802 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5803 } else { 5804 assert(opcode == Op_SignumVF, ""); 5805 vsubps(dst, zero, one, vec_enc); 5806 // if src < 0 ? -1 : 1 5807 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5808 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5809 // if src == NaN, -0.0 or 0.0 return src. 5810 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5811 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5812 } 5813 } 5814 5815 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5816 XMMRegister xtmp1, int vec_enc) { 5817 if (opcode == Op_SignumVD) { 5818 vsubpd(dst, zero, one, vec_enc); 5819 // if src < 0 ? -1 : 1 5820 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5821 // if src == NaN, -0.0 or 0.0 return src. 5822 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5823 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5824 } else { 5825 assert(opcode == Op_SignumVF, ""); 5826 vsubps(dst, zero, one, vec_enc); 5827 // if src < 0 ? -1 : 1 5828 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5829 // if src == NaN, -0.0 or 0.0 return src. 
5830 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5831 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5832 } 5833 } 5834 5835 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5836 if (VM_Version::supports_avx512bw()) { 5837 if (mask_len > 32) { 5838 kmovql(dst, src); 5839 } else { 5840 kmovdl(dst, src); 5841 if (mask_len != 32) { 5842 kshiftrdl(dst, dst, 32 - mask_len); 5843 } 5844 } 5845 } else { 5846 assert(mask_len <= 16, ""); 5847 kmovwl(dst, src); 5848 if (mask_len != 16) { 5849 kshiftrwl(dst, dst, 16 - mask_len); 5850 } 5851 } 5852 } 5853 5854 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5855 int lane_size = type2aelembytes(bt); 5856 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5857 if ((is_LP64 || lane_size < 8) && 5858 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5859 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5860 movptr(rtmp, imm32); 5861 switch(lane_size) { 5862 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5863 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5864 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5865 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5866 default : fatal("Unsupported lane size %d", lane_size); 5867 break; 5868 } 5869 } else { 5870 movptr(rtmp, imm32); 5871 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5872 switch(lane_size) { 5873 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5874 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5875 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5876 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5877 default : fatal("Unsupported lane size %d", lane_size); 5878 break; 5879 } 5880 } 5881 } 5882 5883 // 5884 // Following is a lookup table based popcount computation algorithm:- 5885 // Index Bit set count 5886 // [ 0000 -> 0, 5887 // 0001 -> 1, 5888 // 0010 -> 1, 5889 // 0011 -> 2, 5890 // 0100 -> 1, 5891 // 0101 -> 2, 5892 // 0110 -> 2, 5893 // 0111 -> 3, 5894 // 1000 -> 1, 5895 // 1001 -> 2, 5896 // 1010 -> 2, 5897 // 1011 -> 3, 5898 // 1100 -> 2, 5899 // 1101 -> 3, // 1110 -> 3, 5900 // 1111 -> 4 ] 5901 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5902 // shuffle indices for lookup table access. 5903 // b. Right shift each byte of vector lane by 4 positions. 5904 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5905 // shuffle indices for lookup table access. 5906 // d. Add the bitset count of upper and lower 4 bits of each byte. 5907 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5908 // count of all the bytes of a quadword. 5909 // f. Perform step e. for upper 128bit vector lane. 5910 // g. Pack the bitset count of quadwords back to double word. 5911 // h. Unpacking and packing operations are not needed for 64bit vector lane.
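// A scalar sketch of steps a-d for one byte (illustrative only; the 16 values below
// are presumably what the vector_popcount_lut stub replicates per 16-byte lane):
//
//   static const uint8_t nibble_popcount[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
//                                                1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte(uint8_t b) {
//     return nibble_popcount[b & 0x0F] + nibble_popcount[b >> 4];
//   }
//
// Steps e-g then use vpsadbw against a zero vector to sum these per-byte counts
// within each quadword and repack the sums for the narrower element types.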
5912 5913 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5914 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5915 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5916 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5917 vpsrlw(dst, src, 4, vec_enc); 5918 vpand(dst, dst, xtmp1, vec_enc); 5919 vpand(xtmp1, src, xtmp1, vec_enc); 5920 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5921 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5922 vpshufb(dst, xtmp2, dst, vec_enc); 5923 vpaddb(dst, dst, xtmp1, vec_enc); 5924 } 5925 5926 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5927 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5928 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5929 // Following code is as per steps e,f,g and h of above algorithm. 5930 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5931 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5932 vpsadbw(dst, dst, xtmp2, vec_enc); 5933 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5934 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5935 vpackuswb(dst, xtmp1, dst, vec_enc); 5936 } 5937 5938 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5939 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5940 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5941 // Add the popcount of upper and lower bytes of word. 5942 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5943 vpsrlw(dst, xtmp1, 8, vec_enc); 5944 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5945 vpaddw(dst, dst, xtmp1, vec_enc); 5946 } 5947 5948 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5949 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5950 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5951 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5952 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5953 } 5954 5955 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5956 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5957 switch(bt) { 5958 case T_LONG: 5959 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5960 break; 5961 case T_INT: 5962 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5963 break; 5964 case T_CHAR: 5965 case T_SHORT: 5966 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5967 break; 5968 case T_BYTE: 5969 case T_BOOLEAN: 5970 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5971 break; 5972 default: 5973 fatal("Unsupported type %s", type2name(bt)); 5974 break; 5975 } 5976 } 5977 5978 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5979 KRegister mask, bool merge, int vec_enc) { 5980 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5981 switch(bt) { 5982 case T_LONG: 5983 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5984 evpopcntq(dst, mask, src, merge, vec_enc); 5985 break; 5986 case T_INT: 5987 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5988 evpopcntd(dst, mask, src, merge, vec_enc); 5989 break; 5990 case T_CHAR: 5991 case T_SHORT: 5992 assert(VM_Version::supports_avx512_bitalg(), ""); 5993 evpopcntw(dst, mask, src, merge, vec_enc); 5994 break; 5995 case T_BYTE: 5996 case T_BOOLEAN: 5997 assert(VM_Version::supports_avx512_bitalg(), ""); 5998 evpopcntb(dst, mask, 
src, merge, vec_enc); 5999 break; 6000 default: 6001 fatal("Unsupported type %s", type2name(bt)); 6002 break; 6003 } 6004 } 6005 6006 #ifndef _LP64 6007 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 6008 assert(VM_Version::supports_avx512bw(), ""); 6009 kmovdl(tmp, src); 6010 kunpckdql(dst, tmp, tmp); 6011 } 6012 #endif 6013 6014 // Bit reversal algorithm first reverses the bits of each byte followed by 6015 // a byte level reversal for multi-byte primitive types (short/int/long). 6016 // Algorithm performs a lookup table access to get reverse bit sequence 6017 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6018 // is obtained by swapping the reverse bit sequences of upper and lower 6019 // nibble of a byte. 6020 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6021 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6022 if (VM_Version::supports_avx512vlbw()) { 6023 6024 // Get the reverse bit sequence of lower nibble of each byte. 6025 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6026 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6027 evpandq(dst, xtmp2, src, vec_enc); 6028 vpshufb(dst, xtmp1, dst, vec_enc); 6029 vpsllq(dst, dst, 4, vec_enc); 6030 6031 // Get the reverse bit sequence of upper nibble of each byte. 6032 vpandn(xtmp2, xtmp2, src, vec_enc); 6033 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6034 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6035 6036 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6037 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6038 evporq(xtmp2, dst, xtmp2, vec_enc); 6039 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6040 6041 } else if(vec_enc == Assembler::AVX_512bit) { 6042 // Shift based bit reversal. 6043 assert(bt == T_LONG || bt == T_INT, ""); 6044 6045 // Swap lower and upper nibble of each byte. 6046 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6047 6048 // Swap two least and most significant bits of each nibble. 6049 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6050 6051 // Swap adjacent pair of bits. 6052 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6053 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6054 6055 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6056 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6057 } else { 6058 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6059 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6060 6061 // Get the reverse bit sequence of lower nibble of each byte. 6062 vpand(dst, xtmp2, src, vec_enc); 6063 vpshufb(dst, xtmp1, dst, vec_enc); 6064 vpsllq(dst, dst, 4, vec_enc); 6065 6066 // Get the reverse bit sequence of upper nibble of each byte. 6067 vpandn(xtmp2, xtmp2, src, vec_enc); 6068 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6069 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6070 6071 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6072 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
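// (Worked example: src byte 0b11000101: the lower nibble 0101 reverses to 1010 and is
// shifted into the upper half, the upper nibble 1100 reverses to 0011 in the lower
// half, and the OR below yields 0b10100011, the bit-reversed byte.)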
6073 vpor(xtmp2, dst, xtmp2, vec_enc); 6074 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6075 } 6076 } 6077 6078 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6079 XMMRegister xtmp, Register rscratch) { 6080 assert(VM_Version::supports_gfni(), ""); 6081 assert(rscratch != noreg || always_reachable(mask), "missing"); 6082 6083 // Galois field instruction based bit reversal based on following algorithm. 6084 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6085 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6086 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6087 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6088 } 6089 6090 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6091 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6092 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6093 evpandq(dst, xtmp1, src, vec_enc); 6094 vpsllq(dst, dst, nbits, vec_enc); 6095 vpandn(xtmp1, xtmp1, src, vec_enc); 6096 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6097 evporq(dst, dst, xtmp1, vec_enc); 6098 } 6099 6100 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6101 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6102 // Shift based bit reversal. 6103 assert(VM_Version::supports_evex(), ""); 6104 switch(bt) { 6105 case T_LONG: 6106 // Swap upper and lower double word of each quad word. 6107 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6108 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6109 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6110 break; 6111 case T_INT: 6112 // Swap upper and lower word of each double word. 6113 evprord(xtmp1, k0, src, 16, true, vec_enc); 6114 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6115 break; 6116 case T_CHAR: 6117 case T_SHORT: 6118 // Swap upper and lower byte of each word. 6119 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6120 break; 6121 case T_BYTE: 6122 evmovdquq(dst, k0, src, true, vec_enc); 6123 break; 6124 default: 6125 fatal("Unsupported type %s", type2name(bt)); 6126 break; 6127 } 6128 } 6129 6130 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6131 if (bt == T_BYTE) { 6132 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6133 evmovdquq(dst, k0, src, true, vec_enc); 6134 } else { 6135 vmovdqu(dst, src); 6136 } 6137 return; 6138 } 6139 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6140 // pre-computed shuffle indices. 
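// (For T_INT, for instance, each 128-bit lane of the permutation constant is, in
// effect, the byte-index sequence 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12, so the
// vpshufb below swaps the bytes within every 4-byte element; the exact constants
// live with the stub routines.)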
6141 switch(bt) { 6142 case T_LONG: 6143 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6144 break; 6145 case T_INT: 6146 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6147 break; 6148 case T_CHAR: 6149 case T_SHORT: 6150 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6151 break; 6152 default: 6153 fatal("Unsupported type %s", type2name(bt)); 6154 break; 6155 } 6156 vpshufb(dst, src, dst, vec_enc); 6157 } 6158 6159 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6160 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6161 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6162 assert(is_integral_type(bt), ""); 6163 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6164 assert(VM_Version::supports_avx512cd(), ""); 6165 switch(bt) { 6166 case T_LONG: 6167 evplzcntq(dst, ktmp, src, merge, vec_enc); 6168 break; 6169 case T_INT: 6170 evplzcntd(dst, ktmp, src, merge, vec_enc); 6171 break; 6172 case T_SHORT: 6173 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6174 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6175 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6176 vpunpckhwd(dst, xtmp1, src, vec_enc); 6177 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6178 vpackusdw(dst, xtmp2, dst, vec_enc); 6179 break; 6180 case T_BYTE: 6181 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6182 // accessing the lookup table. 6183 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6184 // accessing the lookup table. 6185 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6186 assert(VM_Version::supports_avx512bw(), ""); 6187 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6188 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6189 vpand(xtmp2, dst, src, vec_enc); 6190 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6191 vpsrlw(xtmp3, src, 4, vec_enc); 6192 vpand(xtmp3, dst, xtmp3, vec_enc); 6193 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6194 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6195 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6196 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6197 break; 6198 default: 6199 fatal("Unsupported type %s", type2name(bt)); 6200 break; 6201 } 6202 } 6203 6204 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6205 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6206 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6207 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6208 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6209 // accessing the lookup table. 6210 vpand(dst, xtmp2, src, vec_enc); 6211 vpshufb(dst, xtmp1, dst, vec_enc); 6212 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6213 // accessing the lookup table. 6214 vpsrlw(xtmp3, src, 4, vec_enc); 6215 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6216 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6217 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
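// (Worked example: src byte 0x1A has a non-zero high nibble 0x1, so the result is
// just T2 == 3; src byte 0x0A has a zero high nibble, so the result is
// T2 + T1 == 4 + 0 == 4, the leading zero count of the full byte.)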

void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
  // the biased exponent can be used to compute the leading zero count as per the following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for zero, max_int and negative source values.

  // Broadcast 0xFF
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
  vpsrld(xtmp1, xtmp1, 24, vec_enc);

  // Extract biased exponent.
  vcvtdq2ps(dst, src, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);

  // Broadcast 127.
  vpsrld(xtmp1, xtmp1, 1, vec_enc);
  // Exponent = biased_exp - 127
  vpsubd(dst, dst, xtmp1, vec_enc);

  // Exponent = Exponent + 1
  vpsrld(xtmp3, xtmp1, 6, vec_enc);
  vpaddd(dst, dst, xtmp3, vec_enc);

  // Replace negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);

  // Rematerialize broadcast 32.
  vpslld(xtmp1, xtmp3, 5, vec_enc);
  // Exponent is 32 if the corresponding source lane contains a max_int value.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // LZCNT = 32 - exponent
  vpsubd(dst, xtmp1, dst, vec_enc);

  // Replace LZCNT with the value 1 if the corresponding source lane contains a max_int value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);

  // Replace LZCNT with 0 if the source lane value is less than zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}
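
// A simplified scalar model of the float-exponent trick used above (hypothetical helper,
// names are illustrative). It shows the idea only: the vector sequence blends in fixed
// results for zero, negative and max_int-range lanes, mirrored here with explicit branches,
// and this sketch is not claimed to be bit-exact for every input.
static inline uint32_t clz32_via_float_sketch(uint32_t x) {
  if (x == 0)          return 32;  // zero lanes are fixed up via the negative-exponent blend
  if (x & 0x80000000u) return 0;   // negative lanes are blended to 0 at the end
  float f = (float)(int32_t)x;     // vcvtdq2ps
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits)); // assumes memcpy is visible in this translation unit
  uint32_t biased_exp = (bits >> 23) & 0xFF;
  uint32_t exponent = (biased_exp - 127) + 1;  // number of significant bits
  if (exponent == 32) return 1;    // max_int-range lanes round up to 2^31; blend in 1
  return 32 - exponent;
}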

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower word and upper word of a double word if
  // upper word holds a zero value.
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrld(dst, dst, 16, vec_enc);
  // Add zero counts of lower doubleword and upper doubleword of a
  // quadword if upper doubleword holds a zero value.
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
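
// Scalar models of the two trailing-zero-count identities used above (hypothetical,
// loop-based reference helpers for clarity; the VM uses vector instructions instead).
static inline uint32_t clz32_ref(uint32_t x) {
  uint32_t n = 0;
  while (n < 32 && (x & (0x80000000u >> n)) == 0) n++;
  return n;
}
static inline uint32_t popcnt32_ref(uint32_t x) {
  uint32_t n = 0;
  for (; x != 0; x >>= 1) n += x & 1;
  return n;
}
static inline uint32_t ctz32_via_clz_sketch(uint32_t x) {
  // (x - 1) & ~x sets exactly the bits below the lowest set bit of x.
  return 32 - clz32_ref((x - 1) & ~x);
}
static inline uint32_t ctz32_via_popcnt_sketch(uint32_t x) {
  // x | -x keeps the lowest set bit and everything above it.
  return 32 - popcnt32_ref(x | (0u - x));
}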

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
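
// Scalar model of the divisor-with-sign-bit-set fast path above (hypothetical helper):
// when the divisor is >= 2^31 as an unsigned number, the unsigned quotient can only be
// 0 or 1, so it is computed branch-free from the sign of dividend & ~(dividend - divisor).
static inline void udivmod32_negdiv_sketch(uint32_t dividend, uint32_t divisor,
                                           uint32_t* q, uint32_t* r) {
  // Precondition mirrored from the jcc above: (int32_t)divisor < 0.
  uint32_t t = dividend & ~(dividend - divisor);  // high bit == (dividend >= divisor)
  *q = t >> 31;                                   // logical shift    -> quotient is 0 or 1
  uint32_t m = (uint32_t)((int32_t)t >> 31);      // arithmetic shift -> 0 or all-ones mask
  *r = dividend - (m & divisor);                  // subtract divisor only when quotient is 1
}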

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}
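
// Scalar model of the non-GFNI path above (hypothetical helper): the three masked swaps
// reverse the bits within each byte, and the final bswap reverses the byte order, which
// together give a full 32-bit bit reversal.
static inline uint32_t reverse32_sketch(uint32_t x) {
  x = ((x & 0x55555555u) << 1) | ((x & 0xAAAAAAAAu) >> 1);  // swap even and odd bits
  x = ((x & 0x33333333u) << 2) | ((x & 0xCCCCCCCCu) >> 2);  // swap 2-bit pairs
  x = ((x & 0x0F0F0F0Fu) << 4) | ((x & 0xF0F0F0F0u) >> 4);  // swap nibbles
  return (x << 24) | ((x << 8) & 0x00FF0000u) |             // bswapl
         ((x >> 8) & 0x0000FF00u) | (x >> 24);
}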

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all the multiples
  // of an index value are placed at the same relative position in a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be the 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask bits to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression as INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression as INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression as INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
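
// Scalar model of the 64-byte rearrange above (hypothetical helper): a cross-lane byte
// shuffle decomposes into four in-lane shuffles, one per 128-bit source lane, selected by
// the upper bits of each index. Shuffle indices are assumed to already be in range 0..63.
static inline void rearrange_bytes_sketch(const uint8_t src[64], const uint8_t shuffle[64],
                                          uint8_t dst[64]) {
  for (int i = 0; i < 64; i++) {
    int lane = shuffle[i] >> 4;           // which 128-bit lane of src (0..3), chosen via ktmp
    int pos  = shuffle[i] & 15;           // in-lane position, the part vpshufb actually uses
    dst[i] = src[lane * 16 + pos];
  }
}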

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison:
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
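
// Scalar model of the AVX unsigned saturating subtraction above (hypothetical helper,
// T_INT case): underflow clamps to zero, and the unsigned compare is emulated by biasing
// both operands with MIN_VALUE (the vector code adds the broadcast minimum, which is
// equivalent to XOR-ing the sign bit) before comparing signed.
static inline uint32_t usub_sat32_sketch(uint32_t a, uint32_t b) {
  bool underflow = (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);  // a <u b
  return underflow ? 0u : a - b;
}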

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper-bound saturation exists.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation:
// overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // T1 = Minimum signed value.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
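
// Scalar model of the unsigned saturating addition above (hypothetical helper, T_INT case),
// using the reduced overflow test documented in the comment block above: the wrapped sum is
// unsigned-smaller than (a | b) exactly when the addition overflowed.
static inline uint32_t uadd_sat32_sketch(uint32_t a, uint32_t b) {
  uint32_t sum = a + b;                   // wrapping add, as in vpadd
  return (sum < (a | b)) ? 0xFFFFFFFFu    // saturate to Max_Unsigned
                         : sum;
}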

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
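
// Scalar model of the MIN/MAX generation above (hypothetical helper, T_INT case): starting
// from an all-ones value, a logical right shift by one yields MAX_VALUE and a left shift by
// width-1 yields MIN_VALUE, so no constant load is needed.
static inline void genminmax32_sketch(int32_t* min_val, int32_t* max_val) {
  uint32_t allones = 0xFFFFFFFFu;         // vpternlogd(..., 0xff, ...) / vpcmpeqq
  *max_val = (int32_t)(allones >> 1);     // 0x7FFFFFFF
  *min_val = (int32_t)(allones << 31);    // 0x80000000
}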

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using the overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
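
// Scalar model of the signed saturating addition above (hypothetical helper, T_INT case).
// The overflow test is the Hacker's Delight section 2-13 check, and on overflow the result
// saturates toward the sign of the first input (MIN for a negative input, MAX otherwise);
// the subtraction variant differs only in using ((a ^ b) & (res ^ a)) as the overflow term.
static inline int32_t sadd_sat32_sketch(int32_t a, int32_t b) {
  int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);       // wrapping add
  bool overflow = ((res ^ a) & (res ^ b)) < 0;              // sign bit of the overflow term
  if (!overflow) return res;
  return (a < 0) ? (int32_t)0x80000000 : 0x7FFFFFFF;        // MIN_VALUE : MAX_VALUE
}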

void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
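
// Scalar model of the two-table permute above (hypothetical helper, T_INT case), assuming
// the usual VPERMI2 semantics: each index selects from the concatenation {src1, src2} and
// the index register itself receives the result.
static inline void select_from_two_vectors_sketch(const int32_t* src1, const int32_t* src2,
                                                  const uint32_t* index, int32_t* dst, int n) {
  for (int i = 0; i < n; i++) {
    uint32_t idx = index[i] & (uint32_t)(2 * n - 1);  // index is taken modulo 2 * n
    dst[i] = (idx < (uint32_t)n) ? src1[idx] : src2[idx - n];
  }
}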

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}