/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case 4:  // fall-through
    case 8:  // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value);                           // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
#else
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
#ifdef _LP64
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
#endif
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
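
// A minimal sketch (illustrative only, not emitted code) of how the ZF
// protocol above is consumed on the C2 side. The real matching rules are the
// cmpFastLock/cmpFastUnlock instructs in the .ad file and may differ in shape:
//
//   fast_lock(obj, box, ...)      // leaves ZF per the protocol above
//   jne   slow_path               // ZF == 0: call the runtime monitorenter helper
//   ...                           // ZF == 1: locked inline, fall through
//
//   fast_unlock(obj, box, tmp)    // same ZF protocol
//   jne   slow_path               // ZF == 0: call the runtime monitorexit helper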

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);
#else
  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);

  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);
#endif // _LP64

  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);

  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
#ifdef _LP64
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif
  }

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

#ifndef _LP64
    // Just take slow path to avoid dealing with 64 bit atomic instructions here.
    orl(box, 1);  // set ICC.ZF=0 to indicate failure
    jmpb(slow_path);
#else
    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
#endif // _LP64
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifndef _LP64
    // Just take slow path to avoid dealing with 64 bit atomic instructions here.
    orl(t, 1);  // set ICC.ZF=0 to indicate failure
    jmpb(slow_path);
#else
    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
#endif // _LP64
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
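
// Rough sketch of the per-thread lock-stack that the two lightweight paths
// above manipulate (illustrative only; offsets and names follow the
// JavaThread/LockStack accessors used in the code, not drawn to scale):
//
//   thread + lock_stack_top_offset()  : byte offset of the current top
//   thread + top - oopSize            : most recently pushed oop (checked/popped here)
//   thread + top - 2 * oopSize        : previous entry, used for the recursion check
//   LockStack::end_offset()           : capacity limit checked before pushing
//
// fast_lock_lightweight pushes obj and bumps the top by oopSize on success;
// fast_unlock_lightweight expects obj at top - oopSize and pops it again.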

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
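  // Worked example of the pseudo code above (illustrative only), for the max case:
  //   a = -0.0, b = +0.0 : the sign-bit blends leave atmp = -0.0, btmp = +0.0;
  //                        vmaxps of two equal inputs returns the second operand,
  //                        so the result is +0.0, matching Java's Math.max.
  //   a = NaN,  b = 1.0  : atmp = NaN, so the UNORD_Q compare selects atmp in the
  //                        final blend and the NaN is propagated, as Java requires.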

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

#ifdef _LP64
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
                                                XMMRegister dst, Register base,
                                                Register idx_base,
                                                Register offset, Register mask,
                                                Register mask_idx, Register rtmp,
                                                int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      if (offset != noreg) {
        addl(rtmp, offset);
      }
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
src[offset + idx_base[i]] : 0 1551 Label skip_load; 1552 btq(mask, mask_idx); 1553 jccb(Assembler::carryClear, skip_load); 1554 movl(rtmp, Address(idx_base, i * 4)); 1555 if (offset != noreg) { 1556 addl(rtmp, offset); 1557 } 1558 pinsrb(dst, Address(base, rtmp), i); 1559 bind(skip_load); 1560 incq(mask_idx); 1561 } 1562 } 1563 } 1564 #endif // _LP64 1565 1566 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1567 Register base, Register idx_base, 1568 Register offset, Register rtmp, 1569 int vlen_enc) { 1570 vpxor(dst, dst, dst, vlen_enc); 1571 if (elem_bt == T_SHORT) { 1572 for (int i = 0; i < 4; i++) { 1573 // dst[i] = src[offset + idx_base[i]] 1574 movl(rtmp, Address(idx_base, i * 4)); 1575 if (offset != noreg) { 1576 addl(rtmp, offset); 1577 } 1578 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1579 } 1580 } else { 1581 assert(elem_bt == T_BYTE, ""); 1582 for (int i = 0; i < 8; i++) { 1583 // dst[i] = src[offset + idx_base[i]] 1584 movl(rtmp, Address(idx_base, i * 4)); 1585 if (offset != noreg) { 1586 addl(rtmp, offset); 1587 } 1588 pinsrb(dst, Address(base, rtmp), i); 1589 } 1590 } 1591 } 1592 1593 /* 1594 * Gather using hybrid algorithm, first partially unroll scalar loop 1595 * to accumulate values from gather indices into a quad-word(64bit) slice. 1596 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1597 * permutation to place the slice into appropriate vector lane 1598 * locations in destination vector. Following pseudo code describes the 1599 * algorithm in detail: 1600 * 1601 * DST_VEC = ZERO_VEC 1602 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1603 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1604 * FOREACH_ITER: 1605 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1606 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1607 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1608 * PERM_INDEX = PERM_INDEX - TWO_VEC 1609 * 1610 * With each iteration, doubleword permute indices (0,1) corresponding 1611 * to gathered quadword gets right shifted by two lane positions. 1612 * 1613 */ 1614 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1615 Register base, Register idx_base, 1616 Register offset, Register mask, 1617 XMMRegister xtmp1, XMMRegister xtmp2, 1618 XMMRegister temp_dst, Register rtmp, 1619 Register mask_idx, Register length, 1620 int vector_len, int vlen_enc) { 1621 Label GATHER8_LOOP; 1622 assert(is_subword_type(elem_ty), ""); 1623 movl(length, vector_len); 1624 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1625 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1626 vallones(xtmp2, vlen_enc); 1627 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1628 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1629 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1630 1631 bind(GATHER8_LOOP); 1632 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1633 if (mask == noreg) { 1634 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1635 } else { 1636 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1637 } 1638 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1639 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1640 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1641 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1642 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1643 vpor(dst, dst, temp_dst, vlen_enc); 1644 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1645 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1646 jcc(Assembler::notEqual, GATHER8_LOOP); 1647 } 1648 1649 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1650 switch(typ) { 1651 case T_INT: 1652 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1653 break; 1654 case T_FLOAT: 1655 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1656 break; 1657 case T_LONG: 1658 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1659 break; 1660 case T_DOUBLE: 1661 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1662 break; 1663 default: 1664 assert(false,"Should not reach here."); 1665 break; 1666 } 1667 } 1668 1669 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1670 switch(typ) { 1671 case T_INT: 1672 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1673 break; 1674 case T_FLOAT: 1675 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1676 break; 1677 case T_LONG: 1678 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1679 break; 1680 case T_DOUBLE: 1681 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1682 break; 1683 default: 1684 assert(false,"Should not reach here."); 1685 break; 1686 } 1687 } 1688 1689 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1690 switch(typ) { 1691 case T_INT: 1692 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1693 break; 1694 case T_FLOAT: 1695 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1696 break; 1697 case T_LONG: 1698 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1699 break; 1700 case T_DOUBLE: 1701 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1702 break; 1703 default: 1704 assert(false,"Should not reach here."); 1705 break; 1706 } 1707 } 1708 1709 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1710 if (vlen_in_bytes <= 16) { 1711 pxor (dst, dst); 1712 psubb(dst, src); 1713 switch (elem_bt) { 1714 case T_BYTE: /* nothing to do */ break; 1715 case T_SHORT: pmovsxbw(dst, dst); break; 1716 case T_INT: pmovsxbd(dst, dst); break; 1717 case T_FLOAT: pmovsxbd(dst, dst); break; 1718 case T_LONG: pmovsxbq(dst, dst); break; 1719 case T_DOUBLE: pmovsxbq(dst, dst); break; 1720 1721 default: assert(false, "%s", type2name(elem_bt)); 1722 } 1723 } else { 1724 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1725 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1726 1727 vpxor (dst, dst, dst, vlen_enc); 1728 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1729 1730 switch (elem_bt) { 1731 case T_BYTE: /* nothing to do */ break; 1732 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1733 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1734 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1735 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1736 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1737 1738 default: assert(false, "%s", type2name(elem_bt)); 1739 } 1740 } 1741 } 1742 1743 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1744 if (novlbwdq) { 1745 vpmovsxbd(xtmp, src, vlen_enc); 1746 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1747 Assembler::eq, true, vlen_enc, noreg); 1748 } else { 1749 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1750 vpsubb(xtmp, xtmp, src, vlen_enc); 1751 evpmovb2m(dst, xtmp, vlen_enc); 1752 } 1753 } 1754 1755 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1756 if (is_integral_type(bt)) { 1757 switch (vlen_in_bytes) { 1758 case 4: movdl(dst, src); break; 1759 case 8: movq(dst, src); break; 1760 case 16: movdqu(dst, src); break; 1761 case 32: vmovdqu(dst, src); break; 1762 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1763 default: ShouldNotReachHere(); 1764 } 1765 } else { 1766 switch (vlen_in_bytes) { 1767 case 4: movflt(dst, src); break; 1768 case 8: movdbl(dst, src); break; 1769 case 16: movups(dst, src); break; 1770 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1771 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1772 default: ShouldNotReachHere(); 1773 } 1774 } 1775 } 1776 1777 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1778 assert(rscratch != noreg || always_reachable(src), "missing"); 1779 1780 if (reachable(src)) { 1781 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1782 } else { 1783 lea(rscratch, src); 1784 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1785 } 1786 } 1787 1788 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1789 int vlen_enc = vector_length_encoding(vlen); 1790 if (VM_Version::supports_avx()) { 1791 if (bt == T_LONG) { 1792 if (VM_Version::supports_avx2()) { 1793 vpbroadcastq(dst, src, vlen_enc); 1794 } else { 1795 vmovddup(dst, src, vlen_enc); 1796 } 1797 } else if (bt == T_DOUBLE) { 1798 if (vlen_enc != Assembler::AVX_128bit) { 1799 vbroadcastsd(dst, src, vlen_enc, noreg); 1800 } else { 1801 vmovddup(dst, src, vlen_enc); 1802 } 1803 } else { 1804 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1805 vpbroadcastd(dst, src, vlen_enc); 1806 } else { 1807 vbroadcastss(dst, src, vlen_enc); 1808 } 1809 } 1810 } else if (VM_Version::supports_sse3()) { 1811 movddup(dst, src); 1812 } else { 1813 load_vector(bt, dst, src, vlen); 1814 } 1815 } 1816 1817 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1818 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1819 int offset = exact_log2(type2aelembytes(bt)) << 6; 1820 if (is_floating_point_type(bt)) { 1821 offset += 128; 1822 } 1823 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1824 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1825 } 1826 1827 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
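//
// The reduce* helpers below all follow the same halving strategy: the upper half of the
// vector is folded into the lower half with the matching packed instruction (via
// reduce_operation_128/_256) until a single element is left, which is then combined with
// the scalar input src1. Worked example (values only, not emitted code) for an 8-way int
// add-reduction of {1,2,3,4,5,6,7,8} with src1 == 100:
//   256-bit -> 128-bit : {1+5, 2+6, 3+7, 4+8} = {6, 8, 10, 12}
//   128-bit -> 64-bit  : {6+10, 8+12}         = {16, 20}
//   64-bit  -> element : 16+20                = 36
//   combine with src1  : 36+100               = 136 -> dst
// The int/short add-reduction paths use phaddd/phaddw (and their VEX forms) for the
// horizontal adds instead.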
1828 1829 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1830 int vector_len = Assembler::AVX_128bit; 1831 1832 switch (opcode) { 1833 case Op_AndReductionV: pand(dst, src); break; 1834 case Op_OrReductionV: por (dst, src); break; 1835 case Op_XorReductionV: pxor(dst, src); break; 1836 case Op_MinReductionV: 1837 switch (typ) { 1838 case T_BYTE: pminsb(dst, src); break; 1839 case T_SHORT: pminsw(dst, src); break; 1840 case T_INT: pminsd(dst, src); break; 1841 case T_LONG: assert(UseAVX > 2, "required"); 1842 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1843 default: assert(false, "wrong type"); 1844 } 1845 break; 1846 case Op_MaxReductionV: 1847 switch (typ) { 1848 case T_BYTE: pmaxsb(dst, src); break; 1849 case T_SHORT: pmaxsw(dst, src); break; 1850 case T_INT: pmaxsd(dst, src); break; 1851 case T_LONG: assert(UseAVX > 2, "required"); 1852 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1853 default: assert(false, "wrong type"); 1854 } 1855 break; 1856 case Op_AddReductionVF: addss(dst, src); break; 1857 case Op_AddReductionVD: addsd(dst, src); break; 1858 case Op_AddReductionVI: 1859 switch (typ) { 1860 case T_BYTE: paddb(dst, src); break; 1861 case T_SHORT: paddw(dst, src); break; 1862 case T_INT: paddd(dst, src); break; 1863 default: assert(false, "wrong type"); 1864 } 1865 break; 1866 case Op_AddReductionVL: paddq(dst, src); break; 1867 case Op_MulReductionVF: mulss(dst, src); break; 1868 case Op_MulReductionVD: mulsd(dst, src); break; 1869 case Op_MulReductionVI: 1870 switch (typ) { 1871 case T_SHORT: pmullw(dst, src); break; 1872 case T_INT: pmulld(dst, src); break; 1873 default: assert(false, "wrong type"); 1874 } 1875 break; 1876 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1877 evpmullq(dst, dst, src, vector_len); break; 1878 default: assert(false, "wrong opcode"); 1879 } 1880 } 1881 1882 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1883 switch (opcode) { 1884 case Op_AddReductionVF: addps(dst, src); break; 1885 case Op_AddReductionVD: addpd(dst, src); break; 1886 case Op_MulReductionVF: mulps(dst, src); break; 1887 case Op_MulReductionVD: mulpd(dst, src); break; 1888 default: assert(false, "%s", NodeClassNames[opcode]); 1889 } 1890 } 1891 1892 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1893 int vector_len = Assembler::AVX_256bit; 1894 1895 switch (opcode) { 1896 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1897 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1898 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1899 case Op_MinReductionV: 1900 switch (typ) { 1901 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1902 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1903 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1904 case T_LONG: assert(UseAVX > 2, "required"); 1905 vpminsq(dst, src1, src2, vector_len); break; 1906 default: assert(false, "wrong type"); 1907 } 1908 break; 1909 case Op_MaxReductionV: 1910 switch (typ) { 1911 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1912 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1913 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1914 case T_LONG: assert(UseAVX > 2, "required"); 1915 vpmaxsq(dst, src1, src2, vector_len); break; 1916 default: assert(false, "wrong type"); 1917 } 
1918 break; 1919 case Op_AddReductionVI: 1920 switch (typ) { 1921 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1922 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1923 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1924 default: assert(false, "wrong type"); 1925 } 1926 break; 1927 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1928 case Op_MulReductionVI: 1929 switch (typ) { 1930 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1931 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1932 default: assert(false, "wrong type"); 1933 } 1934 break; 1935 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1936 default: assert(false, "wrong opcode"); 1937 } 1938 } 1939 1940 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1941 int vector_len = Assembler::AVX_256bit; 1942 1943 switch (opcode) { 1944 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1945 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1946 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1947 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1948 default: assert(false, "%s", NodeClassNames[opcode]); 1949 } 1950 } 1951 1952 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1953 XMMRegister dst, XMMRegister src, 1954 XMMRegister vtmp1, XMMRegister vtmp2) { 1955 switch (opcode) { 1956 case Op_AddReductionVF: 1957 case Op_MulReductionVF: 1958 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1959 break; 1960 1961 case Op_AddReductionVD: 1962 case Op_MulReductionVD: 1963 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1964 break; 1965 1966 default: assert(false, "wrong opcode"); 1967 } 1968 } 1969 1970 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1971 XMMRegister dst, XMMRegister src, 1972 XMMRegister vtmp1, XMMRegister vtmp2) { 1973 switch (opcode) { 1974 case Op_AddReductionVF: 1975 case Op_MulReductionVF: 1976 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1977 break; 1978 1979 case Op_AddReductionVD: 1980 case Op_MulReductionVD: 1981 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1982 break; 1983 1984 default: assert(false, "%s", NodeClassNames[opcode]); 1985 } 1986 } 1987 1988 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1989 Register dst, Register src1, XMMRegister src2, 1990 XMMRegister vtmp1, XMMRegister vtmp2) { 1991 switch (vlen) { 1992 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1993 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1994 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1995 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1996 1997 default: assert(false, "wrong vector length"); 1998 } 1999 } 2000 2001 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2002 Register dst, Register src1, XMMRegister src2, 2003 XMMRegister vtmp1, XMMRegister vtmp2) { 2004 switch (vlen) { 2005 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2006 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2007 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2008 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2009 2010 default: assert(false, "wrong vector length"); 2011 } 2012 } 2013 2014 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2015 Register dst, Register src1, XMMRegister src2, 
2016 XMMRegister vtmp1, XMMRegister vtmp2) { 2017 switch (vlen) { 2018 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2019 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2020 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2021 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2022 2023 default: assert(false, "wrong vector length"); 2024 } 2025 } 2026 2027 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2028 Register dst, Register src1, XMMRegister src2, 2029 XMMRegister vtmp1, XMMRegister vtmp2) { 2030 switch (vlen) { 2031 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2032 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2033 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2034 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2035 2036 default: assert(false, "wrong vector length"); 2037 } 2038 } 2039 2040 #ifdef _LP64 2041 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2042 Register dst, Register src1, XMMRegister src2, 2043 XMMRegister vtmp1, XMMRegister vtmp2) { 2044 switch (vlen) { 2045 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2046 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2047 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2048 2049 default: assert(false, "wrong vector length"); 2050 } 2051 } 2052 #endif // _LP64 2053 2054 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2055 switch (vlen) { 2056 case 2: 2057 assert(vtmp2 == xnoreg, ""); 2058 reduce2F(opcode, dst, src, vtmp1); 2059 break; 2060 case 4: 2061 assert(vtmp2 == xnoreg, ""); 2062 reduce4F(opcode, dst, src, vtmp1); 2063 break; 2064 case 8: 2065 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2066 break; 2067 case 16: 2068 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2069 break; 2070 default: assert(false, "wrong vector length"); 2071 } 2072 } 2073 2074 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2075 switch (vlen) { 2076 case 2: 2077 assert(vtmp2 == xnoreg, ""); 2078 reduce2D(opcode, dst, src, vtmp1); 2079 break; 2080 case 4: 2081 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2082 break; 2083 case 8: 2084 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2085 break; 2086 default: assert(false, "wrong vector length"); 2087 } 2088 } 2089 2090 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2091 switch (vlen) { 2092 case 2: 2093 assert(vtmp1 == xnoreg, ""); 2094 assert(vtmp2 == xnoreg, ""); 2095 unorderedReduce2F(opcode, dst, src); 2096 break; 2097 case 4: 2098 assert(vtmp2 == xnoreg, ""); 2099 unorderedReduce4F(opcode, dst, src, vtmp1); 2100 break; 2101 case 8: 2102 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2103 break; 2104 case 16: 2105 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2106 break; 2107 default: assert(false, "wrong vector length"); 2108 } 2109 } 2110 2111 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2112 switch (vlen) { 2113 case 2: 2114 assert(vtmp1 == xnoreg, ""); 2115 assert(vtmp2 == xnoreg, ""); 2116 unorderedReduce2D(opcode, dst, src); 2117 break; 2118 case 4: 2119 assert(vtmp2 == xnoreg, ""); 2120 unorderedReduce4D(opcode, dst, src, vtmp1); 2121 break; 2122 case 8: 
2123 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2124 break; 2125 default: assert(false, "wrong vector length"); 2126 } 2127 } 2128 2129 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2130 if (opcode == Op_AddReductionVI) { 2131 if (vtmp1 != src2) { 2132 movdqu(vtmp1, src2); 2133 } 2134 phaddd(vtmp1, vtmp1); 2135 } else { 2136 pshufd(vtmp1, src2, 0x1); 2137 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2138 } 2139 movdl(vtmp2, src1); 2140 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2141 movdl(dst, vtmp1); 2142 } 2143 2144 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2145 if (opcode == Op_AddReductionVI) { 2146 if (vtmp1 != src2) { 2147 movdqu(vtmp1, src2); 2148 } 2149 phaddd(vtmp1, src2); 2150 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2151 } else { 2152 pshufd(vtmp2, src2, 0xE); 2153 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2154 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2155 } 2156 } 2157 2158 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2159 if (opcode == Op_AddReductionVI) { 2160 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2161 vextracti128_high(vtmp2, vtmp1); 2162 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2163 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2164 } else { 2165 vextracti128_high(vtmp1, src2); 2166 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2167 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2168 } 2169 } 2170 2171 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2172 vextracti64x4_high(vtmp2, src2); 2173 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2174 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2175 } 2176 2177 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2178 pshufd(vtmp2, src2, 0x1); 2179 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2180 movdqu(vtmp1, vtmp2); 2181 psrldq(vtmp1, 2); 2182 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2183 movdqu(vtmp2, vtmp1); 2184 psrldq(vtmp2, 1); 2185 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2186 movdl(vtmp2, src1); 2187 pmovsxbd(vtmp1, vtmp1); 2188 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2189 pextrb(dst, vtmp1, 0x0); 2190 movsbl(dst, dst); 2191 } 2192 2193 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2194 pshufd(vtmp1, src2, 0xE); 2195 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2196 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2197 } 2198 2199 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2200 vextracti128_high(vtmp2, src2); 2201 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2202 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2203 } 2204 2205 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2206 vextracti64x4_high(vtmp1, src2); 2207 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2208 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2209 } 2210 2211 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2212 pmovsxbw(vtmp2, src2); 2213 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2214 } 2215 2216 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2217 if (UseAVX > 1) { 2218 int vector_len = Assembler::AVX_256bit; 2219 vpmovsxbw(vtmp1, src2, vector_len); 2220 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2221 } else { 2222 pmovsxbw(vtmp2, src2); 2223 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2224 pshufd(vtmp2, src2, 0x1); 2225 pmovsxbw(vtmp2, src2); 2226 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2227 } 2228 } 2229 2230 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2231 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2232 int vector_len = Assembler::AVX_512bit; 2233 vpmovsxbw(vtmp1, src2, vector_len); 2234 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2235 } else { 2236 assert(UseAVX >= 2,"Should not reach here."); 2237 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2238 vextracti128_high(vtmp2, src2); 2239 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2240 } 2241 } 2242 2243 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2245 vextracti64x4_high(vtmp2, src2); 2246 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2247 } 2248 2249 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2250 if (opcode == Op_AddReductionVI) { 2251 if (vtmp1 != src2) { 2252 movdqu(vtmp1, src2); 2253 } 2254 phaddw(vtmp1, vtmp1); 2255 phaddw(vtmp1, vtmp1); 2256 } else { 2257 pshufd(vtmp2, src2, 0x1); 2258 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2259 movdqu(vtmp1, vtmp2); 2260 psrldq(vtmp1, 2); 2261 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2262 } 2263 movdl(vtmp2, src1); 2264 pmovsxwd(vtmp1, vtmp1); 2265 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2266 pextrw(dst, vtmp1, 0x0); 2267 movswl(dst, dst); 2268 } 2269 2270 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2271 if (opcode == Op_AddReductionVI) { 2272 if (vtmp1 != src2) { 2273 movdqu(vtmp1, src2); 2274 } 2275 phaddw(vtmp1, src2); 2276 } else { 2277 pshufd(vtmp1, src2, 0xE); 2278 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2279 } 2280 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2281 } 2282 2283 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2284 if (opcode == Op_AddReductionVI) { 2285 int vector_len = Assembler::AVX_256bit; 2286 vphaddw(vtmp2, src2, src2, vector_len); 2287 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2288 } else { 2289 vextracti128_high(vtmp2, src2); 2290 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2291 } 2292 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2293 } 2294 2295 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2296 int vector_len = Assembler::AVX_256bit; 2297 vextracti64x4_high(vtmp1, src2); 2298 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2299 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2300 } 2301 2302 #ifdef _LP64 2303 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2304 pshufd(vtmp2, src2, 0xE); 2305 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2306 movdq(vtmp1, src1); 2307 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2308 movdq(dst, vtmp1); 2309 } 2310 2311 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2312 vextracti128_high(vtmp1, src2); 2313 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2314 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2315 } 2316 2317 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2318 vextracti64x4_high(vtmp2, src2); 2319 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2320 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2321 } 2322 2323 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2324 mov64(temp, -1L); 2325 bzhiq(temp, temp, len); 2326 kmovql(dst, temp); 2327 } 2328 #endif // _LP64 2329 2330 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2331 reduce_operation_128(T_FLOAT, opcode, dst, src); 2332 pshufd(vtmp, src, 0x1); 2333 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2334 } 2335 2336 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2337 reduce2F(opcode, dst, src, vtmp); 2338 pshufd(vtmp, src, 0x2); 2339 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2340 pshufd(vtmp, src, 0x3); 2341 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2342 } 2343 2344 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2345 reduce4F(opcode, dst, src, vtmp2); 2346 vextractf128_high(vtmp2, src); 2347 reduce4F(opcode, dst, vtmp2, vtmp1); 2348 } 2349 2350 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2351 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2352 vextracti64x4_high(vtmp1, src); 2353 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2354 } 2355 2356 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2357 pshufd(dst, src, 0x1); 2358 reduce_operation_128(T_FLOAT, opcode, dst, src); 2359 } 2360 2361 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2362 pshufd(vtmp, src, 0xE); 2363 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2364 unorderedReduce2F(opcode, dst, vtmp); 2365 } 2366 2367 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2368 vextractf128_high(vtmp1, src); 2369 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2370 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2371 } 2372 2373 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2374 vextractf64x4_high(vtmp2, src); 2375 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2376 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2377 } 2378 2379 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2380 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2381 pshufd(vtmp, src, 0xE); 2382 
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2383 } 2384 2385 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2386 reduce2D(opcode, dst, src, vtmp2); 2387 vextractf128_high(vtmp2, src); 2388 reduce2D(opcode, dst, vtmp2, vtmp1); 2389 } 2390 2391 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2392 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2393 vextracti64x4_high(vtmp1, src); 2394 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2395 } 2396 2397 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2398 pshufd(dst, src, 0xE); 2399 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2400 } 2401 2402 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2403 vextractf128_high(vtmp, src); 2404 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2405 unorderedReduce2D(opcode, dst, vtmp); 2406 } 2407 2408 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2409 vextractf64x4_high(vtmp2, src); 2410 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2411 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2412 } 2413 2414 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2415 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2416 } 2417 2418 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2419 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2420 } 2421 2422 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2423 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2424 } 2425 2426 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2427 int vec_enc) { 2428 switch(elem_bt) { 2429 case T_INT: 2430 case T_FLOAT: 2431 vmaskmovps(dst, src, mask, vec_enc); 2432 break; 2433 case T_LONG: 2434 case T_DOUBLE: 2435 vmaskmovpd(dst, src, mask, vec_enc); 2436 break; 2437 default: 2438 fatal("Unsupported type %s", type2name(elem_bt)); 2439 break; 2440 } 2441 } 2442 2443 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2444 int vec_enc) { 2445 switch(elem_bt) { 2446 case T_INT: 2447 case T_FLOAT: 2448 vmaskmovps(dst, src, mask, vec_enc); 2449 break; 2450 case T_LONG: 2451 case T_DOUBLE: 2452 vmaskmovpd(dst, src, mask, vec_enc); 2453 break; 2454 default: 2455 fatal("Unsupported type %s", type2name(elem_bt)); 2456 break; 2457 } 2458 } 2459 2460 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2461 XMMRegister dst, XMMRegister src, 2462 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2463 XMMRegister xmm_0, XMMRegister xmm_1) { 2464 const int permconst[] = {1, 14}; 2465 XMMRegister wsrc = src; 2466 XMMRegister wdst = xmm_0; 2467 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2468 2469 int vlen_enc = Assembler::AVX_128bit; 2470 if (vlen == 16) { 2471 vlen_enc = Assembler::AVX_256bit; 2472 } 2473 2474 for (int i = log2(vlen) - 1; i >=0; i--) { 2475 if (i == 0 && !is_dst_valid) { 2476 wdst = dst; 2477 } 2478 if (i == 3) { 2479 vextracti64x4_high(wtmp, wsrc); 2480 } else if (i == 2) { 2481 vextracti128_high(wtmp, wsrc); 2482 } else { // i = [0,1] 2483 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2484 } 2485 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2486 wsrc = wdst; 2487 vlen_enc = Assembler::AVX_128bit; 2488 } 2489 if (is_dst_valid) { 2490 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2491 } 2492 } 2493 2494 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2495 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2496 XMMRegister xmm_0, XMMRegister xmm_1) { 2497 XMMRegister wsrc = src; 2498 XMMRegister wdst = xmm_0; 2499 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2500 int vlen_enc = Assembler::AVX_128bit; 2501 if (vlen == 8) { 2502 vlen_enc = Assembler::AVX_256bit; 2503 } 2504 for (int i = log2(vlen) - 1; i >=0; i--) { 2505 if (i == 0 && !is_dst_valid) { 2506 wdst = dst; 2507 } 2508 if (i == 1) { 2509 vextracti128_high(wtmp, wsrc); 2510 } else if (i == 2) { 2511 vextracti64x4_high(wtmp, wsrc); 2512 } else { 2513 assert(i == 0, "%d", i); 2514 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2515 } 2516 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2517 wsrc = wdst; 2518 vlen_enc = Assembler::AVX_128bit; 2519 } 2520 if (is_dst_valid) { 2521 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2522 } 2523 } 2524 2525 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2526 switch (bt) { 2527 case T_BYTE: pextrb(dst, src, idx); break; 2528 case T_SHORT: pextrw(dst, src, idx); break; 2529 case T_INT: pextrd(dst, src, idx); break; 2530 case T_LONG: pextrq(dst, src, idx); break; 2531 2532 default: 2533 assert(false,"Should not reach here."); 2534 break; 2535 } 2536 } 2537 2538 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2539 int esize = type2aelembytes(typ); 2540 int elem_per_lane = 16/esize; 2541 int lane = elemindex / elem_per_lane; 2542 int eindex = elemindex % elem_per_lane; 2543 2544 if (lane >= 2) { 2545 assert(UseAVX > 2, "required"); 2546 vextractf32x4(dst, src, lane & 3); 2547 return dst; 2548 } else if (lane > 0) { 2549 assert(UseAVX > 0, "required"); 2550 vextractf128(dst, src, lane); 2551 return dst; 2552 } else { 2553 return src; 2554 } 2555 } 2556 2557 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2558 if (typ == T_BYTE) { 2559 movsbl(dst, dst); 2560 } else if (typ == T_SHORT) { 2561 movswl(dst, dst); 2562 } 2563 } 2564 2565 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2566 int esize = type2aelembytes(typ); 2567 int elem_per_lane = 16/esize; 2568 int eindex = elemindex % elem_per_lane; 2569 assert(is_integral_type(typ),"required"); 2570 2571 if (eindex == 0) { 2572 if (typ == T_LONG) { 2573 movq(dst, src); 2574 } else { 2575 movdl(dst, src); 2576 movsxl(typ, dst); 2577 } 2578 } else { 2579 extract(typ, dst, src, eindex); 2580 movsxl(typ, dst); 2581 } 2582 } 2583 2584 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
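  // Moves the float/double element at 'elemindex' (taken modulo the number of elements in
  // a 128-bit lane) into the low bits of dst and clears everything above it: a float
  // result keeps only the low 32 bits (masked with vector_32_bit_mask()), a double only
  // the low 64 bits (via movq). E.g. for T_FLOAT and element 3, shufps/vshufps with imm 3
  // rotates that element into slot 0 before the mask is applied. vtmp is only required on
  // non-AVX hardware, where the mask must first be loaded into a register for pand.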
2585 int esize = type2aelembytes(typ); 2586 int elem_per_lane = 16/esize; 2587 int eindex = elemindex % elem_per_lane; 2588 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2589 2590 if (eindex == 0) { 2591 movq(dst, src); 2592 } else { 2593 if (typ == T_FLOAT) { 2594 if (UseAVX == 0) { 2595 movdqu(dst, src); 2596 shufps(dst, dst, eindex); 2597 } else { 2598 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2599 } 2600 } else { 2601 if (UseAVX == 0) { 2602 movdqu(dst, src); 2603 psrldq(dst, eindex*esize); 2604 } else { 2605 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2606 } 2607 movq(dst, dst); 2608 } 2609 } 2610 // Zero upper bits 2611 if (typ == T_FLOAT) { 2612 if (UseAVX == 0) { 2613 assert(vtmp != xnoreg, "required."); 2614 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2615 pand(dst, vtmp); 2616 } else { 2617 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2618 } 2619 } 2620 } 2621 2622 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2623 switch(typ) { 2624 case T_BYTE: 2625 case T_BOOLEAN: 2626 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2627 break; 2628 case T_SHORT: 2629 case T_CHAR: 2630 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2631 break; 2632 case T_INT: 2633 case T_FLOAT: 2634 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2635 break; 2636 case T_LONG: 2637 case T_DOUBLE: 2638 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2639 break; 2640 default: 2641 assert(false,"Should not reach here."); 2642 break; 2643 } 2644 } 2645 2646 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2647 assert(rscratch != noreg || always_reachable(src2), "missing"); 2648 2649 switch(typ) { 2650 case T_BOOLEAN: 2651 case T_BYTE: 2652 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2653 break; 2654 case T_CHAR: 2655 case T_SHORT: 2656 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2657 break; 2658 case T_INT: 2659 case T_FLOAT: 2660 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2661 break; 2662 case T_LONG: 2663 case T_DOUBLE: 2664 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2665 break; 2666 default: 2667 assert(false,"Should not reach here."); 2668 break; 2669 } 2670 } 2671 2672 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2673 switch(typ) { 2674 case T_BYTE: 2675 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2676 break; 2677 case T_SHORT: 2678 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2679 break; 2680 case T_INT: 2681 case T_FLOAT: 2682 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2683 break; 2684 case T_LONG: 2685 case T_DOUBLE: 2686 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2687 break; 2688 default: 2689 assert(false,"Should not reach here."); 2690 break; 2691 } 2692 } 2693 2694 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2695 
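  // Sets the CPU flags for a vector test: ZF is set when (src1 & src2) has no bits set and
  // CF when (~src1 & src2) has none, which is what the callers branch on. vtestps only
  // looks at the sign bit of each 32-bit element, so it is used only when the element size
  // is at least 4 bytes (mask elements are then all-ones or all-zeros). For vectors
  // shorter than 16 bytes the low part of src1 is first replicated into vtmp with pshufd,
  // so the unused upper bytes of the register cannot disturb the result.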
assert(vlen_in_bytes <= 32, ""); 2696 int esize = type2aelembytes(bt); 2697 if (vlen_in_bytes == 32) { 2698 assert(vtmp == xnoreg, "required."); 2699 if (esize >= 4) { 2700 vtestps(src1, src2, AVX_256bit); 2701 } else { 2702 vptest(src1, src2, AVX_256bit); 2703 } 2704 return; 2705 } 2706 if (vlen_in_bytes < 16) { 2707 // Duplicate the lower part to fill the whole register, 2708 // Don't need to do so for src2 2709 assert(vtmp != xnoreg, "required"); 2710 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2711 pshufd(vtmp, src1, shuffle_imm); 2712 } else { 2713 assert(vtmp == xnoreg, "required"); 2714 vtmp = src1; 2715 } 2716 if (esize >= 4 && VM_Version::supports_avx()) { 2717 vtestps(vtmp, src2, AVX_128bit); 2718 } else { 2719 ptest(vtmp, src2); 2720 } 2721 } 2722 2723 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2724 #ifdef ASSERT 2725 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2726 bool is_bw_supported = VM_Version::supports_avx512bw(); 2727 if (is_bw && !is_bw_supported) { 2728 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2729 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2730 "XMM register should be 0-15"); 2731 } 2732 #endif // ASSERT 2733 switch (elem_bt) { 2734 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2735 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2736 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2737 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2738 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2739 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2740 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2741 } 2742 } 2743 2744 #ifdef _LP64 2745 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2746 assert(UseAVX >= 2, "required"); 2747 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2748 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2749 if ((UseAVX > 2) && 2750 (!is_bw || VM_Version::supports_avx512bw()) && 2751 (!is_vl || VM_Version::supports_avx512vl())) { 2752 switch (elem_bt) { 2753 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2754 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2755 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2756 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2757 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2758 } 2759 } else { 2760 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2761 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2762 switch (elem_bt) { 2763 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2764 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2765 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2766 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2767 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2768 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2769 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2770 } 2771 } 2772 } 2773 #endif 2774 2775 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2776 switch (to_elem_bt) { 2777 case T_SHORT: 2778 vpmovsxbw(dst, src, vlen_enc); 2779 break; 2780 case T_INT: 2781 
vpmovsxbd(dst, src, vlen_enc); 2782 break; 2783 case T_FLOAT: 2784 vpmovsxbd(dst, src, vlen_enc); 2785 vcvtdq2ps(dst, dst, vlen_enc); 2786 break; 2787 case T_LONG: 2788 vpmovsxbq(dst, src, vlen_enc); 2789 break; 2790 case T_DOUBLE: { 2791 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2792 vpmovsxbd(dst, src, mid_vlen_enc); 2793 vcvtdq2pd(dst, dst, vlen_enc); 2794 break; 2795 } 2796 default: 2797 fatal("Unsupported type %s", type2name(to_elem_bt)); 2798 break; 2799 } 2800 } 2801 2802 //------------------------------------------------------------------------------------------- 2803 2804 // IndexOf for constant substrings with size >= 8 chars 2805 // which don't need to be loaded through stack. 2806 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2807 Register cnt1, Register cnt2, 2808 int int_cnt2, Register result, 2809 XMMRegister vec, Register tmp, 2810 int ae) { 2811 ShortBranchVerifier sbv(this); 2812 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2813 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2814 2815 // This method uses the pcmpestri instruction with bound registers 2816 // inputs: 2817 // xmm - substring 2818 // rax - substring length (elements count) 2819 // mem - scanned string 2820 // rdx - string length (elements count) 2821 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2822 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2823 // outputs: 2824 // rcx - matched index in string 2825 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2826 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2827 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2828 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2829 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2830 2831 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2832 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2833 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2834 2835 // Note, inline_string_indexOf() generates checks: 2836 // if (substr.count > string.count) return -1; 2837 // if (substr.count == 0) return 0; 2838 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2839 2840 // Load substring. 2841 if (ae == StrIntrinsicNode::UL) { 2842 pmovzxbw(vec, Address(str2, 0)); 2843 } else { 2844 movdqu(vec, Address(str2, 0)); 2845 } 2846 movl(cnt2, int_cnt2); 2847 movptr(result, str1); // string addr 2848 2849 if (int_cnt2 > stride) { 2850 jmpb(SCAN_TO_SUBSTR); 2851 2852 // Reload substr for rescan, this code 2853 // is executed only for large substrings (> 8 chars) 2854 bind(RELOAD_SUBSTR); 2855 if (ae == StrIntrinsicNode::UL) { 2856 pmovzxbw(vec, Address(str2, 0)); 2857 } else { 2858 movdqu(vec, Address(str2, 0)); 2859 } 2860 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2861 2862 bind(RELOAD_STR); 2863 // We came here after the beginning of the substring was 2864 // matched but the rest of it was not so we need to search 2865 // again. Start from the next element after the previous match. 2866 2867 // cnt2 is number of substring reminding elements and 2868 // cnt1 is number of string reminding elements when cmp failed. 
2869 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2870 subl(cnt1, cnt2); 2871 addl(cnt1, int_cnt2); 2872 movl(cnt2, int_cnt2); // Now restore cnt2 2873 2874 decrementl(cnt1); // Shift to next element 2875 cmpl(cnt1, cnt2); 2876 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2877 2878 addptr(result, (1<<scale1)); 2879 2880 } // (int_cnt2 > 8) 2881 2882 // Scan string for start of substr in 16-byte vectors 2883 bind(SCAN_TO_SUBSTR); 2884 pcmpestri(vec, Address(result, 0), mode); 2885 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2886 subl(cnt1, stride); 2887 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2888 cmpl(cnt1, cnt2); 2889 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2890 addptr(result, 16); 2891 jmpb(SCAN_TO_SUBSTR); 2892 2893 // Found a potential substr 2894 bind(FOUND_CANDIDATE); 2895 // Matched whole vector if first element matched (tmp(rcx) == 0). 2896 if (int_cnt2 == stride) { 2897 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2898 } else { // int_cnt2 > 8 2899 jccb(Assembler::overflow, FOUND_SUBSTR); 2900 } 2901 // After pcmpestri tmp(rcx) contains matched element index 2902 // Compute start addr of substr 2903 lea(result, Address(result, tmp, scale1)); 2904 2905 // Make sure string is still long enough 2906 subl(cnt1, tmp); 2907 cmpl(cnt1, cnt2); 2908 if (int_cnt2 == stride) { 2909 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2910 } else { // int_cnt2 > 8 2911 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2912 } 2913 // Left less than substring. 2914 2915 bind(RET_NOT_FOUND); 2916 movl(result, -1); 2917 jmp(EXIT); 2918 2919 if (int_cnt2 > stride) { 2920 // This code is optimized for the case when the whole substring 2921 // is matched if its head is matched. 2922 bind(MATCH_SUBSTR_HEAD); 2923 pcmpestri(vec, Address(result, 0), mode); 2924 // Reload only the string if it does not match 2925 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2926 2927 Label CONT_SCAN_SUBSTR; 2928 // Compare the rest of substring (> 8 chars). 2929 bind(FOUND_SUBSTR); 2930 // First 8 chars are already matched. 
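  // cnt2 is rewritten as a negative offset (stride - cnt2) so the SCAN_SUBSTR loop below
  // can walk the tail of the substring by adding 'stride' back each pass and fall through
  // once the offset becomes non-negative; the cmpl(cnt2, -stride) check clamps the last
  // pass so that the final 16-byte load ends exactly at the end of the substring instead
  // of reading past it.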
2931 negptr(cnt2); 2932 addptr(cnt2, stride); 2933 2934 bind(SCAN_SUBSTR); 2935 subl(cnt1, stride); 2936 cmpl(cnt2, -stride); // Do not read beyond substring 2937 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2938 // Back-up strings to avoid reading beyond substring: 2939 // cnt1 = cnt1 - cnt2 + 8 2940 addl(cnt1, cnt2); // cnt2 is negative 2941 addl(cnt1, stride); 2942 movl(cnt2, stride); negptr(cnt2); 2943 bind(CONT_SCAN_SUBSTR); 2944 if (int_cnt2 < (int)G) { 2945 int tail_off1 = int_cnt2<<scale1; 2946 int tail_off2 = int_cnt2<<scale2; 2947 if (ae == StrIntrinsicNode::UL) { 2948 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2949 } else { 2950 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2951 } 2952 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2953 } else { 2954 // calculate index in register to avoid integer overflow (int_cnt2*2) 2955 movl(tmp, int_cnt2); 2956 addptr(tmp, cnt2); 2957 if (ae == StrIntrinsicNode::UL) { 2958 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2959 } else { 2960 movdqu(vec, Address(str2, tmp, scale2, 0)); 2961 } 2962 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2963 } 2964 // Need to reload strings pointers if not matched whole vector 2965 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2966 addptr(cnt2, stride); 2967 jcc(Assembler::negative, SCAN_SUBSTR); 2968 // Fall through if found full substring 2969 2970 } // (int_cnt2 > 8) 2971 2972 bind(RET_FOUND); 2973 // Found result if we matched full small substring. 2974 // Compute substr offset 2975 subptr(result, str1); 2976 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2977 shrl(result, 1); // index 2978 } 2979 bind(EXIT); 2980 2981 } // string_indexofC8 2982 2983 // Small strings are loaded through stack if they cross page boundary. 2984 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2985 Register cnt1, Register cnt2, 2986 int int_cnt2, Register result, 2987 XMMRegister vec, Register tmp, 2988 int ae) { 2989 ShortBranchVerifier sbv(this); 2990 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2991 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2992 2993 // 2994 // int_cnt2 is length of small (< 8 chars) constant substring 2995 // or (-1) for non constant substring in which case its length 2996 // is in cnt2 register. 2997 // 2998 // Note, inline_string_indexOf() generates checks: 2999 // if (substr.count > string.count) return -1; 3000 // if (substr.count == 0) return 0; 3001 // 3002 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3003 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3004 // This method uses the pcmpestri instruction with bound registers 3005 // inputs: 3006 // xmm - substring 3007 // rax - substring length (elements count) 3008 // mem - scanned string 3009 // rdx - string length (elements count) 3010 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3011 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3012 // outputs: 3013 // rcx - matched index in string 3014 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3015 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3016 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3017 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3018 3019 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3020 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3021 FOUND_CANDIDATE; 3022 3023 { //======================================================== 3024 // We don't know where these strings are located 3025 // and we can't read beyond them. Load them through stack. 3026 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3027 3028 movptr(tmp, rsp); // save old SP 3029 3030 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3031 if (int_cnt2 == (1>>scale2)) { // One byte 3032 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3033 load_unsigned_byte(result, Address(str2, 0)); 3034 movdl(vec, result); // move 32 bits 3035 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3036 // Not enough header space in 32-bit VM: 12+3 = 15. 3037 movl(result, Address(str2, -1)); 3038 shrl(result, 8); 3039 movdl(vec, result); // move 32 bits 3040 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3041 load_unsigned_short(result, Address(str2, 0)); 3042 movdl(vec, result); // move 32 bits 3043 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3044 movdl(vec, Address(str2, 0)); // move 32 bits 3045 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3046 movq(vec, Address(str2, 0)); // move 64 bits 3047 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3048 // Array header size is 12 bytes in 32-bit VM 3049 // + 6 bytes for 3 chars == 18 bytes, 3050 // enough space to load vec and shift. 3051 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3052 if (ae == StrIntrinsicNode::UL) { 3053 int tail_off = int_cnt2-8; 3054 pmovzxbw(vec, Address(str2, tail_off)); 3055 psrldq(vec, -2*tail_off); 3056 } 3057 else { 3058 int tail_off = int_cnt2*(1<<scale2); 3059 movdqu(vec, Address(str2, tail_off-16)); 3060 psrldq(vec, 16-tail_off); 3061 } 3062 } 3063 } else { // not constant substring 3064 cmpl(cnt2, stride); 3065 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3066 3067 // We can read beyond string if srt+16 does not cross page boundary 3068 // since heaps are aligned and mapped by pages. 3069 assert(os::vm_page_size() < (int)G, "default page should be small"); 3070 movl(result, str2); // We need only low 32 bits 3071 andl(result, ((int)os::vm_page_size()-1)); 3072 cmpl(result, ((int)os::vm_page_size()-16)); 3073 jccb(Assembler::belowEqual, CHECK_STR); 3074 3075 // Move small strings to stack to allow load 16 bytes into vec. 3076 subptr(rsp, 16); 3077 int stk_offset = wordSize-(1<<scale2); 3078 push(cnt2); 3079 3080 bind(COPY_SUBSTR); 3081 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3082 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3083 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3084 } else if (ae == StrIntrinsicNode::UU) { 3085 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3086 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3087 } 3088 decrement(cnt2); 3089 jccb(Assembler::notZero, COPY_SUBSTR); 3090 3091 pop(cnt2); 3092 movptr(str2, rsp); // New substring address 3093 } // non constant 3094 3095 bind(CHECK_STR); 3096 cmpl(cnt1, stride); 3097 jccb(Assembler::aboveEqual, BIG_STRINGS); 3098 3099 // Check cross page boundary. 
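  // The low bits of the address are its offset within the (page-aligned) page: if that
  // offset is at most page_size-16, a 16-byte read starting at str1 stays on the same
  // page and cannot fault even when it goes past the logical end of the string, so the
  // copy-to-stack below can be skipped.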
3100 movl(result, str1); // We need only low 32 bits 3101 andl(result, ((int)os::vm_page_size()-1)); 3102 cmpl(result, ((int)os::vm_page_size()-16)); 3103 jccb(Assembler::belowEqual, BIG_STRINGS); 3104 3105 subptr(rsp, 16); 3106 int stk_offset = -(1<<scale1); 3107 if (int_cnt2 < 0) { // not constant 3108 push(cnt2); 3109 stk_offset += wordSize; 3110 } 3111 movl(cnt2, cnt1); 3112 3113 bind(COPY_STR); 3114 if (ae == StrIntrinsicNode::LL) { 3115 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3116 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3117 } else { 3118 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3119 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3120 } 3121 decrement(cnt2); 3122 jccb(Assembler::notZero, COPY_STR); 3123 3124 if (int_cnt2 < 0) { // not constant 3125 pop(cnt2); 3126 } 3127 movptr(str1, rsp); // New string address 3128 3129 bind(BIG_STRINGS); 3130 // Load substring. 3131 if (int_cnt2 < 0) { // -1 3132 if (ae == StrIntrinsicNode::UL) { 3133 pmovzxbw(vec, Address(str2, 0)); 3134 } else { 3135 movdqu(vec, Address(str2, 0)); 3136 } 3137 push(cnt2); // substr count 3138 push(str2); // substr addr 3139 push(str1); // string addr 3140 } else { 3141 // Small (< 8 chars) constant substrings are loaded already. 3142 movl(cnt2, int_cnt2); 3143 } 3144 push(tmp); // original SP 3145 3146 } // Finished loading 3147 3148 //======================================================== 3149 // Start search 3150 // 3151 3152 movptr(result, str1); // string addr 3153 3154 if (int_cnt2 < 0) { // Only for non constant substring 3155 jmpb(SCAN_TO_SUBSTR); 3156 3157 // SP saved at sp+0 3158 // String saved at sp+1*wordSize 3159 // Substr saved at sp+2*wordSize 3160 // Substr count saved at sp+3*wordSize 3161 3162 // Reload substr for rescan, this code 3163 // is executed only for large substrings (> 8 chars) 3164 bind(RELOAD_SUBSTR); 3165 movptr(str2, Address(rsp, 2*wordSize)); 3166 movl(cnt2, Address(rsp, 3*wordSize)); 3167 if (ae == StrIntrinsicNode::UL) { 3168 pmovzxbw(vec, Address(str2, 0)); 3169 } else { 3170 movdqu(vec, Address(str2, 0)); 3171 } 3172 // We came here after the beginning of the substring was 3173 // matched but the rest of it was not so we need to search 3174 // again. Start from the next element after the previous match. 3175 subptr(str1, result); // Restore counter 3176 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3177 shrl(str1, 1); 3178 } 3179 addl(cnt1, str1); 3180 decrementl(cnt1); // Shift to next element 3181 cmpl(cnt1, cnt2); 3182 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3183 3184 addptr(result, (1<<scale1)); 3185 } // non constant 3186 3187 // Scan string for start of substr in 16-byte vectors 3188 bind(SCAN_TO_SUBSTR); 3189 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3190 pcmpestri(vec, Address(result, 0), mode); 3191 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3192 subl(cnt1, stride); 3193 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3194 cmpl(cnt1, cnt2); 3195 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3196 addptr(result, 16); 3197 3198 bind(ADJUST_STR); 3199 cmpl(cnt1, stride); // Do not read beyond string 3200 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3201 // Back-up string to avoid reading beyond string. 
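  // Rough sketch of the back-up below: point the scan at the last full 16-byte chunk
  // so the final load ends exactly at the end of the string (re-scanning a few
  // already-checked elements is harmless), and pretend a full stride remains:
  //   result += cnt1 * elem_size - 16;   // elem_size is 1 (LL) or 2 (UU/UL)
  //   cnt1    = stride;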
lea(result, Address(result, cnt1, scale1, -16)); 3203 movl(cnt1, stride); 3204 jmpb(SCAN_TO_SUBSTR); 3205 3206 // Found a potential substr 3207 bind(FOUND_CANDIDATE); 3208 // After pcmpestri tmp(rcx) contains matched element index 3209 3210 // Make sure string is still long enough 3211 subl(cnt1, tmp); 3212 cmpl(cnt1, cnt2); 3213 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3214 // Left less than substring. 3215 3216 bind(RET_NOT_FOUND); 3217 movl(result, -1); 3218 jmp(CLEANUP); 3219 3220 bind(FOUND_SUBSTR); 3221 // Compute start addr of substr 3222 lea(result, Address(result, tmp, scale1)); 3223 if (int_cnt2 > 0) { // Constant substring 3224 // Repeat search for small substring (< 8 chars) 3225 // from new point without reloading substring. 3226 // Have to check that we don't read beyond string. 3227 cmpl(tmp, stride-int_cnt2); 3228 jccb(Assembler::greater, ADJUST_STR); 3229 // Fall through if matched whole substring. 3230 } else { // non constant 3231 assert(int_cnt2 == -1, "should be != 0"); 3232 3233 addl(tmp, cnt2); 3234 // Found result if we matched whole substring. 3235 cmpl(tmp, stride); 3236 jcc(Assembler::lessEqual, RET_FOUND); 3237 3238 // Repeat search for small substring (<= 8 chars) 3239 // from new point 'str1' without reloading substring. 3240 cmpl(cnt2, stride); 3241 // Have to check that we don't read beyond string. 3242 jccb(Assembler::lessEqual, ADJUST_STR); 3243 3244 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3245 // Compare the rest of substring (> 8 chars). 3246 movptr(str1, result); 3247 3248 cmpl(tmp, cnt2); 3249 // First 8 chars are already matched. 3250 jccb(Assembler::equal, CHECK_NEXT); 3251 3252 bind(SCAN_SUBSTR); 3253 pcmpestri(vec, Address(str1, 0), mode); 3254 // Need to reload string pointers if not matched whole vector 3255 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3256 3257 bind(CHECK_NEXT); 3258 subl(cnt2, stride); 3259 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3260 addptr(str1, 16); 3261 if (ae == StrIntrinsicNode::UL) { 3262 addptr(str2, 8); 3263 } else { 3264 addptr(str2, 16); 3265 } 3266 subl(cnt1, stride); 3267 cmpl(cnt2, stride); // Do not read beyond substring 3268 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3269 // Back-up strings to avoid reading beyond substring.
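  // Rough sketch of the back-up below: re-position both pointers so the final loads
  // end exactly at the ends of the remaining data (the UL substring load is only
  // 8 bytes wide before zero-extension, hence -8 rather than -16 for str2):
  //   str2 += cnt2 * substr_elem_size - 16;   // -8 for UL
  //   str1 += cnt2 * string_elem_size - 16;
  //   cnt1 -= cnt2;  cnt2 = stride;  cnt1 += stride;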
3270 3271 if (ae == StrIntrinsicNode::UL) { 3272 lea(str2, Address(str2, cnt2, scale2, -8)); 3273 lea(str1, Address(str1, cnt2, scale1, -16)); 3274 } else { 3275 lea(str2, Address(str2, cnt2, scale2, -16)); 3276 lea(str1, Address(str1, cnt2, scale1, -16)); 3277 } 3278 subl(cnt1, cnt2); 3279 movl(cnt2, stride); 3280 addl(cnt1, stride); 3281 bind(CONT_SCAN_SUBSTR); 3282 if (ae == StrIntrinsicNode::UL) { 3283 pmovzxbw(vec, Address(str2, 0)); 3284 } else { 3285 movdqu(vec, Address(str2, 0)); 3286 } 3287 jmp(SCAN_SUBSTR); 3288 3289 bind(RET_FOUND_LONG); 3290 movptr(str1, Address(rsp, wordSize)); 3291 } // non constant 3292 3293 bind(RET_FOUND); 3294 // Compute substr offset 3295 subptr(result, str1); 3296 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3297 shrl(result, 1); // index 3298 } 3299 bind(CLEANUP); 3300 pop(rsp); // restore SP 3301 3302 } // string_indexof 3303 3304 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3305 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3306 ShortBranchVerifier sbv(this); 3307 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3308 3309 int stride = 8; 3310 3311 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3312 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3313 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3314 FOUND_SEQ_CHAR, DONE_LABEL; 3315 3316 movptr(result, str1); 3317 if (UseAVX >= 2) { 3318 cmpl(cnt1, stride); 3319 jcc(Assembler::less, SCAN_TO_CHAR); 3320 cmpl(cnt1, 2*stride); 3321 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3322 movdl(vec1, ch); 3323 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3324 vpxor(vec2, vec2); 3325 movl(tmp, cnt1); 3326 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3327 andl(cnt1,0x0000000F); //tail count (in chars) 3328 3329 bind(SCAN_TO_16_CHAR_LOOP); 3330 vmovdqu(vec3, Address(result, 0)); 3331 vpcmpeqw(vec3, vec3, vec1, 1); 3332 vptest(vec2, vec3); 3333 jcc(Assembler::carryClear, FOUND_CHAR); 3334 addptr(result, 32); 3335 subl(tmp, 2*stride); 3336 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3337 jmp(SCAN_TO_8_CHAR); 3338 bind(SCAN_TO_8_CHAR_INIT); 3339 movdl(vec1, ch); 3340 pshuflw(vec1, vec1, 0x00); 3341 pshufd(vec1, vec1, 0); 3342 pxor(vec2, vec2); 3343 } 3344 bind(SCAN_TO_8_CHAR); 3345 cmpl(cnt1, stride); 3346 jcc(Assembler::less, SCAN_TO_CHAR); 3347 if (UseAVX < 2) { 3348 movdl(vec1, ch); 3349 pshuflw(vec1, vec1, 0x00); 3350 pshufd(vec1, vec1, 0); 3351 pxor(vec2, vec2); 3352 } 3353 movl(tmp, cnt1); 3354 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3355 andl(cnt1,0x00000007); //tail count (in chars) 3356 3357 bind(SCAN_TO_8_CHAR_LOOP); 3358 movdqu(vec3, Address(result, 0)); 3359 pcmpeqw(vec3, vec1); 3360 ptest(vec2, vec3); 3361 jcc(Assembler::carryClear, FOUND_CHAR); 3362 addptr(result, 16); 3363 subl(tmp, stride); 3364 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3365 bind(SCAN_TO_CHAR); 3366 testl(cnt1, cnt1); 3367 jcc(Assembler::zero, RET_NOT_FOUND); 3368 bind(SCAN_TO_CHAR_LOOP); 3369 load_unsigned_short(tmp, Address(result, 0)); 3370 cmpl(ch, tmp); 3371 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3372 addptr(result, 2); 3373 subl(cnt1, 1); 3374 jccb(Assembler::zero, RET_NOT_FOUND); 3375 jmp(SCAN_TO_CHAR_LOOP); 3376 3377 bind(RET_NOT_FOUND); 3378 movl(result, -1); 3379 jmpb(DONE_LABEL); 3380 3381 bind(FOUND_CHAR); 3382 if (UseAVX >= 2) { 3383 vpmovmskb(tmp, vec3); 3384 } else { 3385 pmovmskb(tmp, vec3); 3386 } 3387 bsfl(ch, tmp); 3388 addptr(result, ch); 3389 3390 bind(FOUND_SEQ_CHAR); 3391 
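  // result still holds the address of the matching char; the code below turns it
  // into a char index, roughly:
  //   result = (int)((match_addr - str1) >> 1);   // UTF-16, 2 bytes per char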
subptr(result, str1); 3392 shrl(result, 1); 3393 3394 bind(DONE_LABEL); 3395 } // string_indexof_char 3396 3397 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3398 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3399 ShortBranchVerifier sbv(this); 3400 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3401 3402 int stride = 16; 3403 3404 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3405 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3406 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3407 FOUND_SEQ_CHAR, DONE_LABEL; 3408 3409 movptr(result, str1); 3410 if (UseAVX >= 2) { 3411 cmpl(cnt1, stride); 3412 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3413 cmpl(cnt1, stride*2); 3414 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3415 movdl(vec1, ch); 3416 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3417 vpxor(vec2, vec2); 3418 movl(tmp, cnt1); 3419 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3420 andl(cnt1,0x0000001F); //tail count (in chars) 3421 3422 bind(SCAN_TO_32_CHAR_LOOP); 3423 vmovdqu(vec3, Address(result, 0)); 3424 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3425 vptest(vec2, vec3); 3426 jcc(Assembler::carryClear, FOUND_CHAR); 3427 addptr(result, 32); 3428 subl(tmp, stride*2); 3429 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3430 jmp(SCAN_TO_16_CHAR); 3431 3432 bind(SCAN_TO_16_CHAR_INIT); 3433 movdl(vec1, ch); 3434 pxor(vec2, vec2); 3435 pshufb(vec1, vec2); 3436 } 3437 3438 bind(SCAN_TO_16_CHAR); 3439 cmpl(cnt1, stride); 3440 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3441 if (UseAVX < 2) { 3442 movdl(vec1, ch); 3443 pxor(vec2, vec2); 3444 pshufb(vec1, vec2); 3445 } 3446 movl(tmp, cnt1); 3447 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3448 andl(cnt1,0x0000000F); //tail count (in bytes) 3449 3450 bind(SCAN_TO_16_CHAR_LOOP); 3451 movdqu(vec3, Address(result, 0)); 3452 pcmpeqb(vec3, vec1); 3453 ptest(vec2, vec3); 3454 jcc(Assembler::carryClear, FOUND_CHAR); 3455 addptr(result, 16); 3456 subl(tmp, stride); 3457 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
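  // What follows is the plain byte-at-a-time tail; roughly (names illustrative):
  //   for (int i = 0; i < tail; i++) {
  //     if (bytes[scanned + i] == ch)  return scanned + i;   // via FOUND_SEQ_CHAR
  //   }
  //   return -1;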
3458 3459 bind(SCAN_TO_CHAR_INIT); 3460 testl(cnt1, cnt1); 3461 jcc(Assembler::zero, RET_NOT_FOUND); 3462 bind(SCAN_TO_CHAR_LOOP); 3463 load_unsigned_byte(tmp, Address(result, 0)); 3464 cmpl(ch, tmp); 3465 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3466 addptr(result, 1); 3467 subl(cnt1, 1); 3468 jccb(Assembler::zero, RET_NOT_FOUND); 3469 jmp(SCAN_TO_CHAR_LOOP); 3470 3471 bind(RET_NOT_FOUND); 3472 movl(result, -1); 3473 jmpb(DONE_LABEL); 3474 3475 bind(FOUND_CHAR); 3476 if (UseAVX >= 2) { 3477 vpmovmskb(tmp, vec3); 3478 } else { 3479 pmovmskb(tmp, vec3); 3480 } 3481 bsfl(ch, tmp); 3482 addptr(result, ch); 3483 3484 bind(FOUND_SEQ_CHAR); 3485 subptr(result, str1); 3486 3487 bind(DONE_LABEL); 3488 } // stringL_indexof_char 3489 3490 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3491 switch (eltype) { 3492 case T_BOOLEAN: return sizeof(jboolean); 3493 case T_BYTE: return sizeof(jbyte); 3494 case T_SHORT: return sizeof(jshort); 3495 case T_CHAR: return sizeof(jchar); 3496 case T_INT: return sizeof(jint); 3497 default: 3498 ShouldNotReachHere(); 3499 return -1; 3500 } 3501 } 3502 3503 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3504 switch (eltype) { 3505 // T_BOOLEAN used as surrogate for unsigned byte 3506 case T_BOOLEAN: movzbl(dst, src); break; 3507 case T_BYTE: movsbl(dst, src); break; 3508 case T_SHORT: movswl(dst, src); break; 3509 case T_CHAR: movzwl(dst, src); break; 3510 case T_INT: movl(dst, src); break; 3511 default: 3512 ShouldNotReachHere(); 3513 } 3514 } 3515 3516 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3517 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3518 } 3519 3520 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3521 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3522 } 3523 3524 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3525 const int vlen = Assembler::AVX_256bit; 3526 switch (eltype) { 3527 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3528 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3529 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3530 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3531 case T_INT: 3532 // do nothing 3533 break; 3534 default: 3535 ShouldNotReachHere(); 3536 } 3537 } 3538 3539 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3540 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3541 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3542 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3543 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3544 BasicType eltype) { 3545 ShortBranchVerifier sbv(this); 3546 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3547 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3548 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3549 3550 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3551 SHORT_UNROLLED_LOOP_EXIT, 3552 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3553 UNROLLED_VECTOR_LOOP_BEGIN, 3554 END; 3555 switch (eltype) { 3556 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3557 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3558 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3559 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3560 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3561 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3562 } 3563 3564 // For "renaming" for readibility of the code 3565 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3566 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3567 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3568 3569 const int elsize = arrays_hashcode_elsize(eltype); 3570 3571 /* 3572 if (cnt1 >= 2) { 3573 if (cnt1 >= 32) { 3574 UNROLLED VECTOR LOOP 3575 } 3576 UNROLLED SCALAR LOOP 3577 } 3578 SINGLE SCALAR 3579 */ 3580 3581 cmpl(cnt1, 32); 3582 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3583 3584 // cnt1 >= 32 && generate_vectorized_loop 3585 xorl(index, index); 3586 3587 // vresult = IntVector.zero(I256); 3588 for (int idx = 0; idx < 4; idx++) { 3589 vpxor(vresult[idx], vresult[idx]); 3590 } 3591 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3592 Register bound = tmp2; 3593 Register next = tmp3; 3594 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3595 movl(next, Address(tmp2, 0)); 3596 movdl(vnext, next); 3597 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3598 3599 // index = 0; 3600 // bound = cnt1 & ~(32 - 1); 3601 movl(bound, cnt1); 3602 andl(bound, ~(32 - 1)); 3603 // for (; index < bound; index += 32) { 3604 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3605 // result *= next; 3606 imull(result, next); 3607 // loop fission to upfront the cost of fetching from memory, OOO execution 3608 // can then hopefully do a better job of prefetching 3609 for (int idx = 0; idx < 4; idx++) { 3610 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3611 } 3612 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3613 for (int idx = 0; idx < 4; idx++) { 3614 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3615 arrays_hashcode_elvcast(vtmp[idx], eltype); 3616 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3617 } 3618 // index += 32; 3619 addl(index, 32); 3620 // index < bound; 3621 cmpl(index, bound); 3622 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3623 // } 3624 3625 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3626 subl(cnt1, bound); 3627 // release bound 3628 3629 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3630 for (int idx = 0; idx < 4; idx++) { 3631 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3632 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3633 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3634 } 3635 // result += vresult.reduceLanes(ADD); 3636 for (int idx = 0; idx < 4; idx++) { 3637 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3638 } 3639 3640 // } else if (cnt1 < 32) { 3641 3642 bind(SHORT_UNROLLED_BEGIN); 3643 // int i = 1; 3644 movl(index, 1); 3645 cmpl(index, cnt1); 3646 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3647 3648 // for (; i < cnt1 ; i += 2) { 3649 bind(SHORT_UNROLLED_LOOP_BEGIN); 3650 movl(tmp3, 961); 3651 imull(result, tmp3); 3652 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3653 movl(tmp3, tmp2); 3654 shll(tmp3, 5); 3655 subl(tmp3, tmp2); 3656 addl(result, tmp3); 3657 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3658 addl(result, tmp3); 3659 addl(index, 2); 3660 cmpl(index, cnt1); 3661 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3662 3663 // } 3664 // if (i >= cnt1) { 3665 bind(SHORT_UNROLLED_LOOP_EXIT); 3666 jccb(Assembler::greater, END); 3667 movl(tmp2, result); 3668 shll(result, 5); 3669 subl(result, tmp2); 3670 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3671 addl(result, tmp3); 3672 // } 3673 bind(END); 3674 3675 BLOCK_COMMENT("} // arrays_hashcode"); 3676 3677 } // arrays_hashcode 3678 3679 // helper function for string_compare 3680 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3681 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3682 Address::ScaleFactor scale2, Register index, int ae) { 3683 if (ae == StrIntrinsicNode::LL) { 3684 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3685 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3686 } else if (ae == StrIntrinsicNode::UU) { 3687 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3688 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3689 } else { 3690 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3691 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3692 } 3693 } 3694 3695 // Compare strings, used for char[] and byte[]. 3696 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3697 Register cnt1, Register cnt2, Register result, 3698 XMMRegister vec1, int ae, KRegister mask) { 3699 ShortBranchVerifier sbv(this); 3700 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3701 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3702 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3703 int stride2x2 = 0x40; 3704 Address::ScaleFactor scale = Address::no_scale; 3705 Address::ScaleFactor scale1 = Address::no_scale; 3706 Address::ScaleFactor scale2 = Address::no_scale; 3707 3708 if (ae != StrIntrinsicNode::LL) { 3709 stride2x2 = 0x20; 3710 } 3711 3712 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3713 shrl(cnt2, 1); 3714 } 3715 // Compute the minimum of the string lengths and the 3716 // difference of the string lengths (stack). 3717 // Do the conditional move stuff 3718 movl(result, cnt1); 3719 subl(cnt1, cnt2); 3720 push(cnt1); 3721 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3722 3723 // Is the minimum length zero? 
3724 testl(cnt2, cnt2); 3725 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3726 if (ae == StrIntrinsicNode::LL) { 3727 // Load first bytes 3728 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3729 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3730 } else if (ae == StrIntrinsicNode::UU) { 3731 // Load first characters 3732 load_unsigned_short(result, Address(str1, 0)); 3733 load_unsigned_short(cnt1, Address(str2, 0)); 3734 } else { 3735 load_unsigned_byte(result, Address(str1, 0)); 3736 load_unsigned_short(cnt1, Address(str2, 0)); 3737 } 3738 subl(result, cnt1); 3739 jcc(Assembler::notZero, POP_LABEL); 3740 3741 if (ae == StrIntrinsicNode::UU) { 3742 // Divide length by 2 to get number of chars 3743 shrl(cnt2, 1); 3744 } 3745 cmpl(cnt2, 1); 3746 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3747 3748 // Check if the strings start at the same location and setup scale and stride 3749 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3750 cmpptr(str1, str2); 3751 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3752 if (ae == StrIntrinsicNode::LL) { 3753 scale = Address::times_1; 3754 stride = 16; 3755 } else { 3756 scale = Address::times_2; 3757 stride = 8; 3758 } 3759 } else { 3760 scale1 = Address::times_1; 3761 scale2 = Address::times_2; 3762 // scale not used 3763 stride = 8; 3764 } 3765 3766 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3767 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3768 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3769 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3770 Label COMPARE_TAIL_LONG; 3771 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3772 3773 int pcmpmask = 0x19; 3774 if (ae == StrIntrinsicNode::LL) { 3775 pcmpmask &= ~0x01; 3776 } 3777 3778 // Setup to compare 16-chars (32-bytes) vectors, 3779 // start from first character again because it has aligned address. 3780 if (ae == StrIntrinsicNode::LL) { 3781 stride2 = 32; 3782 } else { 3783 stride2 = 16; 3784 } 3785 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3786 adr_stride = stride << scale; 3787 } else { 3788 adr_stride1 = 8; //stride << scale1; 3789 adr_stride2 = 16; //stride << scale2; 3790 } 3791 3792 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3793 // rax and rdx are used by pcmpestri as elements counters 3794 movl(result, cnt2); 3795 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3796 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3797 3798 // fast path : compare first 2 8-char vectors. 
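  // On a mismatch pcmpestri sets CF and leaves the index of the first differing
  // element in rcx (cnt1); COMPARE_INDEX_CHAR below then loads the two elements at
  // that index and returns, roughly,
  //   s1[index] - s2[index]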
3799 bind(COMPARE_16_CHARS); 3800 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3801 movdqu(vec1, Address(str1, 0)); 3802 } else { 3803 pmovzxbw(vec1, Address(str1, 0)); 3804 } 3805 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3806 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3807 3808 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3809 movdqu(vec1, Address(str1, adr_stride)); 3810 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3811 } else { 3812 pmovzxbw(vec1, Address(str1, adr_stride1)); 3813 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3814 } 3815 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3816 addl(cnt1, stride); 3817 3818 // Compare the characters at index in cnt1 3819 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3820 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3821 subl(result, cnt2); 3822 jmp(POP_LABEL); 3823 3824 // Setup the registers to start vector comparison loop 3825 bind(COMPARE_WIDE_VECTORS); 3826 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3827 lea(str1, Address(str1, result, scale)); 3828 lea(str2, Address(str2, result, scale)); 3829 } else { 3830 lea(str1, Address(str1, result, scale1)); 3831 lea(str2, Address(str2, result, scale2)); 3832 } 3833 subl(result, stride2); 3834 subl(cnt2, stride2); 3835 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3836 negptr(result); 3837 3838 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3839 bind(COMPARE_WIDE_VECTORS_LOOP); 3840 3841 #ifdef _LP64 3842 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3843 cmpl(cnt2, stride2x2); 3844 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3845 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3846 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3847 3848 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3849 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3850 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3851 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3852 } else { 3853 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3854 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3855 } 3856 kortestql(mask, mask); 3857 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3858 addptr(result, stride2x2); // update since we already compared at this addr 3859 subl(cnt2, stride2x2); // and sub the size too 3860 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3861 3862 vpxor(vec1, vec1); 3863 jmpb(COMPARE_WIDE_TAIL); 3864 }//if (VM_Version::supports_avx512vlbw()) 3865 #endif // _LP64 3866 3867 3868 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3869 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3870 vmovdqu(vec1, Address(str1, result, scale)); 3871 vpxor(vec1, Address(str2, result, scale)); 3872 } else { 3873 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3874 vpxor(vec1, Address(str2, result, scale2)); 3875 } 3876 vptest(vec1, vec1); 3877 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3878 addptr(result, stride2); 3879 subl(cnt2, stride2); 3880 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3881 // clean upper bits of YMM registers 
3882 vpxor(vec1, vec1); 3883 3884 // compare wide vectors tail 3885 bind(COMPARE_WIDE_TAIL); 3886 testptr(result, result); 3887 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3888 3889 movl(result, stride2); 3890 movl(cnt2, result); 3891 negptr(result); 3892 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3893 3894 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3895 bind(VECTOR_NOT_EQUAL); 3896 // clean upper bits of YMM registers 3897 vpxor(vec1, vec1); 3898 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3899 lea(str1, Address(str1, result, scale)); 3900 lea(str2, Address(str2, result, scale)); 3901 } else { 3902 lea(str1, Address(str1, result, scale1)); 3903 lea(str2, Address(str2, result, scale2)); 3904 } 3905 jmp(COMPARE_16_CHARS); 3906 3907 // Compare tail chars, length between 1 and 15 chars 3908 bind(COMPARE_TAIL_LONG); 3909 movl(cnt2, result); 3910 cmpl(cnt2, stride); 3911 jcc(Assembler::less, COMPARE_SMALL_STR); 3912 3913 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3914 movdqu(vec1, Address(str1, 0)); 3915 } else { 3916 pmovzxbw(vec1, Address(str1, 0)); 3917 } 3918 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3919 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3920 subptr(cnt2, stride); 3921 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3922 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3923 lea(str1, Address(str1, result, scale)); 3924 lea(str2, Address(str2, result, scale)); 3925 } else { 3926 lea(str1, Address(str1, result, scale1)); 3927 lea(str2, Address(str2, result, scale2)); 3928 } 3929 negptr(cnt2); 3930 jmpb(WHILE_HEAD_LABEL); 3931 3932 bind(COMPARE_SMALL_STR); 3933 } else if (UseSSE42Intrinsics) { 3934 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3935 int pcmpmask = 0x19; 3936 // Setup to compare 8-char (16-byte) vectors, 3937 // start from first character again because it has aligned address.
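  // result keeps the full element count for the tail pass, while cnt2 is rounded
  // down to the part covered by the 16-byte loop:
  //   result = count;
  //   cnt2   = count & ~(stride - 1);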
3938 movl(result, cnt2); 3939 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3940 if (ae == StrIntrinsicNode::LL) { 3941 pcmpmask &= ~0x01; 3942 } 3943 jcc(Assembler::zero, COMPARE_TAIL); 3944 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3945 lea(str1, Address(str1, result, scale)); 3946 lea(str2, Address(str2, result, scale)); 3947 } else { 3948 lea(str1, Address(str1, result, scale1)); 3949 lea(str2, Address(str2, result, scale2)); 3950 } 3951 negptr(result); 3952 3953 // pcmpestri 3954 // inputs: 3955 // vec1- substring 3956 // rax - negative string length (elements count) 3957 // mem - scanned string 3958 // rdx - string length (elements count) 3959 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3960 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3961 // outputs: 3962 // rcx - first mismatched element index 3963 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3964 3965 bind(COMPARE_WIDE_VECTORS); 3966 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3967 movdqu(vec1, Address(str1, result, scale)); 3968 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3969 } else { 3970 pmovzxbw(vec1, Address(str1, result, scale1)); 3971 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3972 } 3973 // After pcmpestri cnt1(rcx) contains mismatched element index 3974 3975 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3976 addptr(result, stride); 3977 subptr(cnt2, stride); 3978 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3979 3980 // compare wide vectors tail 3981 testptr(result, result); 3982 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3983 3984 movl(cnt2, stride); 3985 movl(result, stride); 3986 negptr(result); 3987 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3988 movdqu(vec1, Address(str1, result, scale)); 3989 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3990 } else { 3991 pmovzxbw(vec1, Address(str1, result, scale1)); 3992 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3993 } 3994 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3995 3996 // Mismatched characters in the vectors 3997 bind(VECTOR_NOT_EQUAL); 3998 addptr(cnt1, result); 3999 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4000 subl(result, cnt2); 4001 jmpb(POP_LABEL); 4002 4003 bind(COMPARE_TAIL); // limit is zero 4004 movl(cnt2, result); 4005 // Fallthru to tail compare 4006 } 4007 // Shift str2 and str1 to the end of the arrays, negate min 4008 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4009 lea(str1, Address(str1, cnt2, scale)); 4010 lea(str2, Address(str2, cnt2, scale)); 4011 } else { 4012 lea(str1, Address(str1, cnt2, scale1)); 4013 lea(str2, Address(str2, cnt2, scale2)); 4014 } 4015 decrementl(cnt2); // first character was compared already 4016 negptr(cnt2); 4017 4018 // Compare the rest of the elements 4019 bind(WHILE_HEAD_LABEL); 4020 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4021 subl(result, cnt1); 4022 jccb(Assembler::notZero, POP_LABEL); 4023 increment(cnt2); 4024 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4025 4026 // Strings are equal up to min length. Return the length difference. 
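  // Roughly the Java-level contract this code implements (cf. String.compareTo);
  // the names below are illustrative, not the registers used here:
  //   int m = min(len1, len2);
  //   for (int i = 0; i < m; i++) {
  //     if (s1[i] != s2[i])  return s1[i] - s2[i];
  //   }
  //   return len1 - len2;   // produced at LENGTH_DIFF_LABEL below
  // (for UL the raw difference is additionally negated at DONE_LABEL).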
4027 bind(LENGTH_DIFF_LABEL); 4028 pop(result); 4029 if (ae == StrIntrinsicNode::UU) { 4030 // Divide diff by 2 to get number of chars 4031 sarl(result, 1); 4032 } 4033 jmpb(DONE_LABEL); 4034 4035 #ifdef _LP64 4036 if (VM_Version::supports_avx512vlbw()) { 4037 4038 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4039 4040 kmovql(cnt1, mask); 4041 notq(cnt1); 4042 bsfq(cnt2, cnt1); 4043 if (ae != StrIntrinsicNode::LL) { 4044 // Divide diff by 2 to get number of chars 4045 sarl(cnt2, 1); 4046 } 4047 addq(result, cnt2); 4048 if (ae == StrIntrinsicNode::LL) { 4049 load_unsigned_byte(cnt1, Address(str2, result)); 4050 load_unsigned_byte(result, Address(str1, result)); 4051 } else if (ae == StrIntrinsicNode::UU) { 4052 load_unsigned_short(cnt1, Address(str2, result, scale)); 4053 load_unsigned_short(result, Address(str1, result, scale)); 4054 } else { 4055 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4056 load_unsigned_byte(result, Address(str1, result, scale1)); 4057 } 4058 subl(result, cnt1); 4059 jmpb(POP_LABEL); 4060 }//if (VM_Version::supports_avx512vlbw()) 4061 #endif // _LP64 4062 4063 // Discard the stored length difference 4064 bind(POP_LABEL); 4065 pop(cnt1); 4066 4067 // That's it 4068 bind(DONE_LABEL); 4069 if(ae == StrIntrinsicNode::UL) { 4070 negl(result); 4071 } 4072 4073 } 4074 4075 // Search for Non-ASCII character (Negative byte value) in a byte array, 4076 // return the index of the first such character, otherwise the length 4077 // of the array segment searched. 4078 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4079 // @IntrinsicCandidate 4080 // public static int countPositives(byte[] ba, int off, int len) { 4081 // for (int i = off; i < off + len; i++) { 4082 // if (ba[i] < 0) { 4083 // return i - off; 4084 // } 4085 // } 4086 // return len; 4087 // } 4088 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4089 Register result, Register tmp1, 4090 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4091 // rsi: byte array 4092 // rcx: len 4093 // rax: result 4094 ShortBranchVerifier sbv(this); 4095 assert_different_registers(ary1, len, result, tmp1); 4096 assert_different_registers(vec1, vec2); 4097 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4098 4099 movl(result, len); // copy 4100 // len == 0 4101 testl(len, len); 4102 jcc(Assembler::zero, DONE); 4103 4104 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4105 VM_Version::supports_avx512vlbw() && 4106 VM_Version::supports_bmi2()) { 4107 4108 Label test_64_loop, test_tail, BREAK_LOOP; 4109 movl(tmp1, len); 4110 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4111 4112 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4113 andl(len, 0xffffffc0); // vector count (in chars) 4114 jccb(Assembler::zero, test_tail); 4115 4116 lea(ary1, Address(ary1, len, Address::times_1)); 4117 negptr(len); 4118 4119 bind(test_64_loop); 4120 // Check whether our 64 elements of size byte contain negatives 4121 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4122 kortestql(mask1, mask1); 4123 jcc(Assembler::notZero, BREAK_LOOP); 4124 4125 addptr(len, 64); 4126 jccb(Assembler::notZero, test_64_loop); 4127 4128 bind(test_tail); 4129 // bail out when there is nothing to be done 4130 testl(tmp1, -1); 4131 jcc(Assembler::zero, DONE); 4132 4133 4134 // check the tail for absense of negatives 4135 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4136 #ifdef _LP64 4137 { 4138 
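  // Build a k-mask with exactly tmp1 (= tail length) low bits set, i.e. the
  // ~(~0 << len) trick mentioned above, done with the BMI2 shift:
  //   mask2 = ~(~0ULL << tail_len);
  // so only the tail bytes take part in the final 64-byte compare.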
Register tmp3_aliased = len; 4139 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4140 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4141 notq(tmp3_aliased); 4142 kmovql(mask2, tmp3_aliased); 4143 } 4144 #else 4145 Label k_init; 4146 jmp(k_init); 4147 4148 // We could not read 64-bits from a general purpose register thus we move 4149 // data required to compose 64 1's to the instruction stream 4150 // We emit 64 byte wide series of elements from 0..63 which later on would 4151 // be used as a compare targets with tail count contained in tmp1 register. 4152 // Result would be a k register having tmp1 consecutive number or 1 4153 // counting from least significant bit. 4154 address tmp = pc(); 4155 emit_int64(0x0706050403020100); 4156 emit_int64(0x0F0E0D0C0B0A0908); 4157 emit_int64(0x1716151413121110); 4158 emit_int64(0x1F1E1D1C1B1A1918); 4159 emit_int64(0x2726252423222120); 4160 emit_int64(0x2F2E2D2C2B2A2928); 4161 emit_int64(0x3736353433323130); 4162 emit_int64(0x3F3E3D3C3B3A3938); 4163 4164 bind(k_init); 4165 lea(len, InternalAddress(tmp)); 4166 // create mask to test for negative byte inside a vector 4167 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4168 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4169 4170 #endif 4171 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4172 ktestq(mask1, mask2); 4173 jcc(Assembler::zero, DONE); 4174 4175 // do a full check for negative registers in the tail 4176 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4177 // ary1 already pointing to the right place 4178 jmpb(TAIL_START); 4179 4180 bind(BREAK_LOOP); 4181 // At least one byte in the last 64 byte block was negative. 4182 // Set up to look at the last 64 bytes as if they were a tail 4183 lea(ary1, Address(ary1, len, Address::times_1)); 4184 addptr(result, len); 4185 // Ignore the very last byte: if all others are positive, 4186 // it must be negative, so we can skip right to the 2+1 byte 4187 // end comparison at this point 4188 orl(result, 63); 4189 movl(len, 63); 4190 // Fallthru to tail compare 4191 } else { 4192 4193 if (UseAVX >= 2 && UseSSE >= 2) { 4194 // With AVX2, use 32-byte vector compare 4195 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4196 4197 // Compare 32-byte vectors 4198 testl(len, 0xffffffe0); // vector count (in bytes) 4199 jccb(Assembler::zero, TAIL_START); 4200 4201 andl(len, 0xffffffe0); 4202 lea(ary1, Address(ary1, len, Address::times_1)); 4203 negptr(len); 4204 4205 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4206 movdl(vec2, tmp1); 4207 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4208 4209 bind(COMPARE_WIDE_VECTORS); 4210 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4211 vptest(vec1, vec2); 4212 jccb(Assembler::notZero, BREAK_LOOP); 4213 addptr(len, 32); 4214 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4215 4216 testl(result, 0x0000001f); // any bytes remaining? 4217 jcc(Assembler::zero, DONE); 4218 4219 // Quick test using the already prepared vector mask 4220 movl(len, result); 4221 andl(len, 0x0000001f); 4222 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4223 vptest(vec1, vec2); 4224 jcc(Assembler::zero, DONE); 4225 // There are zeros, jump to the tail to determine exactly where 4226 jmpb(TAIL_START); 4227 4228 bind(BREAK_LOOP); 4229 // At least one byte in the last 32-byte vector is negative. 
4230 // Set up to look at the last 32 bytes as if they were a tail 4231 lea(ary1, Address(ary1, len, Address::times_1)); 4232 addptr(result, len); 4233 // Ignore the very last byte: if all others are positive, 4234 // it must be negative, so we can skip right to the 2+1 byte 4235 // end comparison at this point 4236 orl(result, 31); 4237 movl(len, 31); 4238 // Fallthru to tail compare 4239 } else if (UseSSE42Intrinsics) { 4240 // With SSE4.2, use double quad vector compare 4241 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4242 4243 // Compare 16-byte vectors 4244 testl(len, 0xfffffff0); // vector count (in bytes) 4245 jcc(Assembler::zero, TAIL_START); 4246 4247 andl(len, 0xfffffff0); 4248 lea(ary1, Address(ary1, len, Address::times_1)); 4249 negptr(len); 4250 4251 movl(tmp1, 0x80808080); 4252 movdl(vec2, tmp1); 4253 pshufd(vec2, vec2, 0); 4254 4255 bind(COMPARE_WIDE_VECTORS); 4256 movdqu(vec1, Address(ary1, len, Address::times_1)); 4257 ptest(vec1, vec2); 4258 jccb(Assembler::notZero, BREAK_LOOP); 4259 addptr(len, 16); 4260 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4261 4262 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4263 jcc(Assembler::zero, DONE); 4264 4265 // Quick test using the already prepared vector mask 4266 movl(len, result); 4267 andl(len, 0x0000000f); // tail count (in bytes) 4268 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4269 ptest(vec1, vec2); 4270 jcc(Assembler::zero, DONE); 4271 jmpb(TAIL_START); 4272 4273 bind(BREAK_LOOP); 4274 // At least one byte in the last 16-byte vector is negative. 4275 // Set up and look at the last 16 bytes as if they were a tail 4276 lea(ary1, Address(ary1, len, Address::times_1)); 4277 addptr(result, len); 4278 // Ignore the very last byte: if all others are positive, 4279 // it must be negative, so we can skip right to the 2+1 byte 4280 // end comparison at this point 4281 orl(result, 15); 4282 movl(len, 15); 4283 // Fallthru to tail compare 4284 } 4285 } 4286 4287 bind(TAIL_START); 4288 // Compare 4-byte vectors 4289 andl(len, 0xfffffffc); // vector count (in bytes) 4290 jccb(Assembler::zero, COMPARE_CHAR); 4291 4292 lea(ary1, Address(ary1, len, Address::times_1)); 4293 negptr(len); 4294 4295 bind(COMPARE_VECTORS); 4296 movl(tmp1, Address(ary1, len, Address::times_1)); 4297 andl(tmp1, 0x80808080); 4298 jccb(Assembler::notZero, TAIL_ADJUST); 4299 addptr(len, 4); 4300 jccb(Assembler::notZero, COMPARE_VECTORS); 4301 4302 // Compare trailing char (final 2-3 bytes), if any 4303 bind(COMPARE_CHAR); 4304 4305 testl(result, 0x2); // tail char 4306 jccb(Assembler::zero, COMPARE_BYTE); 4307 load_unsigned_short(tmp1, Address(ary1, 0)); 4308 andl(tmp1, 0x00008080); 4309 jccb(Assembler::notZero, CHAR_ADJUST); 4310 lea(ary1, Address(ary1, 2)); 4311 4312 bind(COMPARE_BYTE); 4313 testl(result, 0x1); // tail byte 4314 jccb(Assembler::zero, DONE); 4315 load_unsigned_byte(tmp1, Address(ary1, 0)); 4316 testl(tmp1, 0x00000080); 4317 jccb(Assembler::zero, DONE); 4318 subptr(result, 1); 4319 jmpb(DONE); 4320 4321 bind(TAIL_ADJUST); 4322 // there are negative bits in the last 4 byte block. 4323 // Adjust result and check the next three bytes 4324 addptr(result, len); 4325 orl(result, 3); 4326 lea(ary1, Address(ary1, len, Address::times_1)); 4327 jmpb(COMPARE_CHAR); 4328 4329 bind(CHAR_ADJUST); 4330 // We are looking at a char + optional byte tail, and found that one 4331 // of the bytes in the char is negative. Adjust the result, check the 4332 // first byte and readjust if needed. 
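  // In effect (names illustrative; 'pair' is the 16-bit value loaded at COMPARE_CHAR):
  //   result &= ~3;                  // index of the first byte of this char pair
  //   if ((pair & 0x80) == 0)        // little-endian: the low byte is first in memory
  //     result += 1;                 // the negative byte is the second one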
4333 andl(result, 0xfffffffc); 4334 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4335 jccb(Assembler::notZero, DONE); 4336 addptr(result, 1); 4337 4338 // That's it 4339 bind(DONE); 4340 if (UseAVX >= 2 && UseSSE >= 2) { 4341 // clean upper bits of YMM registers 4342 vpxor(vec1, vec1); 4343 vpxor(vec2, vec2); 4344 } 4345 } 4346 4347 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4348 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4349 Register limit, Register result, Register chr, 4350 XMMRegister vec1, XMMRegister vec2, bool is_char, 4351 KRegister mask, bool expand_ary2) { 4352 // for expand_ary2, limit is the (smaller) size of the second array. 4353 ShortBranchVerifier sbv(this); 4354 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4355 4356 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4357 "Expansion only implemented for AVX2"); 4358 4359 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4360 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4361 4362 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4363 int scaleIncr = expand_ary2 ? 8 : 16; 4364 4365 if (is_array_equ) { 4366 // Check the input args 4367 cmpoop(ary1, ary2); 4368 jcc(Assembler::equal, TRUE_LABEL); 4369 4370 // Need additional checks for arrays_equals. 4371 testptr(ary1, ary1); 4372 jcc(Assembler::zero, FALSE_LABEL); 4373 testptr(ary2, ary2); 4374 jcc(Assembler::zero, FALSE_LABEL); 4375 4376 // Check the lengths 4377 movl(limit, Address(ary1, length_offset)); 4378 cmpl(limit, Address(ary2, length_offset)); 4379 jcc(Assembler::notEqual, FALSE_LABEL); 4380 } 4381 4382 // count == 0 4383 testl(limit, limit); 4384 jcc(Assembler::zero, TRUE_LABEL); 4385 4386 if (is_array_equ) { 4387 // Load array address 4388 lea(ary1, Address(ary1, base_offset)); 4389 lea(ary2, Address(ary2, base_offset)); 4390 } 4391 4392 if (is_array_equ && is_char) { 4393 // arrays_equals when used for char[]. 
shll(limit, 1); // byte count != 0 4395 } 4396 movl(result, limit); // copy 4397 4398 if (UseAVX >= 2) { 4399 // With AVX2, use 32-byte vector compare 4400 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4401 4402 // Compare 32-byte vectors 4403 if (expand_ary2) { 4404 andl(result, 0x0000000f); // tail count (in bytes) 4405 andl(limit, 0xfffffff0); // vector count (in bytes) 4406 jcc(Assembler::zero, COMPARE_TAIL); 4407 } else { 4408 andl(result, 0x0000001f); // tail count (in bytes) 4409 andl(limit, 0xffffffe0); // vector count (in bytes) 4410 jcc(Assembler::zero, COMPARE_TAIL_16); 4411 } 4412 4413 lea(ary1, Address(ary1, limit, scaleFactor)); 4414 lea(ary2, Address(ary2, limit, Address::times_1)); 4415 negptr(limit); 4416 4417 #ifdef _LP64 4418 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4419 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4420 4421 cmpl(limit, -64); 4422 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4423 4424 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4425 4426 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4427 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4428 kortestql(mask, mask); 4429 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4430 addptr(limit, 64); // update since we already compared at this addr 4431 cmpl(limit, -64); 4432 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4433 4434 // At this point we may still need to compare -limit+result bytes. 4435 // We could execute the next two instructions and just continue via non-wide path: 4436 // cmpl(limit, 0); 4437 // jcc(Assembler::equal, COMPARE_TAIL); // true 4438 // But since we stopped at the points ary{1,2}+limit which are 4439 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4440 // (|limit| <= 32 and result < 32), 4441 // we may just compare the last 64 bytes.
4442 // 4443 addptr(result, -64); // it is safe, bc we just came from this area 4444 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4445 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4446 kortestql(mask, mask); 4447 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4448 4449 jmp(TRUE_LABEL); 4450 4451 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4452 4453 }//if (VM_Version::supports_avx512vlbw()) 4454 #endif //_LP64 4455 bind(COMPARE_WIDE_VECTORS); 4456 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4457 if (expand_ary2) { 4458 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4459 } else { 4460 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4461 } 4462 vpxor(vec1, vec2); 4463 4464 vptest(vec1, vec1); 4465 jcc(Assembler::notZero, FALSE_LABEL); 4466 addptr(limit, scaleIncr * 2); 4467 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4468 4469 testl(result, result); 4470 jcc(Assembler::zero, TRUE_LABEL); 4471 4472 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4473 if (expand_ary2) { 4474 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4475 } else { 4476 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4477 } 4478 vpxor(vec1, vec2); 4479 4480 vptest(vec1, vec1); 4481 jcc(Assembler::notZero, FALSE_LABEL); 4482 jmp(TRUE_LABEL); 4483 4484 bind(COMPARE_TAIL_16); // limit is zero 4485 movl(limit, result); 4486 4487 // Compare 16-byte chunks 4488 andl(result, 0x0000000f); // tail count (in bytes) 4489 andl(limit, 0xfffffff0); // vector count (in bytes) 4490 jcc(Assembler::zero, COMPARE_TAIL); 4491 4492 lea(ary1, Address(ary1, limit, scaleFactor)); 4493 lea(ary2, Address(ary2, limit, Address::times_1)); 4494 negptr(limit); 4495 4496 bind(COMPARE_WIDE_VECTORS_16); 4497 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4498 if (expand_ary2) { 4499 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4500 } else { 4501 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4502 } 4503 pxor(vec1, vec2); 4504 4505 ptest(vec1, vec1); 4506 jcc(Assembler::notZero, FALSE_LABEL); 4507 addptr(limit, scaleIncr); 4508 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4509 4510 bind(COMPARE_TAIL); // limit is zero 4511 movl(limit, result); 4512 // Fallthru to tail compare 4513 } else if (UseSSE42Intrinsics) { 4514 // With SSE4.2, use double quad vector compare 4515 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4516 4517 // Compare 16-byte vectors 4518 andl(result, 0x0000000f); // tail count (in bytes) 4519 andl(limit, 0xfffffff0); // vector count (in bytes) 4520 jcc(Assembler::zero, COMPARE_TAIL); 4521 4522 lea(ary1, Address(ary1, limit, Address::times_1)); 4523 lea(ary2, Address(ary2, limit, Address::times_1)); 4524 negptr(limit); 4525 4526 bind(COMPARE_WIDE_VECTORS); 4527 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4528 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4529 pxor(vec1, vec2); 4530 4531 ptest(vec1, vec1); 4532 jcc(Assembler::notZero, FALSE_LABEL); 4533 addptr(limit, 16); 4534 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4535 4536 testl(result, result); 4537 jcc(Assembler::zero, TRUE_LABEL); 4538 4539 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4540 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4541 pxor(vec1, vec2); 4542 4543 ptest(vec1, vec1); 4544 jccb(Assembler::notZero, FALSE_LABEL); 4545 jmpb(TRUE_LABEL); 4546 4547 bind(COMPARE_TAIL); // limit is zero 4548 
movl(limit, result); 4549 // Fallthru to tail compare 4550 } 4551 4552 // Compare 4-byte vectors 4553 if (expand_ary2) { 4554 testl(result, result); 4555 jccb(Assembler::zero, TRUE_LABEL); 4556 } else { 4557 andl(limit, 0xfffffffc); // vector count (in bytes) 4558 jccb(Assembler::zero, COMPARE_CHAR); 4559 } 4560 4561 lea(ary1, Address(ary1, limit, scaleFactor)); 4562 lea(ary2, Address(ary2, limit, Address::times_1)); 4563 negptr(limit); 4564 4565 bind(COMPARE_VECTORS); 4566 if (expand_ary2) { 4567 // There are no "vector" operations for bytes to shorts 4568 movzbl(chr, Address(ary2, limit, Address::times_1)); 4569 cmpw(Address(ary1, limit, Address::times_2), chr); 4570 jccb(Assembler::notEqual, FALSE_LABEL); 4571 addptr(limit, 1); 4572 jcc(Assembler::notZero, COMPARE_VECTORS); 4573 jmp(TRUE_LABEL); 4574 } else { 4575 movl(chr, Address(ary1, limit, Address::times_1)); 4576 cmpl(chr, Address(ary2, limit, Address::times_1)); 4577 jccb(Assembler::notEqual, FALSE_LABEL); 4578 addptr(limit, 4); 4579 jcc(Assembler::notZero, COMPARE_VECTORS); 4580 } 4581 4582 // Compare trailing char (final 2 bytes), if any 4583 bind(COMPARE_CHAR); 4584 testl(result, 0x2); // tail char 4585 jccb(Assembler::zero, COMPARE_BYTE); 4586 load_unsigned_short(chr, Address(ary1, 0)); 4587 load_unsigned_short(limit, Address(ary2, 0)); 4588 cmpl(chr, limit); 4589 jccb(Assembler::notEqual, FALSE_LABEL); 4590 4591 if (is_array_equ && is_char) { 4592 bind(COMPARE_BYTE); 4593 } else { 4594 lea(ary1, Address(ary1, 2)); 4595 lea(ary2, Address(ary2, 2)); 4596 4597 bind(COMPARE_BYTE); 4598 testl(result, 0x1); // tail byte 4599 jccb(Assembler::zero, TRUE_LABEL); 4600 load_unsigned_byte(chr, Address(ary1, 0)); 4601 load_unsigned_byte(limit, Address(ary2, 0)); 4602 cmpl(chr, limit); 4603 jccb(Assembler::notEqual, FALSE_LABEL); 4604 } 4605 bind(TRUE_LABEL); 4606 movl(result, 1); // return true 4607 jmpb(DONE); 4608 4609 bind(FALSE_LABEL); 4610 xorl(result, result); // return false 4611 4612 // That's it 4613 bind(DONE); 4614 if (UseAVX >= 2) { 4615 // clean upper bits of YMM registers 4616 vpxor(vec1, vec1); 4617 vpxor(vec2, vec2); 4618 } 4619 } 4620 4621 #ifdef _LP64 4622 4623 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4624 #define __ masm. 
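  // This out-of-line path is reached only when the truncating conversion in
  // convertF2I below produced the sentinel value (0x80000000 for int, the
  // corresponding 64-bit pattern compared via double_sign_flip() for long), i.e. for
  // NaN, out-of-range inputs, or an exact MIN_VALUE result. The stub spills the
  // operand, calls the matching f2i/f2l/d2i/d2l fixup routine, and pops the corrected
  // value into dst. Rough overall sketch (names illustrative):
  //   res = truncate(src);                      // cvttss2si / cvttsd2si
  //   if (res == SENTINEL)  res = fixup(src);   // this slow path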
4625 Register dst = stub.data<0>(); 4626 XMMRegister src = stub.data<1>(); 4627 address target = stub.data<2>(); 4628 __ bind(stub.entry()); 4629 __ subptr(rsp, 8); 4630 __ movdbl(Address(rsp), src); 4631 __ call(RuntimeAddress(target)); 4632 __ pop(dst); 4633 __ jmp(stub.continuation()); 4634 #undef __ 4635 } 4636 4637 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4638 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4639 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4640 4641 address slowpath_target; 4642 if (dst_bt == T_INT) { 4643 if (src_bt == T_FLOAT) { 4644 cvttss2sil(dst, src); 4645 cmpl(dst, 0x80000000); 4646 slowpath_target = StubRoutines::x86::f2i_fixup(); 4647 } else { 4648 cvttsd2sil(dst, src); 4649 cmpl(dst, 0x80000000); 4650 slowpath_target = StubRoutines::x86::d2i_fixup(); 4651 } 4652 } else { 4653 if (src_bt == T_FLOAT) { 4654 cvttss2siq(dst, src); 4655 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4656 slowpath_target = StubRoutines::x86::f2l_fixup(); 4657 } else { 4658 cvttsd2siq(dst, src); 4659 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4660 slowpath_target = StubRoutines::x86::d2l_fixup(); 4661 } 4662 } 4663 4664 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4665 jcc(Assembler::equal, stub->entry()); 4666 bind(stub->continuation()); 4667 } 4668 4669 #endif // _LP64 4670 4671 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4672 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4673 switch(ideal_opc) { 4674 case Op_LShiftVS: 4675 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4676 case Op_LShiftVI: 4677 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4678 case Op_LShiftVL: 4679 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4680 case Op_RShiftVS: 4681 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4682 case Op_RShiftVI: 4683 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4684 case Op_RShiftVL: 4685 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4686 case Op_URShiftVS: 4687 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4688 case Op_URShiftVI: 4689 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4690 case Op_URShiftVL: 4691 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4692 case Op_RotateRightV: 4693 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4694 case Op_RotateLeftV: 4695 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4696 default: 4697 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4698 break; 4699 } 4700 } 4701 4702 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4703 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4704 if (is_unsigned) { 4705 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4706 } else { 4707 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4708 } 4709 } 4710 4711 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4712 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4713 switch (elem_bt) { 4714 case T_BYTE: 4715 if (ideal_opc 
== Op_SaturatingAddV) { 4716 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4717 } else { 4718 assert(ideal_opc == Op_SaturatingSubV, ""); 4719 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4720 } 4721 break; 4722 case T_SHORT: 4723 if (ideal_opc == Op_SaturatingAddV) { 4724 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4725 } else { 4726 assert(ideal_opc == Op_SaturatingSubV, ""); 4727 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4728 } 4729 break; 4730 default: 4731 fatal("Unsupported type %s", type2name(elem_bt)); 4732 break; 4733 } 4734 } 4735 4736 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4737 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4738 switch (elem_bt) { 4739 case T_BYTE: 4740 if (ideal_opc == Op_SaturatingAddV) { 4741 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4742 } else { 4743 assert(ideal_opc == Op_SaturatingSubV, ""); 4744 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4745 } 4746 break; 4747 case T_SHORT: 4748 if (ideal_opc == Op_SaturatingAddV) { 4749 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4750 } else { 4751 assert(ideal_opc == Op_SaturatingSubV, ""); 4752 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4753 } 4754 break; 4755 default: 4756 fatal("Unsupported type %s", type2name(elem_bt)); 4757 break; 4758 } 4759 } 4760 4761 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4762 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4763 if (is_unsigned) { 4764 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4765 } else { 4766 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4767 } 4768 } 4769 4770 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4771 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4772 switch (elem_bt) { 4773 case T_BYTE: 4774 if (ideal_opc == Op_SaturatingAddV) { 4775 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4776 } else { 4777 assert(ideal_opc == Op_SaturatingSubV, ""); 4778 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4779 } 4780 break; 4781 case T_SHORT: 4782 if (ideal_opc == Op_SaturatingAddV) { 4783 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4784 } else { 4785 assert(ideal_opc == Op_SaturatingSubV, ""); 4786 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4787 } 4788 break; 4789 default: 4790 fatal("Unsupported type %s", type2name(elem_bt)); 4791 break; 4792 } 4793 } 4794 4795 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4796 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4797 switch (elem_bt) { 4798 case T_BYTE: 4799 if (ideal_opc == Op_SaturatingAddV) { 4800 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4801 } else { 4802 assert(ideal_opc == Op_SaturatingSubV, ""); 4803 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4804 } 4805 break; 4806 case T_SHORT: 4807 if (ideal_opc == Op_SaturatingAddV) { 4808 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4809 } else { 4810 assert(ideal_opc == Op_SaturatingSubV, ""); 4811 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4812 } 4813 break; 4814 default: 4815 fatal("Unsupported type %s", type2name(elem_bt)); 4816 break; 4817 } 4818 } 4819 4820 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4821 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4822 bool is_varshift) { 4823 switch (ideal_opc) { 4824 case Op_AddVB: 4825 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_AddVS: 4827 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_AddVI: 4829 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4830 case Op_AddVL: 4831 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4832 case Op_AddVF: 4833 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_AddVD: 4835 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4836 case Op_SubVB: 4837 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_SubVS: 4839 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_SubVI: 4841 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_SubVL: 4843 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_SubVF: 4845 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_SubVD: 4847 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_MulVS: 4849 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_MulVI: 4851 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_MulVL: 4853 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_MulVF: 4855 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_MulVD: 4857 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_DivVF: 4859 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_DivVD: 4861 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_SqrtVF: 4863 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_SqrtVD: 4865 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_AbsVB: 4867 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4868 case Op_AbsVS: 4869 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4870 case Op_AbsVI: 4871 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4872 case Op_AbsVL: 4873 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4874 case Op_FmaVF: 4875 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_FmaVD: 4877 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_VectorRearrange: 4879 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4880 case Op_LShiftVS: 4881 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4882 case Op_LShiftVI: 4883 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4884 case Op_LShiftVL: 4885 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4886 case Op_RShiftVS: 4887 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4888 case Op_RShiftVI: 4889 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4890 case Op_RShiftVL: 4891 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4892 case Op_URShiftVS: 4893 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4894 case Op_URShiftVI: 4895 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4896 case Op_URShiftVL: 4897 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4898 case Op_RotateLeftV: 4899 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4900 case Op_RotateRightV: 4901 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4902 case Op_MaxV: 4903 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4904 case Op_MinV: 4905 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4906 case Op_UMinV: 4907 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4908 case Op_UMaxV: 4909 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4910 case Op_XorV: 4911 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_OrV: 4913 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_AndV: 4915 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4916 default: 4917 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4918 break; 4919 } 4920 } 4921 4922 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4923 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4924 switch (ideal_opc) { 4925 case Op_AddVB: 4926 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4927 case Op_AddVS: 4928 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4929 case Op_AddVI: 4930 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4931 case Op_AddVL: 4932 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4933 case Op_AddVF: 4934 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4935 case Op_AddVD: 4936 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4937 case Op_SubVB: 4938 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4939 case Op_SubVS: 4940 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4941 case Op_SubVI: 4942 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4943 case Op_SubVL: 4944 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4945 case Op_SubVF: 4946 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4947 case Op_SubVD: 4948 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4949 case Op_MulVS: 4950 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4951 case Op_MulVI: 4952 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_MulVL: 4954 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_MulVF: 4956 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4957 case Op_MulVD: 4958 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_DivVF: 4960 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_DivVD: 4962 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_FmaVF: 4964 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_FmaVD: 4966 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_MaxV: 4968 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4969 case Op_MinV: 4970 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4971 case Op_UMaxV: 4972 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4973 case Op_UMinV: 4974 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4975 case Op_XorV: 4976 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4977 case Op_OrV: 4978 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4979 case Op_AndV: 4980 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4981 default: 4982 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4983 break; 4984 } 4985 } 4986 4987 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4988 KRegister src1, KRegister src2) { 4989 BasicType etype = T_ILLEGAL; 4990 switch(mask_len) { 4991 case 2: 4992 case 4: 4993 case 8: etype = T_BYTE; 
break; 4994 case 16: etype = T_SHORT; break; 4995 case 32: etype = T_INT; break; 4996 case 64: etype = T_LONG; break; 4997 default: fatal("Unsupported type"); break; 4998 } 4999 assert(etype != T_ILLEGAL, ""); 5000 switch(ideal_opc) { 5001 case Op_AndVMask: 5002 kand(etype, dst, src1, src2); break; 5003 case Op_OrVMask: 5004 kor(etype, dst, src1, src2); break; 5005 case Op_XorVMask: 5006 kxor(etype, dst, src1, src2); break; 5007 default: 5008 fatal("Unsupported masked operation"); break; 5009 } 5010 } 5011 5012 /* 5013 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5014 * If src is NaN, the result is 0. 5015 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5016 * the result is equal to the value of Integer.MIN_VALUE. 5017 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5018 * the result is equal to the value of Integer.MAX_VALUE. 5019 */ 5020 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5021 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5022 Register rscratch, AddressLiteral float_sign_flip, 5023 int vec_enc) { 5024 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5025 Label done; 5026 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5027 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5028 vptest(xtmp2, xtmp2, vec_enc); 5029 jccb(Assembler::equal, done); 5030 5031 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5032 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5033 5034 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5035 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5036 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5037 5038 // Recompute the mask for remaining special value. 5039 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5040 // Extract SRC values corresponding to TRUE mask lanes. 5041 vpand(xtmp4, xtmp2, src, vec_enc); 5042 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5043 // values are set. 
5044 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5045 5046 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5047 bind(done); 5048 } 5049 5050 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5051 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5052 Register rscratch, AddressLiteral float_sign_flip, 5053 int vec_enc) { 5054 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5055 Label done; 5056 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5057 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5058 kortestwl(ktmp1, ktmp1); 5059 jccb(Assembler::equal, done); 5060 5061 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5062 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5063 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5064 5065 kxorwl(ktmp1, ktmp1, ktmp2); 5066 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5067 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5068 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5069 bind(done); 5070 } 5071 5072 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5073 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5074 Register rscratch, AddressLiteral double_sign_flip, 5075 int vec_enc) { 5076 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5077 5078 Label done; 5079 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5080 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5081 kortestwl(ktmp1, ktmp1); 5082 jccb(Assembler::equal, done); 5083 5084 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5085 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5086 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5087 5088 kxorwl(ktmp1, ktmp1, ktmp2); 5089 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5090 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5091 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5092 bind(done); 5093 } 5094 5095 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5096 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5097 Register rscratch, AddressLiteral float_sign_flip, 5098 int vec_enc) { 5099 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5100 Label done; 5101 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5102 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5103 kortestwl(ktmp1, ktmp1); 5104 jccb(Assembler::equal, done); 5105 5106 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5107 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5108 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5109 5110 kxorwl(ktmp1, ktmp1, ktmp2); 5111 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5112 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5113 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5114 bind(done); 5115 } 5116 5117 /* 5118 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5119 * If src is NaN, the result is 0. 5120 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5121 * the result is equal to the value of Long.MIN_VALUE. 5122 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5123 * the result is equal to the value of Long.MAX_VALUE. 
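 *
 * For reference, these are the Java semantics of the narrowing (long) cast; the concrete
 * inputs below are only illustrative examples:
 *   (long) Double.NaN               ==  0L
 *   (long) Double.NEGATIVE_INFINITY ==  Long.MIN_VALUE
 *   (long) -1.0e30                  ==  Long.MIN_VALUE
 *   (long) Double.POSITIVE_INFINITY ==  Long.MAX_VALUE
 *   (long)  1.0e30                  ==  Long.MAX_VALUE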
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5195 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5196 bind(done); 5197 } 5198 5199 5200 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5201 XMMRegister xtmp, Register rscratch, int vec_enc) { 5202 switch(to_elem_bt) { 5203 case T_SHORT: 5204 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5205 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5206 vpackusdw(dst, dst, zero, vec_enc); 5207 if (vec_enc == Assembler::AVX_256bit) { 5208 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5209 } 5210 break; 5211 case T_BYTE: 5212 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5213 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5214 vpackusdw(dst, dst, zero, vec_enc); 5215 if (vec_enc == Assembler::AVX_256bit) { 5216 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5217 } 5218 vpackuswb(dst, dst, zero, vec_enc); 5219 break; 5220 default: assert(false, "%s", type2name(to_elem_bt)); 5221 } 5222 } 5223 5224 /* 5225 * Algorithm for vector D2L and F2I conversions:- 5226 * a) Perform vector D2L/F2I cast. 5227 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5228 * It signifies that source value could be any of the special floating point 5229 * values(NaN,-Inf,Inf,Max,-Min). 5230 * c) Set destination to zero if source is NaN value. 5231 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5232 */ 5233 5234 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5235 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5236 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5237 int to_elem_sz = type2aelembytes(to_elem_bt); 5238 assert(to_elem_sz <= 4, ""); 5239 vcvttps2dq(dst, src, vec_enc); 5240 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5241 if (to_elem_sz < 4) { 5242 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5243 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5244 } 5245 } 5246 5247 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5248 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5249 Register rscratch, int vec_enc) { 5250 int to_elem_sz = type2aelembytes(to_elem_bt); 5251 assert(to_elem_sz <= 4, ""); 5252 vcvttps2dq(dst, src, vec_enc); 5253 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5254 switch(to_elem_bt) { 5255 case T_INT: 5256 break; 5257 case T_SHORT: 5258 evpmovdw(dst, dst, vec_enc); 5259 break; 5260 case T_BYTE: 5261 evpmovdb(dst, dst, vec_enc); 5262 break; 5263 default: assert(false, "%s", type2name(to_elem_bt)); 5264 } 5265 } 5266 5267 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5268 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5269 Register rscratch, int vec_enc) { 5270 evcvttps2qq(dst, src, vec_enc); 5271 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5272 } 5273 5274 // Handling for downcasting from double to integer or sub-word types on AVX2. 5275 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5276 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5277 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5278 int to_elem_sz = type2aelembytes(to_elem_bt); 5279 assert(to_elem_sz < 8, ""); 5280 vcvttpd2dq(dst, src, vec_enc); 5281 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5282 float_sign_flip, vec_enc); 5283 if (to_elem_sz < 4) { 5284 // xtmp4 holds all zero lanes. 5285 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5286 } 5287 } 5288 5289 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5290 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5291 KRegister ktmp2, AddressLiteral sign_flip, 5292 Register rscratch, int vec_enc) { 5293 if (VM_Version::supports_avx512dq()) { 5294 evcvttpd2qq(dst, src, vec_enc); 5295 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5296 switch(to_elem_bt) { 5297 case T_LONG: 5298 break; 5299 case T_INT: 5300 evpmovsqd(dst, dst, vec_enc); 5301 break; 5302 case T_SHORT: 5303 evpmovsqd(dst, dst, vec_enc); 5304 evpmovdw(dst, dst, vec_enc); 5305 break; 5306 case T_BYTE: 5307 evpmovsqd(dst, dst, vec_enc); 5308 evpmovdb(dst, dst, vec_enc); 5309 break; 5310 default: assert(false, "%s", type2name(to_elem_bt)); 5311 } 5312 } else { 5313 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5314 vcvttpd2dq(dst, src, vec_enc); 5315 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5316 switch(to_elem_bt) { 5317 case T_INT: 5318 break; 5319 case T_SHORT: 5320 evpmovdw(dst, dst, vec_enc); 5321 break; 5322 case T_BYTE: 5323 evpmovdb(dst, dst, vec_enc); 5324 break; 5325 default: assert(false, "%s", type2name(to_elem_bt)); 5326 } 5327 } 5328 } 5329 5330 #ifdef _LP64 5331 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5332 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5333 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5334 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5335 // and re-instantiate original MXCSR.RC mode after that. 5336 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5337 5338 mov64(tmp, julong_cast(0.5L)); 5339 evpbroadcastq(xtmp1, tmp, vec_enc); 5340 vaddpd(xtmp1, src , xtmp1, vec_enc); 5341 evcvtpd2qq(dst, xtmp1, vec_enc); 5342 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5343 double_sign_flip, vec_enc);; 5344 5345 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5346 } 5347 5348 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5349 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5350 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5351 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5352 // and re-instantiate original MXCSR.RC mode after that. 
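  // A scalar sketch of the same trick (illustrative only; the helper name and the use of
  // <cfenv>/<cmath> are not part of this file):
  //   int round_float(float x) {                // models Math.round(float)
  //     std::fesetround(FE_DOWNWARD);           // analogue of ldmxcsr(new_mxcsr)
  //     int r = (int)std::lrintf(x + 0.5f);     // convert under round-down == floor(x + 0.5f)
  //     std::fesetround(FE_TONEAREST);          // analogue of ldmxcsr(addr_mxcsr_std())
  //     return r;
  //   }
  // NaN, infinity and out-of-range lanes are patched up afterwards by
  // vector_cast_float_to_int_special_cases_evex(), just as in the cast routines above.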
5353 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5354 5355 movl(tmp, jint_cast(0.5)); 5356 movq(xtmp1, tmp); 5357 vbroadcastss(xtmp1, xtmp1, vec_enc); 5358 vaddps(xtmp1, src , xtmp1, vec_enc); 5359 vcvtps2dq(dst, xtmp1, vec_enc); 5360 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5361 float_sign_flip, vec_enc); 5362 5363 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5364 } 5365 5366 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5367 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5368 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5369 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5370 // and re-instantiate original MXCSR.RC mode after that. 5371 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5372 5373 movl(tmp, jint_cast(0.5)); 5374 movq(xtmp1, tmp); 5375 vbroadcastss(xtmp1, xtmp1, vec_enc); 5376 vaddps(xtmp1, src , xtmp1, vec_enc); 5377 vcvtps2dq(dst, xtmp1, vec_enc); 5378 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5379 5380 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5381 } 5382 #endif // _LP64 5383 5384 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5385 BasicType from_elem_bt, BasicType to_elem_bt) { 5386 switch (from_elem_bt) { 5387 case T_BYTE: 5388 switch (to_elem_bt) { 5389 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5390 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5391 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5392 default: ShouldNotReachHere(); 5393 } 5394 break; 5395 case T_SHORT: 5396 switch (to_elem_bt) { 5397 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5398 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5399 default: ShouldNotReachHere(); 5400 } 5401 break; 5402 case T_INT: 5403 assert(to_elem_bt == T_LONG, ""); 5404 vpmovzxdq(dst, src, vlen_enc); 5405 break; 5406 default: 5407 ShouldNotReachHere(); 5408 } 5409 } 5410 5411 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5412 BasicType from_elem_bt, BasicType to_elem_bt) { 5413 switch (from_elem_bt) { 5414 case T_BYTE: 5415 switch (to_elem_bt) { 5416 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5417 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5418 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5419 default: ShouldNotReachHere(); 5420 } 5421 break; 5422 case T_SHORT: 5423 switch (to_elem_bt) { 5424 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5425 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5426 default: ShouldNotReachHere(); 5427 } 5428 break; 5429 case T_INT: 5430 assert(to_elem_bt == T_LONG, ""); 5431 vpmovsxdq(dst, src, vlen_enc); 5432 break; 5433 default: 5434 ShouldNotReachHere(); 5435 } 5436 } 5437 5438 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5439 BasicType dst_bt, BasicType src_bt, int vlen) { 5440 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5441 assert(vlen_enc != AVX_512bit, ""); 5442 5443 int dst_bt_size = type2aelembytes(dst_bt); 5444 int src_bt_size = type2aelembytes(src_bt); 5445 if (dst_bt_size > src_bt_size) { 5446 switch (dst_bt_size / src_bt_size) { 5447 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5448 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5449 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5450 default: ShouldNotReachHere(); 5451 } 5452 } else { 5453 assert(dst_bt_size < src_bt_size, ""); 5454 switch (src_bt_size / dst_bt_size) { 5455 case 2: { 5456 if (vlen_enc == AVX_128bit) { 5457 vpacksswb(dst, src, src, vlen_enc); 5458 } else { 5459 vpacksswb(dst, src, src, vlen_enc); 5460 vpermq(dst, dst, 0x08, vlen_enc); 5461 } 5462 break; 5463 } 5464 case 4: { 5465 if (vlen_enc == AVX_128bit) { 5466 vpackssdw(dst, src, src, vlen_enc); 5467 vpacksswb(dst, dst, dst, vlen_enc); 5468 } else { 5469 vpackssdw(dst, src, src, vlen_enc); 5470 vpermq(dst, dst, 0x08, vlen_enc); 5471 vpacksswb(dst, dst, dst, AVX_128bit); 5472 } 5473 break; 5474 } 5475 case 8: { 5476 if (vlen_enc == AVX_128bit) { 5477 vpshufd(dst, src, 0x08, vlen_enc); 5478 vpackssdw(dst, dst, dst, vlen_enc); 5479 vpacksswb(dst, dst, dst, vlen_enc); 5480 } else { 5481 vpshufd(dst, src, 0x08, vlen_enc); 5482 vpermq(dst, dst, 0x08, vlen_enc); 5483 vpackssdw(dst, dst, dst, AVX_128bit); 5484 vpacksswb(dst, dst, dst, AVX_128bit); 5485 } 5486 break; 5487 } 5488 default: ShouldNotReachHere(); 5489 } 5490 } 5491 } 5492 5493 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5494 bool merge, BasicType bt, int vlen_enc) { 5495 if (bt == T_INT) { 5496 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5497 } else { 5498 assert(bt == T_LONG, ""); 5499 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5500 } 5501 } 5502 5503 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5504 bool merge, BasicType bt, int vlen_enc) { 5505 if (bt == T_INT) { 5506 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5507 } else { 5508 assert(bt == T_LONG, ""); 5509 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5510 } 5511 } 5512 5513 #ifdef _LP64 5514 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5515 Register rtmp2, XMMRegister xtmp, int mask_len, 5516 int vec_enc) { 5517 int index = 0; 5518 int vindex = 0; 5519 mov64(rtmp1, 0x0101010101010101L); 5520 pdepq(rtmp1, src, rtmp1); 5521 if (mask_len > 8) { 5522 movq(rtmp2, src); 5523 vpxor(xtmp, xtmp, xtmp, vec_enc); 5524 movq(xtmp, rtmp1); 5525 } 5526 movq(dst, rtmp1); 5527 5528 mask_len -= 8; 5529 while (mask_len > 0) { 5530 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5531 index++; 5532 if ((index % 2) == 0) { 5533 pxor(xtmp, xtmp); 5534 } 5535 mov64(rtmp1, 0x0101010101010101L); 5536 shrq(rtmp2, 8); 5537 pdepq(rtmp1, rtmp2, rtmp1); 5538 pinsrq(xtmp, rtmp1, index % 2); 5539 vindex = index / 2; 5540 if (vindex) { 5541 // Write entire 16 byte vector when both 64 bit 5542 // lanes are update to save redundant instructions. 
5543 if (index % 2) { 5544 vinsertf128(dst, dst, xtmp, vindex); 5545 } 5546 } else { 5547 vmovdqu(dst, xtmp); 5548 } 5549 mask_len -= 8; 5550 } 5551 } 5552 5553 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5554 switch(opc) { 5555 case Op_VectorMaskTrueCount: 5556 popcntq(dst, tmp); 5557 break; 5558 case Op_VectorMaskLastTrue: 5559 if (VM_Version::supports_lzcnt()) { 5560 lzcntq(tmp, tmp); 5561 movl(dst, 63); 5562 subl(dst, tmp); 5563 } else { 5564 movl(dst, -1); 5565 bsrq(tmp, tmp); 5566 cmov32(Assembler::notZero, dst, tmp); 5567 } 5568 break; 5569 case Op_VectorMaskFirstTrue: 5570 if (VM_Version::supports_bmi1()) { 5571 if (masklen < 32) { 5572 orl(tmp, 1 << masklen); 5573 tzcntl(dst, tmp); 5574 } else if (masklen == 32) { 5575 tzcntl(dst, tmp); 5576 } else { 5577 assert(masklen == 64, ""); 5578 tzcntq(dst, tmp); 5579 } 5580 } else { 5581 if (masklen < 32) { 5582 orl(tmp, 1 << masklen); 5583 bsfl(dst, tmp); 5584 } else { 5585 assert(masklen == 32 || masklen == 64, ""); 5586 movl(dst, masklen); 5587 if (masklen == 32) { 5588 bsfl(tmp, tmp); 5589 } else { 5590 bsfq(tmp, tmp); 5591 } 5592 cmov32(Assembler::notZero, dst, tmp); 5593 } 5594 } 5595 break; 5596 case Op_VectorMaskToLong: 5597 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5598 break; 5599 default: assert(false, "Unhandled mask operation"); 5600 } 5601 } 5602 5603 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5604 int masklen, int masksize, int vec_enc) { 5605 assert(VM_Version::supports_popcnt(), ""); 5606 5607 if(VM_Version::supports_avx512bw()) { 5608 kmovql(tmp, mask); 5609 } else { 5610 assert(masklen <= 16, ""); 5611 kmovwl(tmp, mask); 5612 } 5613 5614 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5615 // operations needs to be clipped. 5616 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5617 andq(tmp, (1 << masklen) - 1); 5618 } 5619 5620 vector_mask_operation_helper(opc, dst, tmp, masklen); 5621 } 5622 5623 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5624 Register tmp, int masklen, BasicType bt, int vec_enc) { 5625 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5626 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5627 assert(VM_Version::supports_popcnt(), ""); 5628 5629 bool need_clip = false; 5630 switch(bt) { 5631 case T_BOOLEAN: 5632 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5633 vpxor(xtmp, xtmp, xtmp, vec_enc); 5634 vpsubb(xtmp, xtmp, mask, vec_enc); 5635 vpmovmskb(tmp, xtmp, vec_enc); 5636 need_clip = masklen < 16; 5637 break; 5638 case T_BYTE: 5639 vpmovmskb(tmp, mask, vec_enc); 5640 need_clip = masklen < 16; 5641 break; 5642 case T_SHORT: 5643 vpacksswb(xtmp, mask, mask, vec_enc); 5644 if (masklen >= 16) { 5645 vpermpd(xtmp, xtmp, 8, vec_enc); 5646 } 5647 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5648 need_clip = masklen < 16; 5649 break; 5650 case T_INT: 5651 case T_FLOAT: 5652 vmovmskps(tmp, mask, vec_enc); 5653 need_clip = masklen < 4; 5654 break; 5655 case T_LONG: 5656 case T_DOUBLE: 5657 vmovmskpd(tmp, mask, vec_enc); 5658 need_clip = masklen < 2; 5659 break; 5660 default: assert(false, "Unhandled type, %s", type2name(bt)); 5661 } 5662 5663 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5664 // operations needs to be clipped. 
5665 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5666 // need_clip implies masklen < 32 5667 andq(tmp, (1 << masklen) - 1); 5668 } 5669 5670 vector_mask_operation_helper(opc, dst, tmp, masklen); 5671 } 5672 5673 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5674 Register rtmp2, int mask_len) { 5675 kmov(rtmp1, src); 5676 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5677 mov64(rtmp2, -1L); 5678 pextq(rtmp2, rtmp2, rtmp1); 5679 kmov(dst, rtmp2); 5680 } 5681 5682 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5683 XMMRegister mask, Register rtmp, Register rscratch, 5684 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5685 int vec_enc) { 5686 assert(type2aelembytes(bt) >= 4, ""); 5687 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5688 address compress_perm_table = nullptr; 5689 address expand_perm_table = nullptr; 5690 if (type2aelembytes(bt) == 8) { 5691 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5692 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5693 vmovmskpd(rtmp, mask, vec_enc); 5694 } else { 5695 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5696 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5697 vmovmskps(rtmp, mask, vec_enc); 5698 } 5699 shlq(rtmp, 5); // for 32 byte permute row. 5700 if (opcode == Op_CompressV) { 5701 lea(rscratch, ExternalAddress(compress_perm_table)); 5702 } else { 5703 lea(rscratch, ExternalAddress(expand_perm_table)); 5704 } 5705 addptr(rtmp, rscratch); 5706 vmovdqu(permv, Address(rtmp)); 5707 vpermps(dst, permv, src, Assembler::AVX_256bit); 5708 vpxor(xtmp, xtmp, xtmp, vec_enc); 5709 // Blend the result with zero vector using permute mask, each column entry 5710 // in a permute table row contains either a valid permute index or a -1 (default) 5711 // value, this can potentially be used as a blending mask after 5712 // compressing/expanding the source vector lanes. 
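  // For instance (illustrative layout, 32-bit lanes): for mask 0b00000101 a compress row
  // would be {0, 2, -1, -1, -1, -1, -1, -1}; vpermps gathers lanes 0 and 2 to the front,
  // and the -1 entries (sign bit set) select the zero vector in the blend below.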
5713 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5714 } 5715 5716 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5717 bool merge, BasicType bt, int vec_enc) { 5718 if (opcode == Op_CompressV) { 5719 switch(bt) { 5720 case T_BYTE: 5721 evpcompressb(dst, mask, src, merge, vec_enc); 5722 break; 5723 case T_CHAR: 5724 case T_SHORT: 5725 evpcompressw(dst, mask, src, merge, vec_enc); 5726 break; 5727 case T_INT: 5728 evpcompressd(dst, mask, src, merge, vec_enc); 5729 break; 5730 case T_FLOAT: 5731 evcompressps(dst, mask, src, merge, vec_enc); 5732 break; 5733 case T_LONG: 5734 evpcompressq(dst, mask, src, merge, vec_enc); 5735 break; 5736 case T_DOUBLE: 5737 evcompresspd(dst, mask, src, merge, vec_enc); 5738 break; 5739 default: 5740 fatal("Unsupported type %s", type2name(bt)); 5741 break; 5742 } 5743 } else { 5744 assert(opcode == Op_ExpandV, ""); 5745 switch(bt) { 5746 case T_BYTE: 5747 evpexpandb(dst, mask, src, merge, vec_enc); 5748 break; 5749 case T_CHAR: 5750 case T_SHORT: 5751 evpexpandw(dst, mask, src, merge, vec_enc); 5752 break; 5753 case T_INT: 5754 evpexpandd(dst, mask, src, merge, vec_enc); 5755 break; 5756 case T_FLOAT: 5757 evexpandps(dst, mask, src, merge, vec_enc); 5758 break; 5759 case T_LONG: 5760 evpexpandq(dst, mask, src, merge, vec_enc); 5761 break; 5762 case T_DOUBLE: 5763 evexpandpd(dst, mask, src, merge, vec_enc); 5764 break; 5765 default: 5766 fatal("Unsupported type %s", type2name(bt)); 5767 break; 5768 } 5769 } 5770 } 5771 #endif 5772 5773 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5774 KRegister ktmp1, int vec_enc) { 5775 if (opcode == Op_SignumVD) { 5776 vsubpd(dst, zero, one, vec_enc); 5777 // if src < 0 ? -1 : 1 5778 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5779 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5780 // if src == NaN, -0.0 or 0.0 return src. 5781 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5782 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5783 } else { 5784 assert(opcode == Op_SignumVF, ""); 5785 vsubps(dst, zero, one, vec_enc); 5786 // if src < 0 ? -1 : 1 5787 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5788 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5789 // if src == NaN, -0.0 or 0.0 return src. 5790 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5791 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5792 } 5793 } 5794 5795 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5796 XMMRegister xtmp1, int vec_enc) { 5797 if (opcode == Op_SignumVD) { 5798 vsubpd(dst, zero, one, vec_enc); 5799 // if src < 0 ? -1 : 1 5800 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5801 // if src == NaN, -0.0 or 0.0 return src. 5802 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5803 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5804 } else { 5805 assert(opcode == Op_SignumVF, ""); 5806 vsubps(dst, zero, one, vec_enc); 5807 // if src < 0 ? -1 : 1 5808 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5809 // if src == NaN, -0.0 or 0.0 return src. 
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  }
}

//
// Following is lookup table based popcount computation algorithm:-
//           Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute sum of absolute difference of bitset
//     count of all the bytes of a quadword.
//  f. Perform step e. for upper 128bit vector lane.
//  g. Pack the bitset count of quadwords back to double word.
//  h. Unpacking and packing operations are not needed for 64bit vector lane.
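// A scalar model of steps a. through d. (illustrative sketch only; the helper and table
// names below are made up and not used by the build):
//   static const uint8_t nibble_popcount_lut[16] =
//       { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   static uint8_t popcount_byte(uint8_t b) {
//     return nibble_popcount_lut[b & 0x0F] + nibble_popcount_lut[b >> 4];
//   }
// The vector code below performs the same two lookups with vpshufb against
// StubRoutines::x86::vector_popcount_lut(), producing one result per byte lane.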
5892 5893 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5894 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5895 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5896 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5897 vpsrlw(dst, src, 4, vec_enc); 5898 vpand(dst, dst, xtmp1, vec_enc); 5899 vpand(xtmp1, src, xtmp1, vec_enc); 5900 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5901 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5902 vpshufb(dst, xtmp2, dst, vec_enc); 5903 vpaddb(dst, dst, xtmp1, vec_enc); 5904 } 5905 5906 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5907 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5908 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5909 // Following code is as per steps e,f,g and h of above algorithm. 5910 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5911 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5912 vpsadbw(dst, dst, xtmp2, vec_enc); 5913 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5914 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5915 vpackuswb(dst, xtmp1, dst, vec_enc); 5916 } 5917 5918 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5919 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5920 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5921 // Add the popcount of upper and lower bytes of word. 5922 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5923 vpsrlw(dst, xtmp1, 8, vec_enc); 5924 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5925 vpaddw(dst, dst, xtmp1, vec_enc); 5926 } 5927 5928 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5929 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5930 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5931 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5932 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5933 } 5934 5935 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5936 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5937 switch(bt) { 5938 case T_LONG: 5939 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5940 break; 5941 case T_INT: 5942 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5943 break; 5944 case T_CHAR: 5945 case T_SHORT: 5946 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5947 break; 5948 case T_BYTE: 5949 case T_BOOLEAN: 5950 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5951 break; 5952 default: 5953 fatal("Unsupported type %s", type2name(bt)); 5954 break; 5955 } 5956 } 5957 5958 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5959 KRegister mask, bool merge, int vec_enc) { 5960 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5961 switch(bt) { 5962 case T_LONG: 5963 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5964 evpopcntq(dst, mask, src, merge, vec_enc); 5965 break; 5966 case T_INT: 5967 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5968 evpopcntd(dst, mask, src, merge, vec_enc); 5969 break; 5970 case T_CHAR: 5971 case T_SHORT: 5972 assert(VM_Version::supports_avx512_bitalg(), ""); 5973 evpopcntw(dst, mask, src, merge, vec_enc); 5974 break; 5975 case T_BYTE: 5976 case T_BOOLEAN: 5977 assert(VM_Version::supports_avx512_bitalg(), ""); 5978 evpopcntb(dst, mask, 
src, merge, vec_enc); 5979 break; 5980 default: 5981 fatal("Unsupported type %s", type2name(bt)); 5982 break; 5983 } 5984 } 5985 5986 #ifndef _LP64 5987 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5988 assert(VM_Version::supports_avx512bw(), ""); 5989 kmovdl(tmp, src); 5990 kunpckdql(dst, tmp, tmp); 5991 } 5992 #endif 5993 5994 // Bit reversal algorithm first reverses the bits of each byte followed by 5995 // a byte level reversal for multi-byte primitive types (short/int/long). 5996 // Algorithm performs a lookup table access to get reverse bit sequence 5997 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5998 // is obtained by swapping the reverse bit sequences of upper and lower 5999 // nibble of a byte. 6000 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6001 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6002 if (VM_Version::supports_avx512vlbw()) { 6003 6004 // Get the reverse bit sequence of lower nibble of each byte. 6005 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6006 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6007 evpandq(dst, xtmp2, src, vec_enc); 6008 vpshufb(dst, xtmp1, dst, vec_enc); 6009 vpsllq(dst, dst, 4, vec_enc); 6010 6011 // Get the reverse bit sequence of upper nibble of each byte. 6012 vpandn(xtmp2, xtmp2, src, vec_enc); 6013 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6014 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6015 6016 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6017 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6018 evporq(xtmp2, dst, xtmp2, vec_enc); 6019 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6020 6021 } else if(vec_enc == Assembler::AVX_512bit) { 6022 // Shift based bit reversal. 6023 assert(bt == T_LONG || bt == T_INT, ""); 6024 6025 // Swap lower and upper nibble of each byte. 6026 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6027 6028 // Swap two least and most significant bits of each nibble. 6029 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6030 6031 // Swap adjacent pair of bits. 6032 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6033 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6034 6035 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6036 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6037 } else { 6038 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6039 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6040 6041 // Get the reverse bit sequence of lower nibble of each byte. 6042 vpand(dst, xtmp2, src, vec_enc); 6043 vpshufb(dst, xtmp1, dst, vec_enc); 6044 vpsllq(dst, dst, 4, vec_enc); 6045 6046 // Get the reverse bit sequence of upper nibble of each byte. 6047 vpandn(xtmp2, xtmp2, src, vec_enc); 6048 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6049 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6050 6051 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6052 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
6053 vpor(xtmp2, dst, xtmp2, vec_enc); 6054 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6055 } 6056 } 6057 6058 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6059 XMMRegister xtmp, Register rscratch) { 6060 assert(VM_Version::supports_gfni(), ""); 6061 assert(rscratch != noreg || always_reachable(mask), "missing"); 6062 6063 // Galois field instruction based bit reversal based on following algorithm. 6064 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6065 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6066 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6067 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6068 } 6069 6070 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6071 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6072 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6073 evpandq(dst, xtmp1, src, vec_enc); 6074 vpsllq(dst, dst, nbits, vec_enc); 6075 vpandn(xtmp1, xtmp1, src, vec_enc); 6076 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6077 evporq(dst, dst, xtmp1, vec_enc); 6078 } 6079 6080 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6081 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6082 // Shift based bit reversal. 6083 assert(VM_Version::supports_evex(), ""); 6084 switch(bt) { 6085 case T_LONG: 6086 // Swap upper and lower double word of each quad word. 6087 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6088 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6089 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6090 break; 6091 case T_INT: 6092 // Swap upper and lower word of each double word. 6093 evprord(xtmp1, k0, src, 16, true, vec_enc); 6094 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6095 break; 6096 case T_CHAR: 6097 case T_SHORT: 6098 // Swap upper and lower byte of each word. 6099 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6100 break; 6101 case T_BYTE: 6102 evmovdquq(dst, k0, src, true, vec_enc); 6103 break; 6104 default: 6105 fatal("Unsupported type %s", type2name(bt)); 6106 break; 6107 } 6108 } 6109 6110 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6111 if (bt == T_BYTE) { 6112 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6113 evmovdquq(dst, k0, src, true, vec_enc); 6114 } else { 6115 vmovdqu(dst, src); 6116 } 6117 return; 6118 } 6119 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6120 // pre-computed shuffle indices. 
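  // For example, for T_INT every 32-bit lane is byte-swapped; a scalar model of what the
  // vpshufb with the pre-computed control achieves per lane (illustrative only, not a VM
  // helper):
  //   static uint32_t reverse_bytes_u32(uint32_t x) {
  //     return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
  //            ((x << 8) & 0x00FF0000u) | (x << 24);
  //   }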
6121 switch(bt) { 6122 case T_LONG: 6123 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6124 break; 6125 case T_INT: 6126 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6127 break; 6128 case T_CHAR: 6129 case T_SHORT: 6130 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6131 break; 6132 default: 6133 fatal("Unsupported type %s", type2name(bt)); 6134 break; 6135 } 6136 vpshufb(dst, src, dst, vec_enc); 6137 } 6138 6139 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6140 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6141 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6142 assert(is_integral_type(bt), ""); 6143 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6144 assert(VM_Version::supports_avx512cd(), ""); 6145 switch(bt) { 6146 case T_LONG: 6147 evplzcntq(dst, ktmp, src, merge, vec_enc); 6148 break; 6149 case T_INT: 6150 evplzcntd(dst, ktmp, src, merge, vec_enc); 6151 break; 6152 case T_SHORT: 6153 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6154 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6155 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6156 vpunpckhwd(dst, xtmp1, src, vec_enc); 6157 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6158 vpackusdw(dst, xtmp2, dst, vec_enc); 6159 break; 6160 case T_BYTE: 6161 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6162 // accessing the lookup table. 6163 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6164 // accessing the lookup table. 6165 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6166 assert(VM_Version::supports_avx512bw(), ""); 6167 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6168 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6169 vpand(xtmp2, dst, src, vec_enc); 6170 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6171 vpsrlw(xtmp3, src, 4, vec_enc); 6172 vpand(xtmp3, dst, xtmp3, vec_enc); 6173 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6174 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6175 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6176 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6177 break; 6178 default: 6179 fatal("Unsupported type %s", type2name(bt)); 6180 break; 6181 } 6182 } 6183 6184 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6185 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6186 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6187 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6188 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6189 // accessing the lookup table. 6190 vpand(dst, xtmp2, src, vec_enc); 6191 vpshufb(dst, xtmp1, dst, vec_enc); 6192 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6193 // accessing the lookup table. 6194 vpsrlw(xtmp3, src, 4, vec_enc); 6195 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6196 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6197 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
6198 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6199 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6200 vpaddb(dst, dst, xtmp2, vec_enc); 6201 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6202 } 6203 6204 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6205 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6206 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6207 // Add zero counts of lower byte and upper byte of a word if 6208 // upper byte holds a zero value. 6209 vpsrlw(xtmp3, src, 8, vec_enc); 6210 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6211 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6212 vpsllw(xtmp2, dst, 8, vec_enc); 6213 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6214 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6215 vpsrlw(dst, dst, 8, vec_enc); 6216 } 6217 6218 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6219 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6220 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6221 // hence biased exponent can be used to compute leading zero count as per 6222 // following formula:- 6223 // LZCNT = 31 - (biased_exp - 127) 6224 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6225 6226 // Broadcast 0xFF 6227 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6228 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6229 6230 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6231 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6232 // contributes to the leading number of zeros. 6233 vpsrld(xtmp2, src, 1, vec_enc); 6234 vpandn(xtmp3, xtmp2, src, vec_enc); 6235 6236 // Extract biased exponent. 6237 vcvtdq2ps(dst, xtmp3, vec_enc); 6238 vpsrld(dst, dst, 23, vec_enc); 6239 vpand(dst, dst, xtmp1, vec_enc); 6240 6241 // Broadcast 127. 6242 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6243 // Exponent = biased_exp - 127 6244 vpsubd(dst, dst, xtmp1, vec_enc); 6245 6246 // Exponent_plus_one = Exponent + 1 6247 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6248 vpaddd(dst, dst, xtmp3, vec_enc); 6249 6250 // Replace -ve exponent with zero, exponent is -ve when src 6251 // lane contains a zero value. 6252 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6253 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6254 6255 // Rematerialize broadcast 32. 6256 vpslld(xtmp1, xtmp3, 5, vec_enc); 6257 // Exponent is 32 if corresponding source lane contains max_int value. 6258 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6259 // LZCNT = 32 - exponent_plus_one 6260 vpsubd(dst, xtmp1, dst, vec_enc); 6261 6262 // Replace LZCNT with a value 1 if corresponding source lane 6263 // contains max_int value. 6264 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6265 6266 // Replace biased_exp with 0 if source lane value is less than zero. 6267 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6268 vblendvps(dst, dst, xtmp2, src, vec_enc); 6269 } 6270 6271 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6272 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6273 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6274 // Add zero counts of lower word and upper word of a double word if 6275 // upper word holds a zero value. 
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrld(dst, dst, 16, vec_enc);
  // Add zero counts of lower doubleword and upper doubleword of a
  // quadword if upper doubleword holds a zero value.
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count operation as per the
// following equation. All AVX3 targets support the AVX512CD feature, which offers a direct
// vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
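// Worked example of the identity (8-bit lane for brevity):
//   x            = 0b00101000   (CTZ = 3)
//   x - 1        = 0b00100111
//   (x - 1) & ~x = 0b00000111   (isolates the trailing-zero run)
//   CLZ == 5, so CTZ == 8 - 5 == 3.
// For x == 0 the masked value is all ones, CLZ == 0 and CTZ == lane width, as required.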
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
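  // Illustration (not emitted code): when divisor < 0, i.e. its unsigned value is >= 2^31,
  // the unsigned quotient can only be 0 or 1, so the division reduces to an unsigned comparison:
  //   quotient  = (dividend >=u divisor) ? 1 : 0
  //   remainder = dividend - (quotient != 0 ? divisor : 0)
  // The bit trick above computes that comparison branchlessly.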
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
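    // Illustrative sketch of the three swap steps on a single byte (not emitted code):
    //   b = 0b01100001
    //   after bit swap        : 0b10010010
    //   after 2-bit pair swap : 0b01101000
    //   after nibble swap     : 0b10000110  == bit-reverse of b
    // The trailing bswap then reverses the byte order so the whole register is bit-reversed.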
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the selected element is determined by the lower
  // 4 bits of each shuffle index, so all shuffle indices are effectively normalized to the
  // range 0-15. As a consequence, shuffle indices that differ by a multiple of 16 (e.g. 0, 16,
  // 32 and 48) select the same relative position within a 128-bit lane.
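  // Sketch of the lane-selection strategy used below (illustration only, assuming a 512-bit
  // vector, i.e. byte indices 0-63):
  //   for k in {0, 1, 2, 3}:
  //     mask_k = (16*k <= index) && (index < 16*(k+1))   // one kmask per 128-bit source lane
  //     bcast  = source lane k broadcast to all lanes    // evshufi64x2 imm 0x0/0x55/0xAA/0xFF
  //     dst    = mask_k ? vpshufb(bcast, shuffle) : dst
  // For narrower vectors the upper index ranges simply never match.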
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
  // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    case Op_MaxHF: vmaxsh(dst, src1, src2); break;
    case Op_MinHF: vminsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
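
// The signed forms above clamp to the element type's range; for example, with byte lanes
// vpaddsb yields 120 + 20 == 127 and vpsubsb yields -120 - 20 == -128. The unsigned forms
// below clamp to [0, 255] and [0, 65535] instead.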

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // The unsigned value range comprises only non-negative numbers, thus only upper-bound saturation exists.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // Res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for saturating
// unsigned addition:
// overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
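//
// Worked example with 8-bit lanes (illustration only):
//   a = 200, b = 100:  a + b == 44 (mod 256),  a | b == 236,  44 <u 236  => overflow
//   a = 3,   b = 5:    a + b == 8,             a | b == 7,    8 >=u 7    => no overflow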
//

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to a signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to a signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
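    // For illustration: 0xFFFFFFFF << 31 == 0x80000000, i.e. Integer.MIN_VALUE
    // (the T_LONG branch above likewise yields 0x8000000000000000, i.e. Long.MIN_VALUE).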
    vpslld(dst, allones, 31, vlen_enc);
  }
}

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have the same polarity and the result polarity differs from them.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
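  // Rationale (for illustration): when an addition overflows, both inputs have the same sign and
  // the result should clamp to MAX_VALUE for non-negative inputs and to MIN_VALUE for negative
  // ones; when a subtraction overflows, the clamp direction likewise follows the sign of the
  // first input. Hence the polarity of src1 alone selects the saturating constant per lane.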
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}

void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have the same polarity and the result polarity differs from them.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
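  // The mask used here is the sign-extended overflow predicate computed above. For illustration,
  // in the addition case with 8-bit values: src1 = 100, src2 = 50 gives res = -106, and both
  // (res ^ src1) and (res ^ src2) have the sign bit set, so the lane is replaced by the
  // saturated value; for src1 = 100, src2 = -50, res = 50 and (res ^ src1) has a clear sign bit,
  // so the lane keeps the plain result.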
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}