1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. These methods would accept arguments as 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 188 // indications in the icc.ZFlag. 
fast_lock and fast_unlock would simply 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 190 // In practice, however, the # of lock sites is bounded and is usually small. 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 192 // if the processor uses simple bimodal branch predictors keyed by EIP 193 // Since the helper routines would be called from multiple synchronization 194 // sites. 195 // 196 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 198 // to those specialized methods. That'd give us a mostly platform-independent 199 // implementation that the JITs could optimize and inline at their pleasure. 200 // Done correctly, the only time we'd need to cross to native could would be 201 // to park() or unpark() threads. We'd also need a few more unsafe operators 202 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 203 // (b) explicit barriers or fence operations. 204 // 205 // TODO: 206 // 207 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr). 208 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals. 209 // Given TLAB allocation, Self is usually manifested in a register, so passing it into 210 // the lock operators would typically be faster than reifying Self. 211 // 212 // * Ideally I'd define the primitives as: 213 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. 214 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED 215 // Unfortunately ADLC bugs prevent us from expressing the ideal form. 216 // Instead, we're stuck with a rather awkward and brittle register assignments below. 217 // Furthermore the register assignments are overconstrained, possibly resulting in 218 // sub-optimal code near the synchronization site. 219 // 220 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. 221 // Alternately, use a better sp-proximity test. 222 // 223 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 224 // Either one is sufficient to uniquely identify a thread. 225 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 226 // 227 // * Intrinsify notify() and notifyAll() for the common cases where the 228 // object is locked by the calling thread but the waitlist is empty. 229 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 230 // 231 // * use jccb and jmpb instead of jcc and jmp to improve code density. 232 // But beware of excessive branch density on AMD Opterons. 233 // 234 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 235 // or failure of the fast path. If the fast path fails then we pass 236 // control to the slow path, typically in C. In fast_lock and 237 // fast_unlock we often branch to DONE_LABEL, just to find that C2 238 // will emit a conditional branch immediately after the node. 239 // So we have branches to branches and lots of ICC.ZF games. 240 // Instead, it might be better to have C2 pass a "FailureLabel" 241 // into fast_lock and fast_unlock. In the case of success, control 242 // will drop through the node. ICC.ZF is undefined at exit. 
243 // In the case of failure, the node will branch directly to the 244 // FailureLabel 245 246 247 // obj: object to lock 248 // box: on-stack box address (displaced header location) - KILLED 249 // rax,: tmp -- KILLED 250 // scr: tmp -- KILLED 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 252 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 253 Metadata* method_data) { 254 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 255 // Ensure the register assignments are disjoint 256 assert(tmpReg == rax, ""); 257 assert(cx1Reg == noreg, ""); 258 assert(cx2Reg == noreg, ""); 259 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 260 261 // Possible cases that we'll encounter in fast_lock 262 // ------------------------------------------------ 263 // * Inflated 264 // -- unlocked 265 // -- Locked 266 // = by self 267 // = by other 268 // * neutral 269 // * stack-locked 270 // -- by self 271 // = sp-proximity test hits 272 // = sp-proximity test generates false-negative 273 // -- by other 274 // 275 276 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 277 278 if (DiagnoseSyncOnValueBasedClasses != 0) { 279 load_klass(tmpReg, objReg, scrReg); 280 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 281 jcc(Assembler::notZero, DONE_LABEL); 282 } 283 284 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 285 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 286 jcc(Assembler::notZero, IsInflated); 287 288 if (LockingMode == LM_MONITOR) { 289 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 290 testptr(objReg, objReg); 291 } else { 292 assert(LockingMode == LM_LEGACY, "must be"); 293 // Attempt stack-locking ... 294 orptr (tmpReg, markWord::unlocked_value); 295 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 296 lock(); 297 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 298 jcc(Assembler::equal, COUNT); // Success 299 300 // Recursive locking. 301 // The object is stack-locked: markword contains stack pointer to BasicLock. 302 // Locked by current thread if difference with current SP is less than one page. 303 subptr(tmpReg, rsp); 304 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 305 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) ); 306 movptr(Address(boxReg, 0), tmpReg); 307 } 308 jmp(DONE_LABEL); 309 310 bind(IsInflated); 311 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 312 313 #ifndef _LP64 314 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 315 orl(boxReg, 1); // set ICC.ZF=0 to indicate failure 316 #else 317 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 318 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 319 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 320 321 // It's inflated and we use scrReg for ObjectMonitor* in this section. 322 movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset())); 323 movq(scrReg, tmpReg); 324 xorq(tmpReg, tmpReg); 325 lock(); 326 cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 327 328 // Propagate ICC.ZF from CAS above into DONE_LABEL. 
329 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 330 331 cmpptr(boxReg, rax); // Check if we are already the owner (recursive lock) 332 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 333 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 334 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 335 #endif // _LP64 336 bind(DONE_LABEL); 337 338 // ZFlag == 1 count in fast path 339 // ZFlag == 0 count in slow path 340 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 341 342 bind(COUNT); 343 if (LockingMode == LM_LEGACY) { 344 #ifdef _LP64 345 // Count monitors in fast path 346 increment(Address(thread, JavaThread::held_monitor_count_offset())); 347 #endif 348 } 349 xorl(tmpReg, tmpReg); // Set ZF == 1 350 351 bind(NO_COUNT); 352 353 // At NO_COUNT the icc ZFlag is set as follows ... 354 // fast_unlock uses the same protocol. 355 // ZFlag == 1 -> Success 356 // ZFlag == 0 -> Failure - force control through the slow path 357 } 358 359 // obj: object to unlock 360 // box: box address (displaced header location), killed. Must be EAX. 361 // tmp: killed, cannot be obj nor box. 362 // 363 // Some commentary on balanced locking: 364 // 365 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 366 // Methods that don't have provably balanced locking are forced to run in the 367 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 368 // The interpreter provides two properties: 369 // I1: At return-time the interpreter automatically and quietly unlocks any 370 // objects acquired the current activation (frame). Recall that the 371 // interpreter maintains an on-stack list of locks currently held by 372 // a frame. 373 // I2: If a method attempts to unlock an object that is not held by the 374 // the frame the interpreter throws IMSX. 375 // 376 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 377 // B() doesn't have provably balanced locking so it runs in the interpreter. 378 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 379 // is still locked by A(). 380 // 381 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 382 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 383 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 384 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 385 // Arguably given that the spec legislates the JNI case as undefined our implementation 386 // could reasonably *avoid* checking owner in fast_unlock(). 387 // In the interest of performance we elide m->Owner==Self check in unlock. 388 // A perfectly viable alternative is to elide the owner check except when 389 // Xcheck:jni is enabled. 
390 391 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 392 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 393 assert(boxReg == rax, ""); 394 assert_different_registers(objReg, boxReg, tmpReg); 395 396 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 397 398 if (LockingMode == LM_LEGACY) { 399 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 400 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 401 } 402 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 403 if (LockingMode != LM_MONITOR) { 404 testptr(tmpReg, markWord::monitor_value); // Inflated? 405 jcc(Assembler::zero, Stacked); 406 } 407 408 // It's inflated. 409 410 #ifndef _LP64 411 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 412 orl(boxReg, 1); // set ICC.ZF=0 to indicate failure 413 jmpb(DONE_LABEL); 414 #else 415 // Despite our balanced locking property we still check that m->_owner == Self 416 // as java routines or native JNI code called by this thread might 417 // have released the lock. 418 // Refer to the comments in synchronizer.cpp for how we might encode extra 419 // state in _succ so we can avoid fetching EntryList|cxq. 420 // 421 // If there's no contention try a 1-0 exit. That is, exit without 422 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 423 // we detect and recover from the race that the 1-0 exit admits. 424 // 425 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 426 // before it STs null into _owner, releasing the lock. Updates 427 // to data protected by the critical section must be visible before 428 // we drop the lock (and thus before any other thread could acquire 429 // the lock and observe the fields protected by the lock). 430 // IA32's memory-model is SPO, so STs are ordered with respect to 431 // each other and there's no need for an explicit barrier (fence). 432 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 433 Label LSuccess, LNotRecursive; 434 435 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 436 jccb(Assembler::equal, LNotRecursive); 437 438 // Recursive inflated unlock 439 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 440 jmpb(LSuccess); 441 442 bind(LNotRecursive); 443 444 // Set owner to null. 445 // Release to satisfy the JMM 446 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 447 // We need a full fence after clearing owner to avoid stranding. 448 // StoreLoad achieves this. 449 membar(StoreLoad); 450 451 // Check if the entry lists are empty (EntryList first - by convention). 452 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 453 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 454 jccb(Assembler::zero, LSuccess); // If so we are done. 455 456 // Check if there is a successor. 457 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 458 jccb(Assembler::notZero, LSuccess); // If so we are done. 459 460 // Save the monitor pointer in the current thread, so we can try to 461 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 
462 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 463 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 464 465 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 466 jmpb (DONE_LABEL); 467 468 bind (LSuccess); 469 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 470 jmpb (DONE_LABEL); 471 #endif // _LP64 472 473 if (LockingMode == LM_LEGACY) { 474 bind (Stacked); 475 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 476 lock(); 477 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 478 // Intentional fall-thru into DONE_LABEL 479 } 480 481 bind(DONE_LABEL); 482 483 // ZFlag == 1 count in fast path 484 // ZFlag == 0 count in slow path 485 jccb(Assembler::notZero, NO_COUNT); 486 487 bind(COUNT); 488 489 if (LockingMode == LM_LEGACY) { 490 // Count monitors in fast path 491 #ifdef _LP64 492 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 493 #endif 494 } 495 496 xorl(tmpReg, tmpReg); // Set ZF == 1 497 498 bind(NO_COUNT); 499 } 500 501 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 502 Register t, Register thread) { 503 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 504 assert(rax_reg == rax, "Used for CAS"); 505 assert_different_registers(obj, box, rax_reg, t, thread); 506 507 // Handle inflated monitor. 508 Label inflated; 509 // Finish fast lock successfully. ZF value is irrelevant. 510 Label locked; 511 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 512 Label slow_path; 513 514 if (UseObjectMonitorTable) { 515 // Clear cache in case fast locking succeeds. 516 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 517 } 518 519 if (DiagnoseSyncOnValueBasedClasses != 0) { 520 load_klass(rax_reg, obj, t); 521 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 522 jcc(Assembler::notZero, slow_path); 523 } 524 525 const Register mark = t; 526 527 { // Lightweight Lock 528 529 Label push; 530 531 const Register top = UseObjectMonitorTable ? rax_reg : box; 532 533 // Load the mark. 534 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 535 536 // Prefetch top. 537 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 538 539 // Check for monitor (0b10). 540 testptr(mark, markWord::monitor_value); 541 jcc(Assembler::notZero, inflated); 542 543 // Check if lock-stack is full. 544 cmpl(top, LockStack::end_offset() - 1); 545 jcc(Assembler::greater, slow_path); 546 547 // Check if recursive. 548 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 549 jccb(Assembler::equal, push); 550 551 // Try to lock. Transition lock bits 0b01 => 0b00 552 movptr(rax_reg, mark); 553 orptr(rax_reg, markWord::unlocked_value); 554 andptr(mark, ~(int32_t)markWord::unlocked_value); 555 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 556 jcc(Assembler::notEqual, slow_path); 557 558 if (UseObjectMonitorTable) { 559 // Need to reload top, clobbered by CAS. 560 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 561 } 562 bind(push); 563 // After successful lock, push object on lock-stack. 564 movptr(Address(thread, top), obj); 565 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 566 jmpb(locked); 567 } 568 569 { // Handle inflated monitor. 570 bind(inflated); 571 572 #ifndef _LP64 573 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 
574 orl(box, 1); // set ICC.ZF=0 to indicate failure 575 jmpb(slow_path); 576 #else 577 const Register monitor = t; 578 579 if (!UseObjectMonitorTable) { 580 assert(mark == monitor, "should be the same here"); 581 } else { 582 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 583 // Fetch ObjectMonitor* from the cache or take the slow-path. 584 Label monitor_found; 585 586 // Load cache address 587 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 588 589 const int num_unrolled = 2; 590 for (int i = 0; i < num_unrolled; i++) { 591 cmpptr(obj, Address(t)); 592 jccb(Assembler::equal, monitor_found); 593 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 594 } 595 596 Label loop; 597 598 // Search for obj in cache. 599 bind(loop); 600 601 // Check for match. 602 cmpptr(obj, Address(t)); 603 jccb(Assembler::equal, monitor_found); 604 605 // Search until null encountered, guaranteed _null_sentinel at end. 606 cmpptr(Address(t), 1); 607 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 608 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 609 jmpb(loop); 610 611 // Cache hit. 612 bind(monitor_found); 613 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 614 } 615 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 616 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 617 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 618 619 Label monitor_locked; 620 // Lock the monitor. 621 622 if (UseObjectMonitorTable) { 623 // Cache the monitor for unlock before trashing box. On failure to acquire 624 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 625 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 626 } 627 628 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 629 xorptr(rax_reg, rax_reg); 630 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 631 lock(); cmpxchgptr(box, owner_address); 632 jccb(Assembler::equal, monitor_locked); 633 634 // Check if recursive. 635 cmpptr(box, rax_reg); 636 jccb(Assembler::notEqual, slow_path); 637 638 // Recursive. 639 increment(recursions_address); 640 641 bind(monitor_locked); 642 #endif // _LP64 643 } 644 645 bind(locked); 646 // Set ZF = 1 647 xorl(rax_reg, rax_reg); 648 649 #ifdef ASSERT 650 // Check that locked label is reached with ZF set. 651 Label zf_correct; 652 Label zf_bad_zero; 653 jcc(Assembler::zero, zf_correct); 654 jmp(zf_bad_zero); 655 #endif 656 657 bind(slow_path); 658 #ifdef ASSERT 659 // Check that slow_path label is reached with ZF not set. 660 jcc(Assembler::notZero, zf_correct); 661 stop("Fast Lock ZF != 0"); 662 bind(zf_bad_zero); 663 stop("Fast Lock ZF != 1"); 664 bind(zf_correct); 665 #endif 666 // C2 uses the value of ZF to determine the continuation. 667 } 668 669 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 670 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 671 assert(reg_rax == rax, "Used for CAS"); 672 assert_different_registers(obj, reg_rax, t); 673 674 // Handle inflated monitor. 675 Label inflated, inflated_check_lock_stack; 676 // Finish fast unlock successfully. MUST jump with ZF == 1 677 Label unlocked, slow_path; 678 679 const Register mark = t; 680 const Register monitor = t; 681 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 682 const Register box = reg_rax; 683 684 Label dummy; 685 C2FastUnlockLightweightStub* stub = nullptr; 686 687 if (!Compile::current()->output()->in_scratch_emit_size()) { 688 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 689 Compile::current()->output()->add_stub(stub); 690 } 691 692 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 693 694 { // Lightweight Unlock 695 696 // Load top. 697 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 698 699 if (!UseObjectMonitorTable) { 700 // Prefetch mark. 701 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 702 } 703 704 // Check if obj is top of lock-stack. 705 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 706 // Top of lock stack was not obj. Must be monitor. 707 jcc(Assembler::notEqual, inflated_check_lock_stack); 708 709 // Pop lock-stack. 710 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 711 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 712 713 // Check if recursive. 714 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 715 jcc(Assembler::equal, unlocked); 716 717 // We elide the monitor check, let the CAS fail instead. 718 719 if (UseObjectMonitorTable) { 720 // Load mark. 721 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 722 } 723 724 // Try to unlock. Transition lock bits 0b00 => 0b01 725 movptr(reg_rax, mark); 726 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 727 orptr(mark, markWord::unlocked_value); 728 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 729 jcc(Assembler::notEqual, push_and_slow_path); 730 jmp(unlocked); 731 } 732 733 734 { // Handle inflated monitor. 735 bind(inflated_check_lock_stack); 736 #ifdef ASSERT 737 Label check_done; 738 subl(top, oopSize); 739 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 740 jcc(Assembler::below, check_done); 741 cmpptr(obj, Address(thread, top)); 742 jccb(Assembler::notEqual, inflated_check_lock_stack); 743 stop("Fast Unlock lock on stack"); 744 bind(check_done); 745 if (UseObjectMonitorTable) { 746 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 747 } 748 testptr(mark, markWord::monitor_value); 749 jccb(Assembler::notZero, inflated); 750 stop("Fast Unlock not monitor"); 751 #endif 752 753 bind(inflated); 754 755 #ifndef _LP64 756 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 757 orl(t, 1); // set ICC.ZF=0 to indicate failure 758 jmpb(slow_path); 759 #else 760 if (!UseObjectMonitorTable) { 761 assert(mark == monitor, "should be the same here"); 762 } else { 763 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 764 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 765 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 766 cmpptr(monitor, alignof(ObjectMonitor*)); 767 jcc(Assembler::below, slow_path); 768 } 769 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 
0 : checked_cast<int>(markWord::monitor_value)); 770 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 771 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 772 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 773 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 774 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 775 776 Label recursive; 777 778 // Check if recursive. 779 cmpptr(recursions_address, 0); 780 jccb(Assembler::notZero, recursive); 781 782 // Set owner to null. 783 // Release to satisfy the JMM 784 movptr(owner_address, NULL_WORD); 785 // We need a full fence after clearing owner to avoid stranding. 786 // StoreLoad achieves this. 787 membar(StoreLoad); 788 789 // Check if the entry lists are empty (EntryList first - by convention). 790 movptr(reg_rax, EntryList_address); 791 orptr(reg_rax, cxq_address); 792 jccb(Assembler::zero, unlocked); // If so we are done. 793 794 // Check if there is a successor. 795 cmpptr(succ_address, NULL_WORD); 796 jccb(Assembler::notZero, unlocked); // If so we are done. 797 798 // Save the monitor pointer in the current thread, so we can try to 799 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 800 if (!UseObjectMonitorTable) { 801 andptr(monitor, ~(int32_t)markWord::monitor_value); 802 } 803 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 804 805 orl(t, 1); // Fast Unlock ZF = 0 806 jmpb(slow_path); 807 808 // Recursive unlock. 809 bind(recursive); 810 decrement(recursions_address); 811 #endif // _LP64 812 } 813 814 bind(unlocked); 815 xorl(t, t); // Fast Unlock ZF = 1 816 817 #ifdef ASSERT 818 // Check that unlocked label is reached with ZF set. 819 Label zf_correct; 820 Label zf_bad_zero; 821 jcc(Assembler::zero, zf_correct); 822 jmp(zf_bad_zero); 823 #endif 824 825 bind(slow_path); 826 if (stub != nullptr) { 827 bind(stub->slow_path_continuation()); 828 } 829 #ifdef ASSERT 830 // Check that stub->continuation() label is reached with ZF not set. 831 jcc(Assembler::notZero, zf_correct); 832 stop("Fast Unlock ZF != 0"); 833 bind(zf_bad_zero); 834 stop("Fast Unlock ZF != 1"); 835 bind(zf_correct); 836 #endif 837 // C2 uses the value of ZF to determine the continuation. 
838 } 839 840 //------------------------------------------------------------------------------------------- 841 // Generic instructions support for use in .ad files C2 code generation 842 843 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 844 if (dst != src) { 845 movdqu(dst, src); 846 } 847 if (opcode == Op_AbsVD) { 848 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 849 } else { 850 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 851 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 852 } 853 } 854 855 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 856 if (opcode == Op_AbsVD) { 857 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 858 } else { 859 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 860 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 861 } 862 } 863 864 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 865 if (dst != src) { 866 movdqu(dst, src); 867 } 868 if (opcode == Op_AbsVF) { 869 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 870 } else { 871 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 872 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 873 } 874 } 875 876 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 877 if (opcode == Op_AbsVF) { 878 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 879 } else { 880 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 881 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 882 } 883 } 884 885 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 886 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 887 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 888 889 if (opcode == Op_MinV) { 890 if (elem_bt == T_BYTE) { 891 pminsb(dst, src); 892 } else if (elem_bt == T_SHORT) { 893 pminsw(dst, src); 894 } else if (elem_bt == T_INT) { 895 pminsd(dst, src); 896 } else { 897 assert(elem_bt == T_LONG, "required"); 898 assert(tmp == xmm0, "required"); 899 assert_different_registers(dst, src, tmp); 900 movdqu(xmm0, dst); 901 pcmpgtq(xmm0, src); 902 blendvpd(dst, src); // xmm0 as mask 903 } 904 } else { // opcode == Op_MaxV 905 if (elem_bt == T_BYTE) { 906 pmaxsb(dst, src); 907 } else if (elem_bt == T_SHORT) { 908 pmaxsw(dst, src); 909 } else if (elem_bt == T_INT) { 910 pmaxsd(dst, src); 911 } else { 912 assert(elem_bt == T_LONG, "required"); 913 assert(tmp == xmm0, "required"); 914 assert_different_registers(dst, src, tmp); 915 movdqu(xmm0, src); 916 pcmpgtq(xmm0, dst); 917 blendvpd(dst, src); // xmm0 as mask 918 } 919 } 920 } 921 922 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 923 XMMRegister src1, Address src2, int vlen_enc) { 924 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 925 if (opcode == Op_UMinV) { 926 switch(elem_bt) { 927 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 928 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 929 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 930 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 931 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 932 } 933 } else { 934 assert(opcode == Op_UMaxV, "required"); 935 switch(elem_bt) { 936 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 937 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 938 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 939 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 940 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 941 } 942 } 943 } 944 945 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 946 // For optimality, leverage a full vector width of 512 bits 947 // for operations over smaller vector sizes on AVX512 targets. 948 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 949 if (opcode == Op_UMaxV) { 950 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 951 } else { 952 assert(opcode == Op_UMinV, "required"); 953 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 954 } 955 } else { 956 // T1 = -1 957 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 958 // T1 = -1 << 63 959 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 960 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 961 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 962 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 963 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 964 // Mask = T2 > T1 965 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 966 if (opcode == Op_UMaxV) { 967 // Res = Mask ? Src2 : Src1 968 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 969 } else { 970 // Res = Mask ? Src1 : Src2 971 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 972 } 973 } 974 } 975 976 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 977 XMMRegister src1, XMMRegister src2, int vlen_enc) { 978 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 979 if (opcode == Op_UMinV) { 980 switch(elem_bt) { 981 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 982 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 983 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 984 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 985 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 986 } 987 } else { 988 assert(opcode == Op_UMaxV, "required"); 989 switch(elem_bt) { 990 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 991 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 992 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 993 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 994 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 995 } 996 } 997 } 998 999 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1000 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1001 int vlen_enc) { 1002 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1003 1004 if (opcode == Op_MinV) { 1005 if (elem_bt == T_BYTE) { 1006 vpminsb(dst, src1, src2, vlen_enc); 1007 } else if (elem_bt == T_SHORT) { 1008 vpminsw(dst, src1, src2, vlen_enc); 1009 } else if (elem_bt == T_INT) { 1010 vpminsd(dst, src1, src2, vlen_enc); 1011 } else { 1012 assert(elem_bt == T_LONG, "required"); 1013 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1014 vpminsq(dst, src1, src2, vlen_enc); 1015 } else { 1016 assert_different_registers(dst, src1, src2); 1017 vpcmpgtq(dst, src1, src2, vlen_enc); 1018 vblendvpd(dst, src1, src2, dst, vlen_enc); 1019 } 1020 } 1021 } else 
{ // opcode == Op_MaxV 1022 if (elem_bt == T_BYTE) { 1023 vpmaxsb(dst, src1, src2, vlen_enc); 1024 } else if (elem_bt == T_SHORT) { 1025 vpmaxsw(dst, src1, src2, vlen_enc); 1026 } else if (elem_bt == T_INT) { 1027 vpmaxsd(dst, src1, src2, vlen_enc); 1028 } else { 1029 assert(elem_bt == T_LONG, "required"); 1030 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1031 vpmaxsq(dst, src1, src2, vlen_enc); 1032 } else { 1033 assert_different_registers(dst, src1, src2); 1034 vpcmpgtq(dst, src1, src2, vlen_enc); 1035 vblendvpd(dst, src2, src1, dst, vlen_enc); 1036 } 1037 } 1038 } 1039 } 1040 1041 // Float/Double min max 1042 1043 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1044 XMMRegister dst, XMMRegister a, XMMRegister b, 1045 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1046 int vlen_enc) { 1047 assert(UseAVX > 0, "required"); 1048 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1049 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1050 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1051 assert_different_registers(a, tmp, atmp, btmp); 1052 assert_different_registers(b, tmp, atmp, btmp); 1053 1054 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1055 bool is_double_word = is_double_word_type(elem_bt); 1056 1057 /* Note on 'non-obvious' assembly sequence: 1058 * 1059 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1060 * and Java on how they handle floats: 1061 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1062 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1063 * 1064 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1065 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1066 * (only useful when signs differ, noop otherwise) 1067 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1068 1069 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1070 * btmp = (b < +0.0) ? a : b 1071 * atmp = (b < +0.0) ? b : a 1072 * Tmp = Max_Float(atmp , btmp) 1073 * Res = (atmp == NaN) ? 
atmp : Tmp 1074 */ 1075 1076 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1077 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1078 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1079 XMMRegister mask; 1080 1081 if (!is_double_word && is_min) { 1082 mask = a; 1083 vblend = &MacroAssembler::vblendvps; 1084 vmaxmin = &MacroAssembler::vminps; 1085 vcmp = &MacroAssembler::vcmpps; 1086 } else if (!is_double_word && !is_min) { 1087 mask = b; 1088 vblend = &MacroAssembler::vblendvps; 1089 vmaxmin = &MacroAssembler::vmaxps; 1090 vcmp = &MacroAssembler::vcmpps; 1091 } else if (is_double_word && is_min) { 1092 mask = a; 1093 vblend = &MacroAssembler::vblendvpd; 1094 vmaxmin = &MacroAssembler::vminpd; 1095 vcmp = &MacroAssembler::vcmppd; 1096 } else { 1097 assert(is_double_word && !is_min, "sanity"); 1098 mask = b; 1099 vblend = &MacroAssembler::vblendvpd; 1100 vmaxmin = &MacroAssembler::vmaxpd; 1101 vcmp = &MacroAssembler::vcmppd; 1102 } 1103 1104 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1105 XMMRegister maxmin, scratch; 1106 if (dst == btmp) { 1107 maxmin = btmp; 1108 scratch = tmp; 1109 } else { 1110 maxmin = tmp; 1111 scratch = btmp; 1112 } 1113 1114 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1115 if (precompute_mask && !is_double_word) { 1116 vpsrad(tmp, mask, 32, vlen_enc); 1117 mask = tmp; 1118 } else if (precompute_mask && is_double_word) { 1119 vpxor(tmp, tmp, tmp, vlen_enc); 1120 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1121 mask = tmp; 1122 } 1123 1124 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1125 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1126 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1127 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1128 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1129 } 1130 1131 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1132 XMMRegister dst, XMMRegister a, XMMRegister b, 1133 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1134 int vlen_enc) { 1135 assert(UseAVX > 2, "required"); 1136 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1137 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1138 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1139 assert_different_registers(dst, a, atmp, btmp); 1140 assert_different_registers(dst, b, atmp, btmp); 1141 1142 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1143 bool is_double_word = is_double_word_type(elem_bt); 1144 bool merge = true; 1145 1146 if (!is_double_word && is_min) { 1147 evpmovd2m(ktmp, a, vlen_enc); 1148 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1149 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1150 vminps(dst, atmp, btmp, vlen_enc); 1151 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1152 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1153 } else if (!is_double_word && !is_min) { 1154 evpmovd2m(ktmp, b, vlen_enc); 1155 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1156 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1157 vmaxps(dst, atmp, btmp, vlen_enc); 1158 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1159 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1160 } else if (is_double_word && is_min) { 1161 evpmovq2m(ktmp, a, vlen_enc); 1162 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1163 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1164 
vminpd(dst, atmp, btmp, vlen_enc); 1165 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1166 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1167 } else { 1168 assert(is_double_word && !is_min, "sanity"); 1169 evpmovq2m(ktmp, b, vlen_enc); 1170 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1171 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1172 vmaxpd(dst, atmp, btmp, vlen_enc); 1173 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1174 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1175 } 1176 } 1177 1178 // Float/Double signum 1179 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1180 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1181 1182 Label DONE_LABEL; 1183 1184 if (opcode == Op_SignumF) { 1185 assert(UseSSE > 0, "required"); 1186 ucomiss(dst, zero); 1187 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1188 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1189 movflt(dst, one); 1190 jcc(Assembler::above, DONE_LABEL); 1191 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1192 } else if (opcode == Op_SignumD) { 1193 assert(UseSSE > 1, "required"); 1194 ucomisd(dst, zero); 1195 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1196 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1197 movdbl(dst, one); 1198 jcc(Assembler::above, DONE_LABEL); 1199 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1200 } 1201 1202 bind(DONE_LABEL); 1203 } 1204 1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1206 if (sign) { 1207 pmovsxbw(dst, src); 1208 } else { 1209 pmovzxbw(dst, src); 1210 } 1211 } 1212 1213 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1214 if (sign) { 1215 vpmovsxbw(dst, src, vector_len); 1216 } else { 1217 vpmovzxbw(dst, src, vector_len); 1218 } 1219 } 1220 1221 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1222 if (sign) { 1223 vpmovsxbd(dst, src, vector_len); 1224 } else { 1225 vpmovzxbd(dst, src, vector_len); 1226 } 1227 } 1228 1229 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1230 if (sign) { 1231 vpmovsxwd(dst, src, vector_len); 1232 } else { 1233 vpmovzxwd(dst, src, vector_len); 1234 } 1235 } 1236 1237 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1238 int shift, int vector_len) { 1239 if (opcode == Op_RotateLeftV) { 1240 if (etype == T_INT) { 1241 evprold(dst, src, shift, vector_len); 1242 } else { 1243 assert(etype == T_LONG, "expected type T_LONG"); 1244 evprolq(dst, src, shift, vector_len); 1245 } 1246 } else { 1247 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1248 if (etype == T_INT) { 1249 evprord(dst, src, shift, vector_len); 1250 } else { 1251 assert(etype == T_LONG, "expected type T_LONG"); 1252 evprorq(dst, src, shift, vector_len); 1253 } 1254 } 1255 } 1256 1257 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1258 XMMRegister shift, int vector_len) { 1259 if (opcode == Op_RotateLeftV) { 1260 if (etype == T_INT) { 1261 evprolvd(dst, src, shift, vector_len); 1262 } else { 1263 assert(etype == 
T_LONG, "expected type T_LONG"); 1264 evprolvq(dst, src, shift, vector_len); 1265 } 1266 } else { 1267 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1268 if (etype == T_INT) { 1269 evprorvd(dst, src, shift, vector_len); 1270 } else { 1271 assert(etype == T_LONG, "expected type T_LONG"); 1272 evprorvq(dst, src, shift, vector_len); 1273 } 1274 } 1275 } 1276 1277 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1278 if (opcode == Op_RShiftVI) { 1279 psrad(dst, shift); 1280 } else if (opcode == Op_LShiftVI) { 1281 pslld(dst, shift); 1282 } else { 1283 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1284 psrld(dst, shift); 1285 } 1286 } 1287 1288 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1289 switch (opcode) { 1290 case Op_RShiftVI: psrad(dst, shift); break; 1291 case Op_LShiftVI: pslld(dst, shift); break; 1292 case Op_URShiftVI: psrld(dst, shift); break; 1293 1294 default: assert(false, "%s", NodeClassNames[opcode]); 1295 } 1296 } 1297 1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1299 if (opcode == Op_RShiftVI) { 1300 vpsrad(dst, nds, shift, vector_len); 1301 } else if (opcode == Op_LShiftVI) { 1302 vpslld(dst, nds, shift, vector_len); 1303 } else { 1304 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1305 vpsrld(dst, nds, shift, vector_len); 1306 } 1307 } 1308 1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1310 switch (opcode) { 1311 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1312 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1313 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1314 1315 default: assert(false, "%s", NodeClassNames[opcode]); 1316 } 1317 } 1318 1319 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1320 switch (opcode) { 1321 case Op_RShiftVB: // fall-through 1322 case Op_RShiftVS: psraw(dst, shift); break; 1323 1324 case Op_LShiftVB: // fall-through 1325 case Op_LShiftVS: psllw(dst, shift); break; 1326 1327 case Op_URShiftVS: // fall-through 1328 case Op_URShiftVB: psrlw(dst, shift); break; 1329 1330 default: assert(false, "%s", NodeClassNames[opcode]); 1331 } 1332 } 1333 1334 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1335 switch (opcode) { 1336 case Op_RShiftVB: // fall-through 1337 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1338 1339 case Op_LShiftVB: // fall-through 1340 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1341 1342 case Op_URShiftVS: // fall-through 1343 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1344 1345 default: assert(false, "%s", NodeClassNames[opcode]); 1346 } 1347 } 1348 1349 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1350 switch (opcode) { 1351 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1352 case Op_LShiftVL: psllq(dst, shift); break; 1353 case Op_URShiftVL: psrlq(dst, shift); break; 1354 1355 default: assert(false, "%s", NodeClassNames[opcode]); 1356 } 1357 } 1358 1359 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1360 if (opcode == Op_RShiftVL) { 1361 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1362 } else if (opcode == Op_LShiftVL) { 1363 
psllq(dst, shift); 1364 } else { 1365 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1366 psrlq(dst, shift); 1367 } 1368 } 1369 1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1371 switch (opcode) { 1372 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1373 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1374 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1375 1376 default: assert(false, "%s", NodeClassNames[opcode]); 1377 } 1378 } 1379 1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1381 if (opcode == Op_RShiftVL) { 1382 evpsraq(dst, nds, shift, vector_len); 1383 } else if (opcode == Op_LShiftVL) { 1384 vpsllq(dst, nds, shift, vector_len); 1385 } else { 1386 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1387 vpsrlq(dst, nds, shift, vector_len); 1388 } 1389 } 1390 1391 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1392 switch (opcode) { 1393 case Op_RShiftVB: // fall-through 1394 case Op_RShiftVS: // fall-through 1395 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1396 1397 case Op_LShiftVB: // fall-through 1398 case Op_LShiftVS: // fall-through 1399 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1400 1401 case Op_URShiftVB: // fall-through 1402 case Op_URShiftVS: // fall-through 1403 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1404 1405 default: assert(false, "%s", NodeClassNames[opcode]); 1406 } 1407 } 1408 1409 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1410 switch (opcode) { 1411 case Op_RShiftVB: // fall-through 1412 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1413 1414 case Op_LShiftVB: // fall-through 1415 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1416 1417 case Op_URShiftVB: // fall-through 1418 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1419 1420 default: assert(false, "%s", NodeClassNames[opcode]); 1421 } 1422 } 1423 1424 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1425 assert(UseAVX >= 2, "required"); 1426 switch (opcode) { 1427 case Op_RShiftVL: { 1428 if (UseAVX > 2) { 1429 assert(tmp == xnoreg, "not used"); 1430 if (!VM_Version::supports_avx512vl()) { 1431 vlen_enc = Assembler::AVX_512bit; 1432 } 1433 evpsravq(dst, src, shift, vlen_enc); 1434 } else { 1435 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1436 vpsrlvq(dst, src, shift, vlen_enc); 1437 vpsrlvq(tmp, tmp, shift, vlen_enc); 1438 vpxor(dst, dst, tmp, vlen_enc); 1439 vpsubq(dst, dst, tmp, vlen_enc); 1440 } 1441 break; 1442 } 1443 case Op_LShiftVL: { 1444 assert(tmp == xnoreg, "not used"); 1445 vpsllvq(dst, src, shift, vlen_enc); 1446 break; 1447 } 1448 case Op_URShiftVL: { 1449 assert(tmp == xnoreg, "not used"); 1450 vpsrlvq(dst, src, shift, vlen_enc); 1451 break; 1452 } 1453 default: assert(false, "%s", NodeClassNames[opcode]); 1454 } 1455 } 1456 1457 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1458 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1459 assert(opcode == Op_LShiftVB || 1460 opcode == Op_RShiftVB || 1461 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1462 bool sign = (opcode != Op_URShiftVB); 1463 assert(vector_len == 0, "required"); 1464 vextendbd(sign, dst, src, 1); 1465 vpmovzxbd(vtmp, shift, 1); 1466 varshiftd(opcode, dst, dst, vtmp, 1); 1467 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1468 vextracti128_high(vtmp, dst); 1469 vpackusdw(dst, dst, vtmp, 0); 1470 } 1471 1472 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1473 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1474 assert(opcode == Op_LShiftVB || 1475 opcode == Op_RShiftVB || 1476 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1477 bool sign = (opcode != Op_URShiftVB); 1478 int ext_vector_len = vector_len + 1; 1479 vextendbw(sign, dst, src, ext_vector_len); 1480 vpmovzxbw(vtmp, shift, ext_vector_len); 1481 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1482 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1483 if (vector_len == 0) { 1484 vextracti128_high(vtmp, dst); 1485 vpackuswb(dst, dst, vtmp, vector_len); 1486 } else { 1487 vextracti64x4_high(vtmp, dst); 1488 vpackuswb(dst, dst, vtmp, vector_len); 1489 vpermq(dst, dst, 0xD8, vector_len); 1490 } 1491 } 1492 1493 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1494 switch(typ) { 1495 case T_BYTE: 1496 pinsrb(dst, val, idx); 1497 break; 1498 case T_SHORT: 1499 pinsrw(dst, val, idx); 1500 break; 1501 case T_INT: 1502 pinsrd(dst, val, idx); 1503 break; 1504 case T_LONG: 1505 pinsrq(dst, val, idx); 1506 break; 1507 default: 1508 assert(false,"Should not reach here."); 1509 break; 1510 } 1511 } 1512 1513 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1514 switch(typ) { 1515 case T_BYTE: 1516 vpinsrb(dst, src, val, idx); 1517 break; 1518 case T_SHORT: 1519 vpinsrw(dst, src, val, idx); 1520 break; 1521 case T_INT: 1522 vpinsrd(dst, src, val, idx); 1523 break; 1524 case T_LONG: 1525 vpinsrq(dst, src, val, idx); 1526 break; 1527 default: 1528 assert(false,"Should not reach here."); 1529 break; 1530 } 1531 } 1532 1533 #ifdef _LP64 1534 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1535 XMMRegister dst, Register base, 1536 Register idx_base, 1537 Register offset, Register mask, 1538 Register mask_idx, Register rtmp, 1539 int vlen_enc) { 1540 vpxor(dst, dst, dst, vlen_enc); 1541 if (elem_bt == T_SHORT) { 1542 for (int i = 0; i < 4; i++) { 1543 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1544 Label skip_load; 1545 btq(mask, mask_idx); 1546 jccb(Assembler::carryClear, skip_load); 1547 movl(rtmp, Address(idx_base, i * 4)); 1548 if (offset != noreg) { 1549 addl(rtmp, offset); 1550 } 1551 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1552 bind(skip_load); 1553 incq(mask_idx); 1554 } 1555 } else { 1556 assert(elem_bt == T_BYTE, ""); 1557 for (int i = 0; i < 8; i++) { 1558 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1559 Label skip_load; 1560 btq(mask, mask_idx); 1561 jccb(Assembler::carryClear, skip_load); 1562 movl(rtmp, Address(idx_base, i * 4)); 1563 if (offset != noreg) { 1564 addl(rtmp, offset); 1565 } 1566 pinsrb(dst, Address(base, rtmp), i); 1567 bind(skip_load); 1568 incq(mask_idx); 1569 } 1570 } 1571 } 1572 #endif // _LP64 1573 1574 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1575 Register base, Register idx_base, 1576 Register offset, Register rtmp, 1577 int vlen_enc) { 1578 vpxor(dst, dst, dst, vlen_enc); 1579 if (elem_bt == T_SHORT) { 1580 for (int i = 0; i < 4; i++) { 1581 // dst[i] = src[offset + idx_base[i]] 1582 movl(rtmp, Address(idx_base, i * 4)); 1583 if (offset != noreg) { 1584 addl(rtmp, offset); 1585 } 1586 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1587 } 1588 } else { 1589 assert(elem_bt == T_BYTE, ""); 1590 for (int i = 0; i < 8; i++) { 1591 // dst[i] = src[offset + idx_base[i]] 1592 movl(rtmp, Address(idx_base, i * 4)); 1593 if (offset != noreg) { 1594 addl(rtmp, offset); 1595 } 1596 pinsrb(dst, Address(base, rtmp), i); 1597 } 1598 } 1599 } 1600 1601 /* 1602 * Gather using a hybrid algorithm: first partially unroll the scalar loop 1603 * to accumulate values from the gather indices into a quad-word (64-bit) slice. 1604 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1605 * permutation to place the slice into the appropriate vector lane 1606 * locations in the destination vector. The following pseudo code describes the 1607 * algorithm in detail: 1608 * 1609 * DST_VEC = ZERO_VEC 1610 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1611 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1612 * FOREACH_ITER: 1613 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1614 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1615 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1616 * PERM_INDEX = PERM_INDEX - TWO_VEC 1617 * 1618 * With each iteration, the doubleword permute indices (0,1) corresponding 1619 * to the gathered quadword get right-shifted by two lane positions. 1620 * 1621 */ 1622 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1623 Register base, Register idx_base, 1624 Register offset, Register mask, 1625 XMMRegister xtmp1, XMMRegister xtmp2, 1626 XMMRegister temp_dst, Register rtmp, 1627 Register mask_idx, Register length, 1628 int vector_len, int vlen_enc) { 1629 Label GATHER8_LOOP; 1630 assert(is_subword_type(elem_ty), ""); 1631 movl(length, vector_len); 1632 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1633 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1634 vallones(xtmp2, vlen_enc); 1635 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1636 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1637 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1638 1639 bind(GATHER8_LOOP); 1640 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1641 if (mask == noreg) { 1642 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1643 } else { 1644 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1645 } 1646 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1647 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ?
vlen_enc : Assembler::AVX_256bit); 1648 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1649 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1650 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1651 vpor(dst, dst, temp_dst, vlen_enc); 1652 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1653 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1654 jcc(Assembler::notEqual, GATHER8_LOOP); 1655 } 1656 1657 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1658 switch(typ) { 1659 case T_INT: 1660 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1661 break; 1662 case T_FLOAT: 1663 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1664 break; 1665 case T_LONG: 1666 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1667 break; 1668 case T_DOUBLE: 1669 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1670 break; 1671 default: 1672 assert(false,"Should not reach here."); 1673 break; 1674 } 1675 } 1676 1677 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1678 switch(typ) { 1679 case T_INT: 1680 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1681 break; 1682 case T_FLOAT: 1683 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1684 break; 1685 case T_LONG: 1686 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1687 break; 1688 case T_DOUBLE: 1689 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1690 break; 1691 default: 1692 assert(false,"Should not reach here."); 1693 break; 1694 } 1695 } 1696 1697 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1698 switch(typ) { 1699 case T_INT: 1700 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1701 break; 1702 case T_FLOAT: 1703 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1704 break; 1705 case T_LONG: 1706 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1707 break; 1708 case T_DOUBLE: 1709 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1710 break; 1711 default: 1712 assert(false,"Should not reach here."); 1713 break; 1714 } 1715 } 1716 1717 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1718 if (vlen_in_bytes <= 16) { 1719 pxor (dst, dst); 1720 psubb(dst, src); 1721 switch (elem_bt) { 1722 case T_BYTE: /* nothing to do */ break; 1723 case T_SHORT: pmovsxbw(dst, dst); break; 1724 case T_INT: pmovsxbd(dst, dst); break; 1725 case T_FLOAT: pmovsxbd(dst, dst); break; 1726 case T_LONG: pmovsxbq(dst, dst); break; 1727 case T_DOUBLE: pmovsxbq(dst, dst); break; 1728 1729 default: assert(false, "%s", type2name(elem_bt)); 1730 } 1731 } else { 1732 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1733 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1734 1735 vpxor (dst, dst, dst, vlen_enc); 1736 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1737 1738 switch (elem_bt) { 1739 case T_BYTE: /* nothing to do */ break; 1740 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1741 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1742 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1743 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1744 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1745 1746 default: assert(false, "%s", type2name(elem_bt)); 1747 } 1748 } 1749 } 1750 1751 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1752 if (novlbwdq) { 1753 vpmovsxbd(xtmp, src, vlen_enc); 1754 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1755 Assembler::eq, true, vlen_enc, noreg); 1756 } else { 1757 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1758 vpsubb(xtmp, xtmp, src, vlen_enc); 1759 evpmovb2m(dst, xtmp, vlen_enc); 1760 } 1761 } 1762 1763 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1764 if (is_integral_type(bt)) { 1765 switch (vlen_in_bytes) { 1766 case 4: movdl(dst, src); break; 1767 case 8: movq(dst, src); break; 1768 case 16: movdqu(dst, src); break; 1769 case 32: vmovdqu(dst, src); break; 1770 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1771 default: ShouldNotReachHere(); 1772 } 1773 } else { 1774 switch (vlen_in_bytes) { 1775 case 4: movflt(dst, src); break; 1776 case 8: movdbl(dst, src); break; 1777 case 16: movups(dst, src); break; 1778 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1779 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1780 default: ShouldNotReachHere(); 1781 } 1782 } 1783 } 1784 1785 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1786 assert(rscratch != noreg || always_reachable(src), "missing"); 1787 1788 if (reachable(src)) { 1789 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1790 } else { 1791 lea(rscratch, src); 1792 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1793 } 1794 } 1795 1796 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1797 int vlen_enc = vector_length_encoding(vlen); 1798 if (VM_Version::supports_avx()) { 1799 if (bt == T_LONG) { 1800 if (VM_Version::supports_avx2()) { 1801 vpbroadcastq(dst, src, vlen_enc); 1802 } else { 1803 vmovddup(dst, src, vlen_enc); 1804 } 1805 } else if (bt == T_DOUBLE) { 1806 if (vlen_enc != Assembler::AVX_128bit) { 1807 vbroadcastsd(dst, src, vlen_enc, noreg); 1808 } else { 1809 vmovddup(dst, src, vlen_enc); 1810 } 1811 } else { 1812 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1813 vpbroadcastd(dst, src, vlen_enc); 1814 } else { 1815 vbroadcastss(dst, src, vlen_enc); 1816 } 1817 } 1818 } else if (VM_Version::supports_sse3()) { 1819 movddup(dst, src); 1820 } else { 1821 load_vector(bt, dst, src, vlen); 1822 } 1823 } 1824 1825 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1826 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1827 int offset = exact_log2(type2aelembytes(bt)) << 6; 1828 if (is_floating_point_type(bt)) { 1829 offset += 128; 1830 } 1831 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1832 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1833 } 1834 1835 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
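// A rough scalar model of the lowering strategy used by the integral reducers below
// (reduceB/S/I/L). Illustrative only: the emitted code picks per-width shuffle/extract
// sequences, and the AddReductionVI paths use horizontal adds (phaddd/phaddw) instead
// of explicit shuffles where profitable.
//
//   acc[0..N-1] = lanes of src2
//   while (N > 1) {
//     for (i = 0; i < N/2; i++) acc[i] = OP(acc[i], acc[i + N/2]);  // fold upper half into lower half
//     N = N / 2;
//   }
//   dst = OP(acc[0], src1)  // finally combine the surviving lane with the scalar input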
1836 1837 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1838 int vector_len = Assembler::AVX_128bit; 1839 1840 switch (opcode) { 1841 case Op_AndReductionV: pand(dst, src); break; 1842 case Op_OrReductionV: por (dst, src); break; 1843 case Op_XorReductionV: pxor(dst, src); break; 1844 case Op_MinReductionV: 1845 switch (typ) { 1846 case T_BYTE: pminsb(dst, src); break; 1847 case T_SHORT: pminsw(dst, src); break; 1848 case T_INT: pminsd(dst, src); break; 1849 case T_LONG: assert(UseAVX > 2, "required"); 1850 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1851 default: assert(false, "wrong type"); 1852 } 1853 break; 1854 case Op_MaxReductionV: 1855 switch (typ) { 1856 case T_BYTE: pmaxsb(dst, src); break; 1857 case T_SHORT: pmaxsw(dst, src); break; 1858 case T_INT: pmaxsd(dst, src); break; 1859 case T_LONG: assert(UseAVX > 2, "required"); 1860 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1861 default: assert(false, "wrong type"); 1862 } 1863 break; 1864 case Op_AddReductionVF: addss(dst, src); break; 1865 case Op_AddReductionVD: addsd(dst, src); break; 1866 case Op_AddReductionVI: 1867 switch (typ) { 1868 case T_BYTE: paddb(dst, src); break; 1869 case T_SHORT: paddw(dst, src); break; 1870 case T_INT: paddd(dst, src); break; 1871 default: assert(false, "wrong type"); 1872 } 1873 break; 1874 case Op_AddReductionVL: paddq(dst, src); break; 1875 case Op_MulReductionVF: mulss(dst, src); break; 1876 case Op_MulReductionVD: mulsd(dst, src); break; 1877 case Op_MulReductionVI: 1878 switch (typ) { 1879 case T_SHORT: pmullw(dst, src); break; 1880 case T_INT: pmulld(dst, src); break; 1881 default: assert(false, "wrong type"); 1882 } 1883 break; 1884 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1885 evpmullq(dst, dst, src, vector_len); break; 1886 default: assert(false, "wrong opcode"); 1887 } 1888 } 1889 1890 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1891 switch (opcode) { 1892 case Op_AddReductionVF: addps(dst, src); break; 1893 case Op_AddReductionVD: addpd(dst, src); break; 1894 case Op_MulReductionVF: mulps(dst, src); break; 1895 case Op_MulReductionVD: mulpd(dst, src); break; 1896 default: assert(false, "%s", NodeClassNames[opcode]); 1897 } 1898 } 1899 1900 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1901 int vector_len = Assembler::AVX_256bit; 1902 1903 switch (opcode) { 1904 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1905 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1906 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1907 case Op_MinReductionV: 1908 switch (typ) { 1909 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1910 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1911 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1912 case T_LONG: assert(UseAVX > 2, "required"); 1913 vpminsq(dst, src1, src2, vector_len); break; 1914 default: assert(false, "wrong type"); 1915 } 1916 break; 1917 case Op_MaxReductionV: 1918 switch (typ) { 1919 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1920 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1921 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1922 case T_LONG: assert(UseAVX > 2, "required"); 1923 vpmaxsq(dst, src1, src2, vector_len); break; 1924 default: assert(false, "wrong type"); 1925 } 
1926 break; 1927 case Op_AddReductionVI: 1928 switch (typ) { 1929 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1930 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1931 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1932 default: assert(false, "wrong type"); 1933 } 1934 break; 1935 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1936 case Op_MulReductionVI: 1937 switch (typ) { 1938 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1939 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1940 default: assert(false, "wrong type"); 1941 } 1942 break; 1943 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1944 default: assert(false, "wrong opcode"); 1945 } 1946 } 1947 1948 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1949 int vector_len = Assembler::AVX_256bit; 1950 1951 switch (opcode) { 1952 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1953 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1954 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1955 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1956 default: assert(false, "%s", NodeClassNames[opcode]); 1957 } 1958 } 1959 1960 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1961 XMMRegister dst, XMMRegister src, 1962 XMMRegister vtmp1, XMMRegister vtmp2) { 1963 switch (opcode) { 1964 case Op_AddReductionVF: 1965 case Op_MulReductionVF: 1966 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1967 break; 1968 1969 case Op_AddReductionVD: 1970 case Op_MulReductionVD: 1971 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1972 break; 1973 1974 default: assert(false, "wrong opcode"); 1975 } 1976 } 1977 1978 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1979 XMMRegister dst, XMMRegister src, 1980 XMMRegister vtmp1, XMMRegister vtmp2) { 1981 switch (opcode) { 1982 case Op_AddReductionVF: 1983 case Op_MulReductionVF: 1984 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1985 break; 1986 1987 case Op_AddReductionVD: 1988 case Op_MulReductionVD: 1989 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1990 break; 1991 1992 default: assert(false, "%s", NodeClassNames[opcode]); 1993 } 1994 } 1995 1996 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1997 Register dst, Register src1, XMMRegister src2, 1998 XMMRegister vtmp1, XMMRegister vtmp2) { 1999 switch (vlen) { 2000 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2001 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2002 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2003 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2004 2005 default: assert(false, "wrong vector length"); 2006 } 2007 } 2008 2009 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2010 Register dst, Register src1, XMMRegister src2, 2011 XMMRegister vtmp1, XMMRegister vtmp2) { 2012 switch (vlen) { 2013 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2014 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2015 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2016 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2017 2018 default: assert(false, "wrong vector length"); 2019 } 2020 } 2021 2022 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2023 Register dst, Register src1, XMMRegister src2, 
2024 XMMRegister vtmp1, XMMRegister vtmp2) { 2025 switch (vlen) { 2026 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2027 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2028 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2029 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2030 2031 default: assert(false, "wrong vector length"); 2032 } 2033 } 2034 2035 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2036 Register dst, Register src1, XMMRegister src2, 2037 XMMRegister vtmp1, XMMRegister vtmp2) { 2038 switch (vlen) { 2039 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2040 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2041 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2042 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2043 2044 default: assert(false, "wrong vector length"); 2045 } 2046 } 2047 2048 #ifdef _LP64 2049 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2050 Register dst, Register src1, XMMRegister src2, 2051 XMMRegister vtmp1, XMMRegister vtmp2) { 2052 switch (vlen) { 2053 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2054 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2055 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2056 2057 default: assert(false, "wrong vector length"); 2058 } 2059 } 2060 #endif // _LP64 2061 2062 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2063 switch (vlen) { 2064 case 2: 2065 assert(vtmp2 == xnoreg, ""); 2066 reduce2F(opcode, dst, src, vtmp1); 2067 break; 2068 case 4: 2069 assert(vtmp2 == xnoreg, ""); 2070 reduce4F(opcode, dst, src, vtmp1); 2071 break; 2072 case 8: 2073 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2074 break; 2075 case 16: 2076 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2077 break; 2078 default: assert(false, "wrong vector length"); 2079 } 2080 } 2081 2082 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2083 switch (vlen) { 2084 case 2: 2085 assert(vtmp2 == xnoreg, ""); 2086 reduce2D(opcode, dst, src, vtmp1); 2087 break; 2088 case 4: 2089 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2090 break; 2091 case 8: 2092 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2093 break; 2094 default: assert(false, "wrong vector length"); 2095 } 2096 } 2097 2098 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 switch (vlen) { 2100 case 2: 2101 assert(vtmp1 == xnoreg, ""); 2102 assert(vtmp2 == xnoreg, ""); 2103 unorderedReduce2F(opcode, dst, src); 2104 break; 2105 case 4: 2106 assert(vtmp2 == xnoreg, ""); 2107 unorderedReduce4F(opcode, dst, src, vtmp1); 2108 break; 2109 case 8: 2110 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2111 break; 2112 case 16: 2113 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2114 break; 2115 default: assert(false, "wrong vector length"); 2116 } 2117 } 2118 2119 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2120 switch (vlen) { 2121 case 2: 2122 assert(vtmp1 == xnoreg, ""); 2123 assert(vtmp2 == xnoreg, ""); 2124 unorderedReduce2D(opcode, dst, src); 2125 break; 2126 case 4: 2127 assert(vtmp2 == xnoreg, ""); 2128 unorderedReduce4D(opcode, dst, src, vtmp1); 2129 break; 2130 case 8: 
2131 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2132 break; 2133 default: assert(false, "wrong vector length"); 2134 } 2135 } 2136 2137 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2138 if (opcode == Op_AddReductionVI) { 2139 if (vtmp1 != src2) { 2140 movdqu(vtmp1, src2); 2141 } 2142 phaddd(vtmp1, vtmp1); 2143 } else { 2144 pshufd(vtmp1, src2, 0x1); 2145 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2146 } 2147 movdl(vtmp2, src1); 2148 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2149 movdl(dst, vtmp1); 2150 } 2151 2152 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2153 if (opcode == Op_AddReductionVI) { 2154 if (vtmp1 != src2) { 2155 movdqu(vtmp1, src2); 2156 } 2157 phaddd(vtmp1, src2); 2158 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2159 } else { 2160 pshufd(vtmp2, src2, 0xE); 2161 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2162 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2163 } 2164 } 2165 2166 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2167 if (opcode == Op_AddReductionVI) { 2168 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2169 vextracti128_high(vtmp2, vtmp1); 2170 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2171 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2172 } else { 2173 vextracti128_high(vtmp1, src2); 2174 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2175 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2176 } 2177 } 2178 2179 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2180 vextracti64x4_high(vtmp2, src2); 2181 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2182 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2183 } 2184 2185 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2186 pshufd(vtmp2, src2, 0x1); 2187 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2188 movdqu(vtmp1, vtmp2); 2189 psrldq(vtmp1, 2); 2190 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2191 movdqu(vtmp2, vtmp1); 2192 psrldq(vtmp2, 1); 2193 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2194 movdl(vtmp2, src1); 2195 pmovsxbd(vtmp1, vtmp1); 2196 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2197 pextrb(dst, vtmp1, 0x0); 2198 movsbl(dst, dst); 2199 } 2200 2201 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2202 pshufd(vtmp1, src2, 0xE); 2203 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2204 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2205 } 2206 2207 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2208 vextracti128_high(vtmp2, src2); 2209 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2210 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2211 } 2212 2213 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2214 vextracti64x4_high(vtmp1, src2); 2215 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2216 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2217 } 2218 2219 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2220 pmovsxbw(vtmp2, src2); 2221 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2222 } 2223 2224 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2225 if (UseAVX > 1) { 2226 int vector_len = Assembler::AVX_256bit; 2227 vpmovsxbw(vtmp1, src2, vector_len); 2228 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2229 } else { 2230 pmovsxbw(vtmp2, src2); 2231 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2232 pshufd(vtmp2, src2, 0x1); 2233 pmovsxbw(vtmp2, src2); 2234 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2235 } 2236 } 2237 2238 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2239 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2240 int vector_len = Assembler::AVX_512bit; 2241 vpmovsxbw(vtmp1, src2, vector_len); 2242 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2243 } else { 2244 assert(UseAVX >= 2,"Should not reach here."); 2245 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2246 vextracti128_high(vtmp2, src2); 2247 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2248 } 2249 } 2250 2251 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2253 vextracti64x4_high(vtmp2, src2); 2254 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2255 } 2256 2257 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2258 if (opcode == Op_AddReductionVI) { 2259 if (vtmp1 != src2) { 2260 movdqu(vtmp1, src2); 2261 } 2262 phaddw(vtmp1, vtmp1); 2263 phaddw(vtmp1, vtmp1); 2264 } else { 2265 pshufd(vtmp2, src2, 0x1); 2266 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2267 movdqu(vtmp1, vtmp2); 2268 psrldq(vtmp1, 2); 2269 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2270 } 2271 movdl(vtmp2, src1); 2272 pmovsxwd(vtmp1, vtmp1); 2273 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2274 pextrw(dst, vtmp1, 0x0); 2275 movswl(dst, dst); 2276 } 2277 2278 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2279 if (opcode == Op_AddReductionVI) { 2280 if (vtmp1 != src2) { 2281 movdqu(vtmp1, src2); 2282 } 2283 phaddw(vtmp1, src2); 2284 } else { 2285 pshufd(vtmp1, src2, 0xE); 2286 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2287 } 2288 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2289 } 2290 2291 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2292 if (opcode == Op_AddReductionVI) { 2293 int vector_len = Assembler::AVX_256bit; 2294 vphaddw(vtmp2, src2, src2, vector_len); 2295 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2296 } else { 2297 vextracti128_high(vtmp2, src2); 2298 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2299 } 2300 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2301 } 2302 2303 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2304 int vector_len = Assembler::AVX_256bit; 2305 vextracti64x4_high(vtmp1, src2); 2306 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2307 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2308 } 2309 2310 #ifdef _LP64 2311 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2312 pshufd(vtmp2, src2, 0xE); 2313 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2314 movdq(vtmp1, src1); 2315 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2316 movdq(dst, vtmp1); 2317 } 2318 2319 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 vextracti128_high(vtmp1, src2); 2321 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2322 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2323 } 2324 2325 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2326 vextracti64x4_high(vtmp2, src2); 2327 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2328 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2329 } 2330 2331 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2332 mov64(temp, -1L); 2333 bzhiq(temp, temp, len); 2334 kmovql(dst, temp); 2335 } 2336 #endif // _LP64 2337 2338 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2339 reduce_operation_128(T_FLOAT, opcode, dst, src); 2340 pshufd(vtmp, src, 0x1); 2341 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2342 } 2343 2344 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2345 reduce2F(opcode, dst, src, vtmp); 2346 pshufd(vtmp, src, 0x2); 2347 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2348 pshufd(vtmp, src, 0x3); 2349 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2350 } 2351 2352 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2353 reduce4F(opcode, dst, src, vtmp2); 2354 vextractf128_high(vtmp2, src); 2355 reduce4F(opcode, dst, vtmp2, vtmp1); 2356 } 2357 2358 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2359 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2360 vextracti64x4_high(vtmp1, src); 2361 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2362 } 2363 2364 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2365 pshufd(dst, src, 0x1); 2366 reduce_operation_128(T_FLOAT, opcode, dst, src); 2367 } 2368 2369 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2370 pshufd(vtmp, src, 0xE); 2371 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2372 unorderedReduce2F(opcode, dst, vtmp); 2373 } 2374 2375 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2376 vextractf128_high(vtmp1, src); 2377 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2378 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2379 } 2380 2381 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2382 vextractf64x4_high(vtmp2, src); 2383 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2384 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2385 } 2386 2387 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2388 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2389 pshufd(vtmp, src, 0xE); 2390 
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2391 } 2392 2393 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2394 reduce2D(opcode, dst, src, vtmp2); 2395 vextractf128_high(vtmp2, src); 2396 reduce2D(opcode, dst, vtmp2, vtmp1); 2397 } 2398 2399 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2400 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2401 vextracti64x4_high(vtmp1, src); 2402 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2403 } 2404 2405 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2406 pshufd(dst, src, 0xE); 2407 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2408 } 2409 2410 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2411 vextractf128_high(vtmp, src); 2412 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2413 unorderedReduce2D(opcode, dst, vtmp); 2414 } 2415 2416 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2417 vextractf64x4_high(vtmp2, src); 2418 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2419 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2420 } 2421 2422 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2423 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2424 } 2425 2426 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2427 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2428 } 2429 2430 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2431 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2432 } 2433 2434 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2435 int vec_enc) { 2436 switch(elem_bt) { 2437 case T_INT: 2438 case T_FLOAT: 2439 vmaskmovps(dst, src, mask, vec_enc); 2440 break; 2441 case T_LONG: 2442 case T_DOUBLE: 2443 vmaskmovpd(dst, src, mask, vec_enc); 2444 break; 2445 default: 2446 fatal("Unsupported type %s", type2name(elem_bt)); 2447 break; 2448 } 2449 } 2450 2451 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2452 int vec_enc) { 2453 switch(elem_bt) { 2454 case T_INT: 2455 case T_FLOAT: 2456 vmaskmovps(dst, src, mask, vec_enc); 2457 break; 2458 case T_LONG: 2459 case T_DOUBLE: 2460 vmaskmovpd(dst, src, mask, vec_enc); 2461 break; 2462 default: 2463 fatal("Unsupported type %s", type2name(elem_bt)); 2464 break; 2465 } 2466 } 2467 2468 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2469 XMMRegister dst, XMMRegister src, 2470 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2471 XMMRegister xmm_0, XMMRegister xmm_1) { 2472 const int permconst[] = {1, 14}; 2473 XMMRegister wsrc = src; 2474 XMMRegister wdst = xmm_0; 2475 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2476 2477 int vlen_enc = Assembler::AVX_128bit; 2478 if (vlen == 16) { 2479 vlen_enc = Assembler::AVX_256bit; 2480 } 2481 2482 for (int i = log2(vlen) - 1; i >=0; i--) { 2483 if (i == 0 && !is_dst_valid) { 2484 wdst = dst; 2485 } 2486 if (i == 3) { 2487 vextracti64x4_high(wtmp, wsrc); 2488 } else if (i == 2) { 2489 vextracti128_high(wtmp, wsrc); 2490 } else { // i = [0,1] 2491 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2492 } 2493 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2494 wsrc = wdst; 2495 vlen_enc = Assembler::AVX_128bit; 2496 } 2497 if (is_dst_valid) { 2498 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2499 } 2500 } 2501 2502 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2503 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2504 XMMRegister xmm_0, XMMRegister xmm_1) { 2505 XMMRegister wsrc = src; 2506 XMMRegister wdst = xmm_0; 2507 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2508 int vlen_enc = Assembler::AVX_128bit; 2509 if (vlen == 8) { 2510 vlen_enc = Assembler::AVX_256bit; 2511 } 2512 for (int i = log2(vlen) - 1; i >=0; i--) { 2513 if (i == 0 && !is_dst_valid) { 2514 wdst = dst; 2515 } 2516 if (i == 1) { 2517 vextracti128_high(wtmp, wsrc); 2518 } else if (i == 2) { 2519 vextracti64x4_high(wtmp, wsrc); 2520 } else { 2521 assert(i == 0, "%d", i); 2522 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2523 } 2524 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2525 wsrc = wdst; 2526 vlen_enc = Assembler::AVX_128bit; 2527 } 2528 if (is_dst_valid) { 2529 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2530 } 2531 } 2532 2533 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2534 switch (bt) { 2535 case T_BYTE: pextrb(dst, src, idx); break; 2536 case T_SHORT: pextrw(dst, src, idx); break; 2537 case T_INT: pextrd(dst, src, idx); break; 2538 case T_LONG: pextrq(dst, src, idx); break; 2539 2540 default: 2541 assert(false,"Should not reach here."); 2542 break; 2543 } 2544 } 2545 2546 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2547 int esize = type2aelembytes(typ); 2548 int elem_per_lane = 16/esize; 2549 int lane = elemindex / elem_per_lane; 2550 int eindex = elemindex % elem_per_lane; 2551 2552 if (lane >= 2) { 2553 assert(UseAVX > 2, "required"); 2554 vextractf32x4(dst, src, lane & 3); 2555 return dst; 2556 } else if (lane > 0) { 2557 assert(UseAVX > 0, "required"); 2558 vextractf128(dst, src, lane); 2559 return dst; 2560 } else { 2561 return src; 2562 } 2563 } 2564 2565 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2566 if (typ == T_BYTE) { 2567 movsbl(dst, dst); 2568 } else if (typ == T_SHORT) { 2569 movswl(dst, dst); 2570 } 2571 } 2572 2573 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2574 int esize = type2aelembytes(typ); 2575 int elem_per_lane = 16/esize; 2576 int eindex = elemindex % elem_per_lane; 2577 assert(is_integral_type(typ),"required"); 2578 2579 if (eindex == 0) { 2580 if (typ == T_LONG) { 2581 movq(dst, src); 2582 } else { 2583 movdl(dst, src); 2584 movsxl(typ, dst); 2585 } 2586 } else { 2587 extract(typ, dst, src, eindex); 2588 movsxl(typ, dst); 2589 } 2590 } 2591 2592 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
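// Move the float/double element at eindex (within its 128-bit lane) into the low element
// of dst; the upper bits of dst are cleared (via movq for doubles, via the 32-bit mask
// below for floats).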
2593 int esize = type2aelembytes(typ); 2594 int elem_per_lane = 16/esize; 2595 int eindex = elemindex % elem_per_lane; 2596 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2597 2598 if (eindex == 0) { 2599 movq(dst, src); 2600 } else { 2601 if (typ == T_FLOAT) { 2602 if (UseAVX == 0) { 2603 movdqu(dst, src); 2604 shufps(dst, dst, eindex); 2605 } else { 2606 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2607 } 2608 } else { 2609 if (UseAVX == 0) { 2610 movdqu(dst, src); 2611 psrldq(dst, eindex*esize); 2612 } else { 2613 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2614 } 2615 movq(dst, dst); 2616 } 2617 } 2618 // Zero upper bits 2619 if (typ == T_FLOAT) { 2620 if (UseAVX == 0) { 2621 assert(vtmp != xnoreg, "required."); 2622 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2623 pand(dst, vtmp); 2624 } else { 2625 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2626 } 2627 } 2628 } 2629 2630 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2631 switch(typ) { 2632 case T_BYTE: 2633 case T_BOOLEAN: 2634 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2635 break; 2636 case T_SHORT: 2637 case T_CHAR: 2638 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2639 break; 2640 case T_INT: 2641 case T_FLOAT: 2642 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2643 break; 2644 case T_LONG: 2645 case T_DOUBLE: 2646 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2647 break; 2648 default: 2649 assert(false,"Should not reach here."); 2650 break; 2651 } 2652 } 2653 2654 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2655 assert(rscratch != noreg || always_reachable(src2), "missing"); 2656 2657 switch(typ) { 2658 case T_BOOLEAN: 2659 case T_BYTE: 2660 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2661 break; 2662 case T_CHAR: 2663 case T_SHORT: 2664 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2665 break; 2666 case T_INT: 2667 case T_FLOAT: 2668 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2669 break; 2670 case T_LONG: 2671 case T_DOUBLE: 2672 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2673 break; 2674 default: 2675 assert(false,"Should not reach here."); 2676 break; 2677 } 2678 } 2679 2680 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2681 switch(typ) { 2682 case T_BYTE: 2683 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2684 break; 2685 case T_SHORT: 2686 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2687 break; 2688 case T_INT: 2689 case T_FLOAT: 2690 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2691 break; 2692 case T_LONG: 2693 case T_DOUBLE: 2694 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2695 break; 2696 default: 2697 assert(false,"Should not reach here."); 2698 break; 2699 } 2700 } 2701 2702 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2703 
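// Set the CPU flags from a (v)ptest/(v)testps of src1 against src2 so the caller can
// branch on whether the masked lanes are all set or all clear (ZF/CF semantics of ptest).
// For vectors shorter than 16 bytes the low part of src1 is first duplicated into the
// full register (pshufd into vtmp) so stale upper bits cannot influence the result.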
assert(vlen_in_bytes <= 32, ""); 2704 int esize = type2aelembytes(bt); 2705 if (vlen_in_bytes == 32) { 2706 assert(vtmp == xnoreg, "required."); 2707 if (esize >= 4) { 2708 vtestps(src1, src2, AVX_256bit); 2709 } else { 2710 vptest(src1, src2, AVX_256bit); 2711 } 2712 return; 2713 } 2714 if (vlen_in_bytes < 16) { 2715 // Duplicate the lower part to fill the whole register, 2716 // Don't need to do so for src2 2717 assert(vtmp != xnoreg, "required"); 2718 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2719 pshufd(vtmp, src1, shuffle_imm); 2720 } else { 2721 assert(vtmp == xnoreg, "required"); 2722 vtmp = src1; 2723 } 2724 if (esize >= 4 && VM_Version::supports_avx()) { 2725 vtestps(vtmp, src2, AVX_128bit); 2726 } else { 2727 ptest(vtmp, src2); 2728 } 2729 } 2730 2731 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2732 #ifdef ASSERT 2733 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2734 bool is_bw_supported = VM_Version::supports_avx512bw(); 2735 if (is_bw && !is_bw_supported) { 2736 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2737 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2738 "XMM register should be 0-15"); 2739 } 2740 #endif // ASSERT 2741 switch (elem_bt) { 2742 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2743 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2744 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2745 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2746 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2747 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2748 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2749 } 2750 } 2751 2752 #ifdef _LP64 2753 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2754 assert(UseAVX >= 2, "required"); 2755 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2756 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2757 if ((UseAVX > 2) && 2758 (!is_bw || VM_Version::supports_avx512bw()) && 2759 (!is_vl || VM_Version::supports_avx512vl())) { 2760 switch (elem_bt) { 2761 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2762 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2763 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2764 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2765 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2766 } 2767 } else { 2768 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2769 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2770 switch (elem_bt) { 2771 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2772 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2773 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2774 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2775 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2776 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2777 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2778 } 2779 } 2780 } 2781 #endif 2782 2783 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2784 switch (to_elem_bt) { 2785 case T_SHORT: 2786 vpmovsxbw(dst, src, vlen_enc); 2787 break; 2788 case T_INT: 2789 
vpmovsxbd(dst, src, vlen_enc); 2790 break; 2791 case T_FLOAT: 2792 vpmovsxbd(dst, src, vlen_enc); 2793 vcvtdq2ps(dst, dst, vlen_enc); 2794 break; 2795 case T_LONG: 2796 vpmovsxbq(dst, src, vlen_enc); 2797 break; 2798 case T_DOUBLE: { 2799 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2800 vpmovsxbd(dst, src, mid_vlen_enc); 2801 vcvtdq2pd(dst, dst, vlen_enc); 2802 break; 2803 } 2804 default: 2805 fatal("Unsupported type %s", type2name(to_elem_bt)); 2806 break; 2807 } 2808 } 2809 2810 //------------------------------------------------------------------------------------------- 2811 2812 // IndexOf for constant substrings with size >= 8 chars 2813 // which don't need to be loaded through stack. 2814 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2815 Register cnt1, Register cnt2, 2816 int int_cnt2, Register result, 2817 XMMRegister vec, Register tmp, 2818 int ae) { 2819 ShortBranchVerifier sbv(this); 2820 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2821 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2822 2823 // This method uses the pcmpestri instruction with bound registers 2824 // inputs: 2825 // xmm - substring 2826 // rax - substring length (elements count) 2827 // mem - scanned string 2828 // rdx - string length (elements count) 2829 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2830 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2831 // outputs: 2832 // rcx - matched index in string 2833 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2834 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2835 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2836 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2837 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2838 2839 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2840 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2841 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2842 2843 // Note, inline_string_indexOf() generates checks: 2844 // if (substr.count > string.count) return -1; 2845 // if (substr.count == 0) return 0; 2846 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2847 2848 // Load substring. 2849 if (ae == StrIntrinsicNode::UL) { 2850 pmovzxbw(vec, Address(str2, 0)); 2851 } else { 2852 movdqu(vec, Address(str2, 0)); 2853 } 2854 movl(cnt2, int_cnt2); 2855 movptr(result, str1); // string addr 2856 2857 if (int_cnt2 > stride) { 2858 jmpb(SCAN_TO_SUBSTR); 2859 2860 // Reload substr for rescan, this code 2861 // is executed only for large substrings (> 8 chars) 2862 bind(RELOAD_SUBSTR); 2863 if (ae == StrIntrinsicNode::UL) { 2864 pmovzxbw(vec, Address(str2, 0)); 2865 } else { 2866 movdqu(vec, Address(str2, 0)); 2867 } 2868 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2869 2870 bind(RELOAD_STR); 2871 // We came here after the beginning of the substring was 2872 // matched but the rest of it was not so we need to search 2873 // again. Start from the next element after the previous match. 2874 2875 // cnt2 is number of substring remaining elements and 2876 // cnt1 is number of string remaining elements when cmp failed.
2877 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2878 subl(cnt1, cnt2); 2879 addl(cnt1, int_cnt2); 2880 movl(cnt2, int_cnt2); // Now restore cnt2 2881 2882 decrementl(cnt1); // Shift to next element 2883 cmpl(cnt1, cnt2); 2884 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2885 2886 addptr(result, (1<<scale1)); 2887 2888 } // (int_cnt2 > 8) 2889 2890 // Scan string for start of substr in 16-byte vectors 2891 bind(SCAN_TO_SUBSTR); 2892 pcmpestri(vec, Address(result, 0), mode); 2893 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2894 subl(cnt1, stride); 2895 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2896 cmpl(cnt1, cnt2); 2897 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2898 addptr(result, 16); 2899 jmpb(SCAN_TO_SUBSTR); 2900 2901 // Found a potential substr 2902 bind(FOUND_CANDIDATE); 2903 // Matched whole vector if first element matched (tmp(rcx) == 0). 2904 if (int_cnt2 == stride) { 2905 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2906 } else { // int_cnt2 > 8 2907 jccb(Assembler::overflow, FOUND_SUBSTR); 2908 } 2909 // After pcmpestri tmp(rcx) contains matched element index 2910 // Compute start addr of substr 2911 lea(result, Address(result, tmp, scale1)); 2912 2913 // Make sure string is still long enough 2914 subl(cnt1, tmp); 2915 cmpl(cnt1, cnt2); 2916 if (int_cnt2 == stride) { 2917 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2918 } else { // int_cnt2 > 8 2919 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2920 } 2921 // Left less than substring. 2922 2923 bind(RET_NOT_FOUND); 2924 movl(result, -1); 2925 jmp(EXIT); 2926 2927 if (int_cnt2 > stride) { 2928 // This code is optimized for the case when whole substring 2929 // is matched if its head is matched. 2930 bind(MATCH_SUBSTR_HEAD); 2931 pcmpestri(vec, Address(result, 0), mode); 2932 // Reload only the string if it does not match 2933 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2934 2935 Label CONT_SCAN_SUBSTR; 2936 // Compare the rest of substring (> 8 chars). 2937 bind(FOUND_SUBSTR); 2938 // First 8 chars are already matched.
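// Turn cnt2 into a negative index relative to the end of the substring (cnt2 = stride - cnt2);
// the SCAN_SUBSTR loop below advances it by 'stride' until it becomes non-negative.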
2939 negptr(cnt2); 2940 addptr(cnt2, stride); 2941 2942 bind(SCAN_SUBSTR); 2943 subl(cnt1, stride); 2944 cmpl(cnt2, -stride); // Do not read beyond substring 2945 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2946 // Back-up strings to avoid reading beyond substring: 2947 // cnt1 = cnt1 - cnt2 + 8 2948 addl(cnt1, cnt2); // cnt2 is negative 2949 addl(cnt1, stride); 2950 movl(cnt2, stride); negptr(cnt2); 2951 bind(CONT_SCAN_SUBSTR); 2952 if (int_cnt2 < (int)G) { 2953 int tail_off1 = int_cnt2<<scale1; 2954 int tail_off2 = int_cnt2<<scale2; 2955 if (ae == StrIntrinsicNode::UL) { 2956 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2957 } else { 2958 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2959 } 2960 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2961 } else { 2962 // calculate index in register to avoid integer overflow (int_cnt2*2) 2963 movl(tmp, int_cnt2); 2964 addptr(tmp, cnt2); 2965 if (ae == StrIntrinsicNode::UL) { 2966 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2967 } else { 2968 movdqu(vec, Address(str2, tmp, scale2, 0)); 2969 } 2970 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2971 } 2972 // Need to reload strings pointers if not matched whole vector 2973 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2974 addptr(cnt2, stride); 2975 jcc(Assembler::negative, SCAN_SUBSTR); 2976 // Fall through if found full substring 2977 2978 } // (int_cnt2 > 8) 2979 2980 bind(RET_FOUND); 2981 // Found result if we matched full small substring. 2982 // Compute substr offset 2983 subptr(result, str1); 2984 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2985 shrl(result, 1); // index 2986 } 2987 bind(EXIT); 2988 2989 } // string_indexofC8 2990 2991 // Small strings are loaded through stack if they cross page boundary. 2992 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2993 Register cnt1, Register cnt2, 2994 int int_cnt2, Register result, 2995 XMMRegister vec, Register tmp, 2996 int ae) { 2997 ShortBranchVerifier sbv(this); 2998 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2999 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3000 3001 // 3002 // int_cnt2 is length of small (< 8 chars) constant substring 3003 // or (-1) for non constant substring in which case its length 3004 // is in cnt2 register. 3005 // 3006 // Note, inline_string_indexOf() generates checks: 3007 // if (substr.count > string.count) return -1; 3008 // if (substr.count == 0) return 0; 3009 // 3010 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3011 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3012 // This method uses the pcmpestri instruction with bound registers 3013 // inputs: 3014 // xmm - substring 3015 // rax - substring length (elements count) 3016 // mem - scanned string 3017 // rdx - string length (elements count) 3018 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3019 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3020 // outputs: 3021 // rcx - matched index in string 3022 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3023 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3024 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3025 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3026 3027 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3028 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3029 FOUND_CANDIDATE; 3030 3031 { //======================================================== 3032 // We don't know where these strings are located 3033 // and we can't read beyond them. Load them through stack. 3034 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3035 3036 movptr(tmp, rsp); // save old SP 3037 3038 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3039 if (int_cnt2 == (1>>scale2)) { // One byte 3040 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3041 load_unsigned_byte(result, Address(str2, 0)); 3042 movdl(vec, result); // move 32 bits 3043 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3044 // Not enough header space in 32-bit VM: 12+3 = 15. 3045 movl(result, Address(str2, -1)); 3046 shrl(result, 8); 3047 movdl(vec, result); // move 32 bits 3048 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3049 load_unsigned_short(result, Address(str2, 0)); 3050 movdl(vec, result); // move 32 bits 3051 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3052 movdl(vec, Address(str2, 0)); // move 32 bits 3053 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3054 movq(vec, Address(str2, 0)); // move 64 bits 3055 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7}) 3056 // Array header size is 12 bytes in 32-bit VM 3057 // + 6 bytes for 3 chars == 18 bytes, 3058 // enough space to load vec and shift. 3059 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3060 if (ae == StrIntrinsicNode::UL) { 3061 int tail_off = int_cnt2-8; 3062 pmovzxbw(vec, Address(str2, tail_off)); 3063 psrldq(vec, -2*tail_off); 3064 } 3065 else { 3066 int tail_off = int_cnt2*(1<<scale2); 3067 movdqu(vec, Address(str2, tail_off-16)); 3068 psrldq(vec, 16-tail_off); 3069 } 3070 } 3071 } else { // not constant substring 3072 cmpl(cnt2, stride); 3073 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3074 3075 // We can read beyond string if str+16 does not cross a page boundary 3076 // since heaps are aligned and mapped by pages. 3077 assert(os::vm_page_size() < (int)G, "default page should be small"); 3078 movl(result, str2); // We need only low 32 bits 3079 andl(result, ((int)os::vm_page_size()-1)); 3080 cmpl(result, ((int)os::vm_page_size()-16)); 3081 jccb(Assembler::belowEqual, CHECK_STR); 3082 3083 // Move small strings to stack to allow loading 16 bytes into vec. 3084 subptr(rsp, 16); 3085 int stk_offset = wordSize-(1<<scale2); 3086 push(cnt2); 3087 3088 bind(COPY_SUBSTR); 3089 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3090 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3091 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3092 } else if (ae == StrIntrinsicNode::UU) { 3093 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3094 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3095 } 3096 decrement(cnt2); 3097 jccb(Assembler::notZero, COPY_SUBSTR); 3098 3099 pop(cnt2); 3100 movptr(str2, rsp); // New substring address 3101 } // non constant 3102 3103 bind(CHECK_STR); 3104 cmpl(cnt1, stride); 3105 jccb(Assembler::aboveEqual, BIG_STRINGS); 3106 3107 // Check cross page boundary.
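// If (str1 & (page_size-1)) <= page_size-16, a 16-byte load at str1 stays within the page,
// so the string can be read in place (BIG_STRINGS); otherwise it is copied to the stack below.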
3108 movl(result, str1); // We need only low 32 bits 3109 andl(result, ((int)os::vm_page_size()-1)); 3110 cmpl(result, ((int)os::vm_page_size()-16)); 3111 jccb(Assembler::belowEqual, BIG_STRINGS); 3112 3113 subptr(rsp, 16); 3114 int stk_offset = -(1<<scale1); 3115 if (int_cnt2 < 0) { // not constant 3116 push(cnt2); 3117 stk_offset += wordSize; 3118 } 3119 movl(cnt2, cnt1); 3120 3121 bind(COPY_STR); 3122 if (ae == StrIntrinsicNode::LL) { 3123 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3124 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3125 } else { 3126 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3127 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3128 } 3129 decrement(cnt2); 3130 jccb(Assembler::notZero, COPY_STR); 3131 3132 if (int_cnt2 < 0) { // not constant 3133 pop(cnt2); 3134 } 3135 movptr(str1, rsp); // New string address 3136 3137 bind(BIG_STRINGS); 3138 // Load substring. 3139 if (int_cnt2 < 0) { // -1 3140 if (ae == StrIntrinsicNode::UL) { 3141 pmovzxbw(vec, Address(str2, 0)); 3142 } else { 3143 movdqu(vec, Address(str2, 0)); 3144 } 3145 push(cnt2); // substr count 3146 push(str2); // substr addr 3147 push(str1); // string addr 3148 } else { 3149 // Small (< 8 chars) constant substrings are loaded already. 3150 movl(cnt2, int_cnt2); 3151 } 3152 push(tmp); // original SP 3153 3154 } // Finished loading 3155 3156 //======================================================== 3157 // Start search 3158 // 3159 3160 movptr(result, str1); // string addr 3161 3162 if (int_cnt2 < 0) { // Only for non constant substring 3163 jmpb(SCAN_TO_SUBSTR); 3164 3165 // SP saved at sp+0 3166 // String saved at sp+1*wordSize 3167 // Substr saved at sp+2*wordSize 3168 // Substr count saved at sp+3*wordSize 3169 3170 // Reload substr for rescan, this code 3171 // is executed only for large substrings (> 8 chars) 3172 bind(RELOAD_SUBSTR); 3173 movptr(str2, Address(rsp, 2*wordSize)); 3174 movl(cnt2, Address(rsp, 3*wordSize)); 3175 if (ae == StrIntrinsicNode::UL) { 3176 pmovzxbw(vec, Address(str2, 0)); 3177 } else { 3178 movdqu(vec, Address(str2, 0)); 3179 } 3180 // We came here after the beginning of the substring was 3181 // matched but the rest of it was not so we need to search 3182 // again. Start from the next element after the previous match. 3183 subptr(str1, result); // Restore counter 3184 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3185 shrl(str1, 1); 3186 } 3187 addl(cnt1, str1); 3188 decrementl(cnt1); // Shift to next element 3189 cmpl(cnt1, cnt2); 3190 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3191 3192 addptr(result, (1<<scale1)); 3193 } // non constant 3194 3195 // Scan string for start of substr in 16-byte vectors 3196 bind(SCAN_TO_SUBSTR); 3197 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3198 pcmpestri(vec, Address(result, 0), mode); 3199 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3200 subl(cnt1, stride); 3201 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3202 cmpl(cnt1, cnt2); 3203 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3204 addptr(result, 16); 3205 3206 bind(ADJUST_STR); 3207 cmpl(cnt1, stride); // Do not read beyond string 3208 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3209 // Back-up string to avoid reading beyond string. 
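  // Illustration: with cnt1 elements left, result + cnt1*scale1 is the end of the
  // string, so the lea below rewinds result to (end - 16) and cnt1 is reset to a
  // full stride. The next 16-byte window therefore ends exactly at the last
  // element; it re-scans a few already examined elements at the cost of a little
  // redundant work, but never reads past the end of the string.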
3210 lea(result, Address(result, cnt1, scale1, -16)); 3211 movl(cnt1, stride); 3212 jmpb(SCAN_TO_SUBSTR); 3213 3214 // Found a potential substr 3215 bind(FOUND_CANDIDATE); 3216 // After pcmpestri tmp(rcx) contains matched element index 3217 3218 // Make sure string is still long enough 3219 subl(cnt1, tmp); 3220 cmpl(cnt1, cnt2); 3221 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3222 // Left less then substring. 3223 3224 bind(RET_NOT_FOUND); 3225 movl(result, -1); 3226 jmp(CLEANUP); 3227 3228 bind(FOUND_SUBSTR); 3229 // Compute start addr of substr 3230 lea(result, Address(result, tmp, scale1)); 3231 if (int_cnt2 > 0) { // Constant substring 3232 // Repeat search for small substring (< 8 chars) 3233 // from new point without reloading substring. 3234 // Have to check that we don't read beyond string. 3235 cmpl(tmp, stride-int_cnt2); 3236 jccb(Assembler::greater, ADJUST_STR); 3237 // Fall through if matched whole substring. 3238 } else { // non constant 3239 assert(int_cnt2 == -1, "should be != 0"); 3240 3241 addl(tmp, cnt2); 3242 // Found result if we matched whole substring. 3243 cmpl(tmp, stride); 3244 jcc(Assembler::lessEqual, RET_FOUND); 3245 3246 // Repeat search for small substring (<= 8 chars) 3247 // from new point 'str1' without reloading substring. 3248 cmpl(cnt2, stride); 3249 // Have to check that we don't read beyond string. 3250 jccb(Assembler::lessEqual, ADJUST_STR); 3251 3252 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3253 // Compare the rest of substring (> 8 chars). 3254 movptr(str1, result); 3255 3256 cmpl(tmp, cnt2); 3257 // First 8 chars are already matched. 3258 jccb(Assembler::equal, CHECK_NEXT); 3259 3260 bind(SCAN_SUBSTR); 3261 pcmpestri(vec, Address(str1, 0), mode); 3262 // Need to reload strings pointers if not matched whole vector 3263 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3264 3265 bind(CHECK_NEXT); 3266 subl(cnt2, stride); 3267 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3268 addptr(str1, 16); 3269 if (ae == StrIntrinsicNode::UL) { 3270 addptr(str2, 8); 3271 } else { 3272 addptr(str2, 16); 3273 } 3274 subl(cnt1, stride); 3275 cmpl(cnt2, stride); // Do not read beyond substring 3276 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3277 // Back-up strings to avoid reading beyond substring. 
3278 3279 if (ae == StrIntrinsicNode::UL) { 3280 lea(str2, Address(str2, cnt2, scale2, -8)); 3281 lea(str1, Address(str1, cnt2, scale1, -16)); 3282 } else { 3283 lea(str2, Address(str2, cnt2, scale2, -16)); 3284 lea(str1, Address(str1, cnt2, scale1, -16)); 3285 } 3286 subl(cnt1, cnt2); 3287 movl(cnt2, stride); 3288 addl(cnt1, stride); 3289 bind(CONT_SCAN_SUBSTR); 3290 if (ae == StrIntrinsicNode::UL) { 3291 pmovzxbw(vec, Address(str2, 0)); 3292 } else { 3293 movdqu(vec, Address(str2, 0)); 3294 } 3295 jmp(SCAN_SUBSTR); 3296 3297 bind(RET_FOUND_LONG); 3298 movptr(str1, Address(rsp, wordSize)); 3299 } // non constant 3300 3301 bind(RET_FOUND); 3302 // Compute substr offset 3303 subptr(result, str1); 3304 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3305 shrl(result, 1); // index 3306 } 3307 bind(CLEANUP); 3308 pop(rsp); // restore SP 3309 3310 } // string_indexof 3311 3312 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3313 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3314 ShortBranchVerifier sbv(this); 3315 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3316 3317 int stride = 8; 3318 3319 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3320 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3321 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3322 FOUND_SEQ_CHAR, DONE_LABEL; 3323 3324 movptr(result, str1); 3325 if (UseAVX >= 2) { 3326 cmpl(cnt1, stride); 3327 jcc(Assembler::less, SCAN_TO_CHAR); 3328 cmpl(cnt1, 2*stride); 3329 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3330 movdl(vec1, ch); 3331 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3332 vpxor(vec2, vec2); 3333 movl(tmp, cnt1); 3334 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3335 andl(cnt1,0x0000000F); //tail count (in chars) 3336 3337 bind(SCAN_TO_16_CHAR_LOOP); 3338 vmovdqu(vec3, Address(result, 0)); 3339 vpcmpeqw(vec3, vec3, vec1, 1); 3340 vptest(vec2, vec3); 3341 jcc(Assembler::carryClear, FOUND_CHAR); 3342 addptr(result, 32); 3343 subl(tmp, 2*stride); 3344 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3345 jmp(SCAN_TO_8_CHAR); 3346 bind(SCAN_TO_8_CHAR_INIT); 3347 movdl(vec1, ch); 3348 pshuflw(vec1, vec1, 0x00); 3349 pshufd(vec1, vec1, 0); 3350 pxor(vec2, vec2); 3351 } 3352 bind(SCAN_TO_8_CHAR); 3353 cmpl(cnt1, stride); 3354 jcc(Assembler::less, SCAN_TO_CHAR); 3355 if (UseAVX < 2) { 3356 movdl(vec1, ch); 3357 pshuflw(vec1, vec1, 0x00); 3358 pshufd(vec1, vec1, 0); 3359 pxor(vec2, vec2); 3360 } 3361 movl(tmp, cnt1); 3362 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3363 andl(cnt1,0x00000007); //tail count (in chars) 3364 3365 bind(SCAN_TO_8_CHAR_LOOP); 3366 movdqu(vec3, Address(result, 0)); 3367 pcmpeqw(vec3, vec1); 3368 ptest(vec2, vec3); 3369 jcc(Assembler::carryClear, FOUND_CHAR); 3370 addptr(result, 16); 3371 subl(tmp, stride); 3372 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3373 bind(SCAN_TO_CHAR); 3374 testl(cnt1, cnt1); 3375 jcc(Assembler::zero, RET_NOT_FOUND); 3376 bind(SCAN_TO_CHAR_LOOP); 3377 load_unsigned_short(tmp, Address(result, 0)); 3378 cmpl(ch, tmp); 3379 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3380 addptr(result, 2); 3381 subl(cnt1, 1); 3382 jccb(Assembler::zero, RET_NOT_FOUND); 3383 jmp(SCAN_TO_CHAR_LOOP); 3384 3385 bind(RET_NOT_FOUND); 3386 movl(result, -1); 3387 jmpb(DONE_LABEL); 3388 3389 bind(FOUND_CHAR); 3390 if (UseAVX >= 2) { 3391 vpmovmskb(tmp, vec3); 3392 } else { 3393 pmovmskb(tmp, vec3); 3394 } 3395 bsfl(ch, tmp); 3396 addptr(result, ch); 3397 3398 bind(FOUND_SEQ_CHAR); 3399 
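  // Both hit paths land here with result pointing at the matching char (the
  // vector path added the byte offset obtained from pmovmskb + bsf above).
  // Subtracting the base and shifting right by 1 converts that byte offset
  // into the char index returned to the caller.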
subptr(result, str1); 3400 shrl(result, 1); 3401 3402 bind(DONE_LABEL); 3403 } // string_indexof_char 3404 3405 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3406 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3407 ShortBranchVerifier sbv(this); 3408 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3409 3410 int stride = 16; 3411 3412 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3413 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3414 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3415 FOUND_SEQ_CHAR, DONE_LABEL; 3416 3417 movptr(result, str1); 3418 if (UseAVX >= 2) { 3419 cmpl(cnt1, stride); 3420 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3421 cmpl(cnt1, stride*2); 3422 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3423 movdl(vec1, ch); 3424 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3425 vpxor(vec2, vec2); 3426 movl(tmp, cnt1); 3427 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3428 andl(cnt1,0x0000001F); //tail count (in chars) 3429 3430 bind(SCAN_TO_32_CHAR_LOOP); 3431 vmovdqu(vec3, Address(result, 0)); 3432 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3433 vptest(vec2, vec3); 3434 jcc(Assembler::carryClear, FOUND_CHAR); 3435 addptr(result, 32); 3436 subl(tmp, stride*2); 3437 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3438 jmp(SCAN_TO_16_CHAR); 3439 3440 bind(SCAN_TO_16_CHAR_INIT); 3441 movdl(vec1, ch); 3442 pxor(vec2, vec2); 3443 pshufb(vec1, vec2); 3444 } 3445 3446 bind(SCAN_TO_16_CHAR); 3447 cmpl(cnt1, stride); 3448 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3449 if (UseAVX < 2) { 3450 movdl(vec1, ch); 3451 pxor(vec2, vec2); 3452 pshufb(vec1, vec2); 3453 } 3454 movl(tmp, cnt1); 3455 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3456 andl(cnt1,0x0000000F); //tail count (in bytes) 3457 3458 bind(SCAN_TO_16_CHAR_LOOP); 3459 movdqu(vec3, Address(result, 0)); 3460 pcmpeqb(vec3, vec1); 3461 ptest(vec2, vec3); 3462 jcc(Assembler::carryClear, FOUND_CHAR); 3463 addptr(result, 16); 3464 subl(tmp, stride); 3465 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
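  // Fewer than 16 bytes remain (the tail count left in cnt1), so the code below
  // finishes with a byte-at-a-time scan. A vector hit above jumps to FOUND_CHAR
  // instead, where pmovmskb + bsf turn the compare mask into the offset of the
  // first matching byte.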
3466 3467 bind(SCAN_TO_CHAR_INIT); 3468 testl(cnt1, cnt1); 3469 jcc(Assembler::zero, RET_NOT_FOUND); 3470 bind(SCAN_TO_CHAR_LOOP); 3471 load_unsigned_byte(tmp, Address(result, 0)); 3472 cmpl(ch, tmp); 3473 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3474 addptr(result, 1); 3475 subl(cnt1, 1); 3476 jccb(Assembler::zero, RET_NOT_FOUND); 3477 jmp(SCAN_TO_CHAR_LOOP); 3478 3479 bind(RET_NOT_FOUND); 3480 movl(result, -1); 3481 jmpb(DONE_LABEL); 3482 3483 bind(FOUND_CHAR); 3484 if (UseAVX >= 2) { 3485 vpmovmskb(tmp, vec3); 3486 } else { 3487 pmovmskb(tmp, vec3); 3488 } 3489 bsfl(ch, tmp); 3490 addptr(result, ch); 3491 3492 bind(FOUND_SEQ_CHAR); 3493 subptr(result, str1); 3494 3495 bind(DONE_LABEL); 3496 } // stringL_indexof_char 3497 3498 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3499 switch (eltype) { 3500 case T_BOOLEAN: return sizeof(jboolean); 3501 case T_BYTE: return sizeof(jbyte); 3502 case T_SHORT: return sizeof(jshort); 3503 case T_CHAR: return sizeof(jchar); 3504 case T_INT: return sizeof(jint); 3505 default: 3506 ShouldNotReachHere(); 3507 return -1; 3508 } 3509 } 3510 3511 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3512 switch (eltype) { 3513 // T_BOOLEAN used as surrogate for unsigned byte 3514 case T_BOOLEAN: movzbl(dst, src); break; 3515 case T_BYTE: movsbl(dst, src); break; 3516 case T_SHORT: movswl(dst, src); break; 3517 case T_CHAR: movzwl(dst, src); break; 3518 case T_INT: movl(dst, src); break; 3519 default: 3520 ShouldNotReachHere(); 3521 } 3522 } 3523 3524 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3525 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3526 } 3527 3528 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3529 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3530 } 3531 3532 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3533 const int vlen = Assembler::AVX_256bit; 3534 switch (eltype) { 3535 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3536 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3537 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3538 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3539 case T_INT: 3540 // do nothing 3541 break; 3542 default: 3543 ShouldNotReachHere(); 3544 } 3545 } 3546 3547 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3548 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3549 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3550 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3551 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3552 BasicType eltype) { 3553 ShortBranchVerifier sbv(this); 3554 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3555 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3556 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3557 3558 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3559 SHORT_UNROLLED_LOOP_EXIT, 3560 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3561 UNROLLED_VECTOR_LOOP_BEGIN, 3562 END; 3563 switch (eltype) { 3564 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3565 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3566 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3567 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3568 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3569 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3570 } 3571 3572 // For "renaming" for readibility of the code 3573 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3574 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3575 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3576 3577 const int elsize = arrays_hashcode_elsize(eltype); 3578 3579 /* 3580 if (cnt1 >= 2) { 3581 if (cnt1 >= 32) { 3582 UNROLLED VECTOR LOOP 3583 } 3584 UNROLLED SCALAR LOOP 3585 } 3586 SINGLE SCALAR 3587 */ 3588 3589 cmpl(cnt1, 32); 3590 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3591 3592 // cnt1 >= 32 && generate_vectorized_loop 3593 xorl(index, index); 3594 3595 // vresult = IntVector.zero(I256); 3596 for (int idx = 0; idx < 4; idx++) { 3597 vpxor(vresult[idx], vresult[idx]); 3598 } 3599 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3600 Register bound = tmp2; 3601 Register next = tmp3; 3602 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3603 movl(next, Address(tmp2, 0)); 3604 movdl(vnext, next); 3605 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3606 3607 // index = 0; 3608 // bound = cnt1 & ~(32 - 1); 3609 movl(bound, cnt1); 3610 andl(bound, ~(32 - 1)); 3611 // for (; index < bound; index += 32) { 3612 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3613 // result *= next; 3614 imull(result, next); 3615 // loop fission to upfront the cost of fetching from memory, OOO execution 3616 // can then hopefully do a better job of prefetching 3617 for (int idx = 0; idx < 4; idx++) { 3618 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3619 } 3620 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3621 for (int idx = 0; idx < 4; idx++) { 3622 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3623 arrays_hashcode_elvcast(vtmp[idx], eltype); 3624 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3625 } 3626 // index += 32; 3627 addl(index, 32); 3628 // index < bound; 3629 cmpl(index, bound); 3630 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3631 // } 3632 3633 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3634 subl(cnt1, bound); 3635 // release bound 3636 3637 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3638 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3639 for (int idx = 0; idx < 4; idx++) { 3640 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT); 3641 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3642 } 3643 // result += vresult.reduceLanes(ADD); 3644 for (int idx = 0; idx < 4; idx++) { 3645 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3646 } 3647 3648 // } else if (cnt1 < 32) { 3649 3650 bind(SHORT_UNROLLED_BEGIN); 3651 // int i = 1; 3652 movl(index, 1); 3653 cmpl(index, cnt1); 3654 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3655 3656 // for (; i < cnt1 ; i += 2) { 3657 bind(SHORT_UNROLLED_LOOP_BEGIN); 3658 movl(tmp3, 961); 3659 imull(result, tmp3); 
3660 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3661 movl(tmp3, tmp2); 3662 shll(tmp3, 5); 3663 subl(tmp3, tmp2); 3664 addl(result, tmp3); 3665 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3666 addl(result, tmp3); 3667 addl(index, 2); 3668 cmpl(index, cnt1); 3669 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3670 3671 // } 3672 // if (i >= cnt1) { 3673 bind(SHORT_UNROLLED_LOOP_EXIT); 3674 jccb(Assembler::greater, END); 3675 movl(tmp2, result); 3676 shll(result, 5); 3677 subl(result, tmp2); 3678 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3679 addl(result, tmp3); 3680 // } 3681 bind(END); 3682 3683 BLOCK_COMMENT("} // arrays_hashcode"); 3684 3685 } // arrays_hashcode 3686 3687 // helper function for string_compare 3688 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3689 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3690 Address::ScaleFactor scale2, Register index, int ae) { 3691 if (ae == StrIntrinsicNode::LL) { 3692 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3693 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3694 } else if (ae == StrIntrinsicNode::UU) { 3695 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3696 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3697 } else { 3698 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3699 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3700 } 3701 } 3702 3703 // Compare strings, used for char[] and byte[]. 3704 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3705 Register cnt1, Register cnt2, Register result, 3706 XMMRegister vec1, int ae, KRegister mask) { 3707 ShortBranchVerifier sbv(this); 3708 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3709 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3710 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3711 int stride2x2 = 0x40; 3712 Address::ScaleFactor scale = Address::no_scale; 3713 Address::ScaleFactor scale1 = Address::no_scale; 3714 Address::ScaleFactor scale2 = Address::no_scale; 3715 3716 if (ae != StrIntrinsicNode::LL) { 3717 stride2x2 = 0x20; 3718 } 3719 3720 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3721 shrl(cnt2, 1); 3722 } 3723 // Compute the minimum of the string lengths and the 3724 // difference of the string lengths (stack). 3725 // Do the conditional move stuff 3726 movl(result, cnt1); 3727 subl(cnt1, cnt2); 3728 push(cnt1); 3729 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3730 3731 // Is the minimum length zero? 
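  // At this point cnt2 holds min(len1, len2); the signed difference len1 - len2
  // pushed above is popped as the result at LENGTH_DIFF_LABEL when one string
  // is a prefix of the other.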
3732 testl(cnt2, cnt2); 3733 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3734 if (ae == StrIntrinsicNode::LL) { 3735 // Load first bytes 3736 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3737 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3738 } else if (ae == StrIntrinsicNode::UU) { 3739 // Load first characters 3740 load_unsigned_short(result, Address(str1, 0)); 3741 load_unsigned_short(cnt1, Address(str2, 0)); 3742 } else { 3743 load_unsigned_byte(result, Address(str1, 0)); 3744 load_unsigned_short(cnt1, Address(str2, 0)); 3745 } 3746 subl(result, cnt1); 3747 jcc(Assembler::notZero, POP_LABEL); 3748 3749 if (ae == StrIntrinsicNode::UU) { 3750 // Divide length by 2 to get number of chars 3751 shrl(cnt2, 1); 3752 } 3753 cmpl(cnt2, 1); 3754 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3755 3756 // Check if the strings start at the same location and setup scale and stride 3757 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3758 cmpptr(str1, str2); 3759 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3760 if (ae == StrIntrinsicNode::LL) { 3761 scale = Address::times_1; 3762 stride = 16; 3763 } else { 3764 scale = Address::times_2; 3765 stride = 8; 3766 } 3767 } else { 3768 scale1 = Address::times_1; 3769 scale2 = Address::times_2; 3770 // scale not used 3771 stride = 8; 3772 } 3773 3774 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3775 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3776 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3777 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3778 Label COMPARE_TAIL_LONG; 3779 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3780 3781 int pcmpmask = 0x19; 3782 if (ae == StrIntrinsicNode::LL) { 3783 pcmpmask &= ~0x01; 3784 } 3785 3786 // Setup to compare 16-chars (32-bytes) vectors, 3787 // start from first character again because it has aligned address. 3788 if (ae == StrIntrinsicNode::LL) { 3789 stride2 = 32; 3790 } else { 3791 stride2 = 16; 3792 } 3793 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3794 adr_stride = stride << scale; 3795 } else { 3796 adr_stride1 = 8; //stride << scale1; 3797 adr_stride2 = 16; //stride << scale2; 3798 } 3799 3800 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3801 // rax and rdx are used by pcmpestri as elements counters 3802 movl(result, cnt2); 3803 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3804 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3805 3806 // fast path : compare first 2 8-char vectors. 
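  // pcmpestri imm8: 0x19 selects unsigned words (UU and the widened UL case),
  // 0x18 unsigned bytes (LL), both with "equal each" aggregation and negated
  // polarity. rcx then receives the index of the first differing element and
  // CF is set when a difference exists, so jcc(below, ...) takes the mismatch
  // handler.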
3807 bind(COMPARE_16_CHARS); 3808 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3809 movdqu(vec1, Address(str1, 0)); 3810 } else { 3811 pmovzxbw(vec1, Address(str1, 0)); 3812 } 3813 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3814 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3815 3816 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3817 movdqu(vec1, Address(str1, adr_stride)); 3818 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3819 } else { 3820 pmovzxbw(vec1, Address(str1, adr_stride1)); 3821 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3822 } 3823 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3824 addl(cnt1, stride); 3825 3826 // Compare the characters at index in cnt1 3827 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3828 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3829 subl(result, cnt2); 3830 jmp(POP_LABEL); 3831 3832 // Setup the registers to start vector comparison loop 3833 bind(COMPARE_WIDE_VECTORS); 3834 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3835 lea(str1, Address(str1, result, scale)); 3836 lea(str2, Address(str2, result, scale)); 3837 } else { 3838 lea(str1, Address(str1, result, scale1)); 3839 lea(str2, Address(str2, result, scale2)); 3840 } 3841 subl(result, stride2); 3842 subl(cnt2, stride2); 3843 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3844 negptr(result); 3845 3846 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3847 bind(COMPARE_WIDE_VECTORS_LOOP); 3848 3849 #ifdef _LP64 3850 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3851 cmpl(cnt2, stride2x2); 3852 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3853 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3854 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3855 3856 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3857 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3858 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3859 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3860 } else { 3861 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3862 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3863 } 3864 kortestql(mask, mask); 3865 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3866 addptr(result, stride2x2); // update since we already compared at this addr 3867 subl(cnt2, stride2x2); // and sub the size too 3868 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3869 3870 vpxor(vec1, vec1); 3871 jmpb(COMPARE_WIDE_TAIL); 3872 }//if (VM_Version::supports_avx512vlbw()) 3873 #endif // _LP64 3874 3875 3876 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3877 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3878 vmovdqu(vec1, Address(str1, result, scale)); 3879 vpxor(vec1, Address(str2, result, scale)); 3880 } else { 3881 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3882 vpxor(vec1, Address(str2, result, scale2)); 3883 } 3884 vptest(vec1, vec1); 3885 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3886 addptr(result, stride2); 3887 subl(cnt2, stride2); 3888 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3889 // clean upper bits of YMM registers 
3890 vpxor(vec1, vec1); 3891 3892 // compare wide vectors tail 3893 bind(COMPARE_WIDE_TAIL); 3894 testptr(result, result); 3895 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3896 3897 movl(result, stride2); 3898 movl(cnt2, result); 3899 negptr(result); 3900 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3901 3902 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3903 bind(VECTOR_NOT_EQUAL); 3904 // clean upper bits of YMM registers 3905 vpxor(vec1, vec1); 3906 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3907 lea(str1, Address(str1, result, scale)); 3908 lea(str2, Address(str2, result, scale)); 3909 } else { 3910 lea(str1, Address(str1, result, scale1)); 3911 lea(str2, Address(str2, result, scale2)); 3912 } 3913 jmp(COMPARE_16_CHARS); 3914 3915 // Compare tail chars, length between 1 to 15 chars 3916 bind(COMPARE_TAIL_LONG); 3917 movl(cnt2, result); 3918 cmpl(cnt2, stride); 3919 jcc(Assembler::less, COMPARE_SMALL_STR); 3920 3921 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3922 movdqu(vec1, Address(str1, 0)); 3923 } else { 3924 pmovzxbw(vec1, Address(str1, 0)); 3925 } 3926 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3927 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3928 subptr(cnt2, stride); 3929 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3930 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3931 lea(str1, Address(str1, result, scale)); 3932 lea(str2, Address(str2, result, scale)); 3933 } else { 3934 lea(str1, Address(str1, result, scale1)); 3935 lea(str2, Address(str2, result, scale2)); 3936 } 3937 negptr(cnt2); 3938 jmpb(WHILE_HEAD_LABEL); 3939 3940 bind(COMPARE_SMALL_STR); 3941 } else if (UseSSE42Intrinsics) { 3942 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3943 int pcmpmask = 0x19; 3944 // Setup to compare 8-char (16-byte) vectors, 3945 // start from first character again because it has aligned address. 
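  // Addressing idiom used below: str1/str2 are advanced to the end of the region
  // being compared and 'result' holds a negative element index, so each load is
  // Address(strX, result, scale) while result counts up toward zero and cnt2
  // tracks the vectors that remain.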
3946 movl(result, cnt2); 3947 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3948 if (ae == StrIntrinsicNode::LL) { 3949 pcmpmask &= ~0x01; 3950 } 3951 jcc(Assembler::zero, COMPARE_TAIL); 3952 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3953 lea(str1, Address(str1, result, scale)); 3954 lea(str2, Address(str2, result, scale)); 3955 } else { 3956 lea(str1, Address(str1, result, scale1)); 3957 lea(str2, Address(str2, result, scale2)); 3958 } 3959 negptr(result); 3960 3961 // pcmpestri 3962 // inputs: 3963 // vec1- substring 3964 // rax - negative string length (elements count) 3965 // mem - scanned string 3966 // rdx - string length (elements count) 3967 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3968 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3969 // outputs: 3970 // rcx - first mismatched element index 3971 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3972 3973 bind(COMPARE_WIDE_VECTORS); 3974 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3975 movdqu(vec1, Address(str1, result, scale)); 3976 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3977 } else { 3978 pmovzxbw(vec1, Address(str1, result, scale1)); 3979 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3980 } 3981 // After pcmpestri cnt1(rcx) contains mismatched element index 3982 3983 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3984 addptr(result, stride); 3985 subptr(cnt2, stride); 3986 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3987 3988 // compare wide vectors tail 3989 testptr(result, result); 3990 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3991 3992 movl(cnt2, stride); 3993 movl(result, stride); 3994 negptr(result); 3995 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3996 movdqu(vec1, Address(str1, result, scale)); 3997 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3998 } else { 3999 pmovzxbw(vec1, Address(str1, result, scale1)); 4000 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4001 } 4002 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4003 4004 // Mismatched characters in the vectors 4005 bind(VECTOR_NOT_EQUAL); 4006 addptr(cnt1, result); 4007 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4008 subl(result, cnt2); 4009 jmpb(POP_LABEL); 4010 4011 bind(COMPARE_TAIL); // limit is zero 4012 movl(cnt2, result); 4013 // Fallthru to tail compare 4014 } 4015 // Shift str2 and str1 to the end of the arrays, negate min 4016 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4017 lea(str1, Address(str1, cnt2, scale)); 4018 lea(str2, Address(str2, cnt2, scale)); 4019 } else { 4020 lea(str1, Address(str1, cnt2, scale1)); 4021 lea(str2, Address(str2, cnt2, scale2)); 4022 } 4023 decrementl(cnt2); // first character was compared already 4024 negptr(cnt2); 4025 4026 // Compare the rest of the elements 4027 bind(WHILE_HEAD_LABEL); 4028 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4029 subl(result, cnt1); 4030 jccb(Assembler::notZero, POP_LABEL); 4031 increment(cnt2); 4032 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4033 4034 // Strings are equal up to min length. Return the length difference. 
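  // The value popped below is the len1 - len2 difference saved on entry (for UU
  // it is in bytes and is halved to chars), so a string that is a proper prefix
  // of the other yields the signed length difference as its result.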
4035 bind(LENGTH_DIFF_LABEL); 4036 pop(result); 4037 if (ae == StrIntrinsicNode::UU) { 4038 // Divide diff by 2 to get number of chars 4039 sarl(result, 1); 4040 } 4041 jmpb(DONE_LABEL); 4042 4043 #ifdef _LP64 4044 if (VM_Version::supports_avx512vlbw()) { 4045 4046 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4047 4048 kmovql(cnt1, mask); 4049 notq(cnt1); 4050 bsfq(cnt2, cnt1); 4051 if (ae != StrIntrinsicNode::LL) { 4052 // Divide diff by 2 to get number of chars 4053 sarl(cnt2, 1); 4054 } 4055 addq(result, cnt2); 4056 if (ae == StrIntrinsicNode::LL) { 4057 load_unsigned_byte(cnt1, Address(str2, result)); 4058 load_unsigned_byte(result, Address(str1, result)); 4059 } else if (ae == StrIntrinsicNode::UU) { 4060 load_unsigned_short(cnt1, Address(str2, result, scale)); 4061 load_unsigned_short(result, Address(str1, result, scale)); 4062 } else { 4063 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4064 load_unsigned_byte(result, Address(str1, result, scale1)); 4065 } 4066 subl(result, cnt1); 4067 jmpb(POP_LABEL); 4068 }//if (VM_Version::supports_avx512vlbw()) 4069 #endif // _LP64 4070 4071 // Discard the stored length difference 4072 bind(POP_LABEL); 4073 pop(cnt1); 4074 4075 // That's it 4076 bind(DONE_LABEL); 4077 if(ae == StrIntrinsicNode::UL) { 4078 negl(result); 4079 } 4080 4081 } 4082 4083 // Search for Non-ASCII character (Negative byte value) in a byte array, 4084 // return the index of the first such character, otherwise the length 4085 // of the array segment searched. 4086 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4087 // @IntrinsicCandidate 4088 // public static int countPositives(byte[] ba, int off, int len) { 4089 // for (int i = off; i < off + len; i++) { 4090 // if (ba[i] < 0) { 4091 // return i - off; 4092 // } 4093 // } 4094 // return len; 4095 // } 4096 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4097 Register result, Register tmp1, 4098 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4099 // rsi: byte array 4100 // rcx: len 4101 // rax: result 4102 ShortBranchVerifier sbv(this); 4103 assert_different_registers(ary1, len, result, tmp1); 4104 assert_different_registers(vec1, vec2); 4105 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4106 4107 movl(result, len); // copy 4108 // len == 0 4109 testl(len, len); 4110 jcc(Assembler::zero, DONE); 4111 4112 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4113 VM_Version::supports_avx512vlbw() && 4114 VM_Version::supports_bmi2()) { 4115 4116 Label test_64_loop, test_tail, BREAK_LOOP; 4117 movl(tmp1, len); 4118 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4119 4120 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4121 andl(len, 0xffffffc0); // vector count (in chars) 4122 jccb(Assembler::zero, test_tail); 4123 4124 lea(ary1, Address(ary1, len, Address::times_1)); 4125 negptr(len); 4126 4127 bind(test_64_loop); 4128 // Check whether our 64 elements of size byte contain negatives 4129 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4130 kortestql(mask1, mask1); 4131 jcc(Assembler::notZero, BREAK_LOOP); 4132 4133 addptr(len, 64); 4134 jccb(Assembler::notZero, test_64_loop); 4135 4136 bind(test_tail); 4137 // bail out when there is nothing to be done 4138 testl(tmp1, -1); 4139 jcc(Assembler::zero, DONE); 4140 4141 4142 // check the tail for absense of negatives 4143 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4144 #ifdef _LP64 4145 { 4146 
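      // Builds ~(~0 << tmp1), i.e. a mask with the low tmp1 bits set. Example:
      // tmp1 == 13 remaining bytes -> mask2 == 0x1FFF, so the masked compare
      // below only considers the 13 tail byte lanes.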
Register tmp3_aliased = len; 4147 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4148 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4149 notq(tmp3_aliased); 4150 kmovql(mask2, tmp3_aliased); 4151 } 4152 #else 4153 Label k_init; 4154 jmp(k_init); 4155 4156 // We could not read 64-bits from a general purpose register thus we move 4157 // data required to compose 64 1's to the instruction stream 4158 // We emit 64 byte wide series of elements from 0..63 which later on would 4159 // be used as a compare targets with tail count contained in tmp1 register. 4160 // Result would be a k register having tmp1 consecutive number or 1 4161 // counting from least significant bit. 4162 address tmp = pc(); 4163 emit_int64(0x0706050403020100); 4164 emit_int64(0x0F0E0D0C0B0A0908); 4165 emit_int64(0x1716151413121110); 4166 emit_int64(0x1F1E1D1C1B1A1918); 4167 emit_int64(0x2726252423222120); 4168 emit_int64(0x2F2E2D2C2B2A2928); 4169 emit_int64(0x3736353433323130); 4170 emit_int64(0x3F3E3D3C3B3A3938); 4171 4172 bind(k_init); 4173 lea(len, InternalAddress(tmp)); 4174 // create mask to test for negative byte inside a vector 4175 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4176 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4177 4178 #endif 4179 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4180 ktestq(mask1, mask2); 4181 jcc(Assembler::zero, DONE); 4182 4183 // do a full check for negative registers in the tail 4184 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4185 // ary1 already pointing to the right place 4186 jmpb(TAIL_START); 4187 4188 bind(BREAK_LOOP); 4189 // At least one byte in the last 64 byte block was negative. 4190 // Set up to look at the last 64 bytes as if they were a tail 4191 lea(ary1, Address(ary1, len, Address::times_1)); 4192 addptr(result, len); 4193 // Ignore the very last byte: if all others are positive, 4194 // it must be negative, so we can skip right to the 2+1 byte 4195 // end comparison at this point 4196 orl(result, 63); 4197 movl(len, 63); 4198 // Fallthru to tail compare 4199 } else { 4200 4201 if (UseAVX >= 2 && UseSSE >= 2) { 4202 // With AVX2, use 32-byte vector compare 4203 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4204 4205 // Compare 32-byte vectors 4206 testl(len, 0xffffffe0); // vector count (in bytes) 4207 jccb(Assembler::zero, TAIL_START); 4208 4209 andl(len, 0xffffffe0); 4210 lea(ary1, Address(ary1, len, Address::times_1)); 4211 negptr(len); 4212 4213 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4214 movdl(vec2, tmp1); 4215 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4216 4217 bind(COMPARE_WIDE_VECTORS); 4218 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4219 vptest(vec1, vec2); 4220 jccb(Assembler::notZero, BREAK_LOOP); 4221 addptr(len, 32); 4222 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4223 4224 testl(result, 0x0000001f); // any bytes remaining? 4225 jcc(Assembler::zero, DONE); 4226 4227 // Quick test using the already prepared vector mask 4228 movl(len, result); 4229 andl(len, 0x0000001f); 4230 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4231 vptest(vec1, vec2); 4232 jcc(Assembler::zero, DONE); 4233 // There are zeros, jump to the tail to determine exactly where 4234 jmpb(TAIL_START); 4235 4236 bind(BREAK_LOOP); 4237 // At least one byte in the last 32-byte vector is negative. 
4238 // Set up to look at the last 32 bytes as if they were a tail 4239 lea(ary1, Address(ary1, len, Address::times_1)); 4240 addptr(result, len); 4241 // Ignore the very last byte: if all others are positive, 4242 // it must be negative, so we can skip right to the 2+1 byte 4243 // end comparison at this point 4244 orl(result, 31); 4245 movl(len, 31); 4246 // Fallthru to tail compare 4247 } else if (UseSSE42Intrinsics) { 4248 // With SSE4.2, use double quad vector compare 4249 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4250 4251 // Compare 16-byte vectors 4252 testl(len, 0xfffffff0); // vector count (in bytes) 4253 jcc(Assembler::zero, TAIL_START); 4254 4255 andl(len, 0xfffffff0); 4256 lea(ary1, Address(ary1, len, Address::times_1)); 4257 negptr(len); 4258 4259 movl(tmp1, 0x80808080); 4260 movdl(vec2, tmp1); 4261 pshufd(vec2, vec2, 0); 4262 4263 bind(COMPARE_WIDE_VECTORS); 4264 movdqu(vec1, Address(ary1, len, Address::times_1)); 4265 ptest(vec1, vec2); 4266 jccb(Assembler::notZero, BREAK_LOOP); 4267 addptr(len, 16); 4268 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4269 4270 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4271 jcc(Assembler::zero, DONE); 4272 4273 // Quick test using the already prepared vector mask 4274 movl(len, result); 4275 andl(len, 0x0000000f); // tail count (in bytes) 4276 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4277 ptest(vec1, vec2); 4278 jcc(Assembler::zero, DONE); 4279 jmpb(TAIL_START); 4280 4281 bind(BREAK_LOOP); 4282 // At least one byte in the last 16-byte vector is negative. 4283 // Set up and look at the last 16 bytes as if they were a tail 4284 lea(ary1, Address(ary1, len, Address::times_1)); 4285 addptr(result, len); 4286 // Ignore the very last byte: if all others are positive, 4287 // it must be negative, so we can skip right to the 2+1 byte 4288 // end comparison at this point 4289 orl(result, 15); 4290 movl(len, 15); 4291 // Fallthru to tail compare 4292 } 4293 } 4294 4295 bind(TAIL_START); 4296 // Compare 4-byte vectors 4297 andl(len, 0xfffffffc); // vector count (in bytes) 4298 jccb(Assembler::zero, COMPARE_CHAR); 4299 4300 lea(ary1, Address(ary1, len, Address::times_1)); 4301 negptr(len); 4302 4303 bind(COMPARE_VECTORS); 4304 movl(tmp1, Address(ary1, len, Address::times_1)); 4305 andl(tmp1, 0x80808080); 4306 jccb(Assembler::notZero, TAIL_ADJUST); 4307 addptr(len, 4); 4308 jccb(Assembler::notZero, COMPARE_VECTORS); 4309 4310 // Compare trailing char (final 2-3 bytes), if any 4311 bind(COMPARE_CHAR); 4312 4313 testl(result, 0x2); // tail char 4314 jccb(Assembler::zero, COMPARE_BYTE); 4315 load_unsigned_short(tmp1, Address(ary1, 0)); 4316 andl(tmp1, 0x00008080); 4317 jccb(Assembler::notZero, CHAR_ADJUST); 4318 lea(ary1, Address(ary1, 2)); 4319 4320 bind(COMPARE_BYTE); 4321 testl(result, 0x1); // tail byte 4322 jccb(Assembler::zero, DONE); 4323 load_unsigned_byte(tmp1, Address(ary1, 0)); 4324 testl(tmp1, 0x00000080); 4325 jccb(Assembler::zero, DONE); 4326 subptr(result, 1); 4327 jmpb(DONE); 4328 4329 bind(TAIL_ADJUST); 4330 // there are negative bits in the last 4 byte block. 4331 // Adjust result and check the next three bytes 4332 addptr(result, len); 4333 orl(result, 3); 4334 lea(ary1, Address(ary1, len, Address::times_1)); 4335 jmpb(COMPARE_CHAR); 4336 4337 bind(CHAR_ADJUST); 4338 // We are looking at a char + optional byte tail, and found that one 4339 // of the bytes in the char is negative. Adjust the result, check the 4340 // first byte and readjust if needed. 
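  // tmp1 holds the sign bits (masked with 0x8080) of the two bytes loaded at
  // COMPARE_CHAR; the load is little-endian, so bit 7 of tmp1 corresponds to the
  // byte at the lower address. The adjustments above arrange that clearing the
  // low two bits of result yields the index of that first byte: if it is the
  // negative one we are done, otherwise the negative byte is the second of the
  // pair, hence the +1 below.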
4341 andl(result, 0xfffffffc); 4342 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4343 jccb(Assembler::notZero, DONE); 4344 addptr(result, 1); 4345 4346 // That's it 4347 bind(DONE); 4348 if (UseAVX >= 2 && UseSSE >= 2) { 4349 // clean upper bits of YMM registers 4350 vpxor(vec1, vec1); 4351 vpxor(vec2, vec2); 4352 } 4353 } 4354 4355 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4356 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4357 Register limit, Register result, Register chr, 4358 XMMRegister vec1, XMMRegister vec2, bool is_char, 4359 KRegister mask, bool expand_ary2) { 4360 // for expand_ary2, limit is the (smaller) size of the second array. 4361 ShortBranchVerifier sbv(this); 4362 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4363 4364 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4365 "Expansion only implemented for AVX2"); 4366 4367 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4368 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4369 4370 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4371 int scaleIncr = expand_ary2 ? 8 : 16; 4372 4373 if (is_array_equ) { 4374 // Check the input args 4375 cmpoop(ary1, ary2); 4376 jcc(Assembler::equal, TRUE_LABEL); 4377 4378 // Need additional checks for arrays_equals. 4379 testptr(ary1, ary1); 4380 jcc(Assembler::zero, FALSE_LABEL); 4381 testptr(ary2, ary2); 4382 jcc(Assembler::zero, FALSE_LABEL); 4383 4384 // Check the lengths 4385 movl(limit, Address(ary1, length_offset)); 4386 cmpl(limit, Address(ary2, length_offset)); 4387 jcc(Assembler::notEqual, FALSE_LABEL); 4388 } 4389 4390 // count == 0 4391 testl(limit, limit); 4392 jcc(Assembler::zero, TRUE_LABEL); 4393 4394 if (is_array_equ) { 4395 // Load array address 4396 lea(ary1, Address(ary1, base_offset)); 4397 lea(ary2, Address(ary2, base_offset)); 4398 } 4399 4400 if (is_array_equ && is_char) { 4401 // arrays_equals when used for char[]. 
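    // The element count is converted to a byte count because the vector loops
    // below index in bytes. In the expand_ary2 case, limit is the element count
    // of the narrower ary2; ary1 is indexed with times_2 and ary2's bytes are
    // zero-extended to 16-bit values (vpmovzxbw) before each compare.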
4402 shll(limit, 1); // byte count != 0 4403 } 4404 movl(result, limit); // copy 4405 4406 if (UseAVX >= 2) { 4407 // With AVX2, use 32-byte vector compare 4408 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4409 4410 // Compare 32-byte vectors 4411 if (expand_ary2) { 4412 andl(result, 0x0000000f); // tail count (in bytes) 4413 andl(limit, 0xfffffff0); // vector count (in bytes) 4414 jcc(Assembler::zero, COMPARE_TAIL); 4415 } else { 4416 andl(result, 0x0000001f); // tail count (in bytes) 4417 andl(limit, 0xffffffe0); // vector count (in bytes) 4418 jcc(Assembler::zero, COMPARE_TAIL_16); 4419 } 4420 4421 lea(ary1, Address(ary1, limit, scaleFactor)); 4422 lea(ary2, Address(ary2, limit, Address::times_1)); 4423 negptr(limit); 4424 4425 #ifdef _LP64 4426 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4427 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4428 4429 cmpl(limit, -64); 4430 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4431 4432 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4433 4434 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4435 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4436 kortestql(mask, mask); 4437 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4438 addptr(limit, 64); // update since we already compared at this addr 4439 cmpl(limit, -64); 4440 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4441 4442 // At this point we may still need to compare -limit+result bytes. 4443 // We could execute the next two instruction and just continue via non-wide path: 4444 // cmpl(limit, 0); 4445 // jcc(Assembler::equal, COMPARE_TAIL); // true 4446 // But since we stopped at the points ary{1,2}+limit which are 4447 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4448 // (|limit| <= 32 and result < 32), 4449 // we may just compare the last 64 bytes. 
4450 // 4451 addptr(result, -64); // it is safe, bc we just came from this area 4452 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4453 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4454 kortestql(mask, mask); 4455 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4456 4457 jmp(TRUE_LABEL); 4458 4459 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4460 4461 }//if (VM_Version::supports_avx512vlbw()) 4462 #endif //_LP64 4463 bind(COMPARE_WIDE_VECTORS); 4464 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4465 if (expand_ary2) { 4466 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4467 } else { 4468 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4469 } 4470 vpxor(vec1, vec2); 4471 4472 vptest(vec1, vec1); 4473 jcc(Assembler::notZero, FALSE_LABEL); 4474 addptr(limit, scaleIncr * 2); 4475 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4476 4477 testl(result, result); 4478 jcc(Assembler::zero, TRUE_LABEL); 4479 4480 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4481 if (expand_ary2) { 4482 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4483 } else { 4484 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4485 } 4486 vpxor(vec1, vec2); 4487 4488 vptest(vec1, vec1); 4489 jcc(Assembler::notZero, FALSE_LABEL); 4490 jmp(TRUE_LABEL); 4491 4492 bind(COMPARE_TAIL_16); // limit is zero 4493 movl(limit, result); 4494 4495 // Compare 16-byte chunks 4496 andl(result, 0x0000000f); // tail count (in bytes) 4497 andl(limit, 0xfffffff0); // vector count (in bytes) 4498 jcc(Assembler::zero, COMPARE_TAIL); 4499 4500 lea(ary1, Address(ary1, limit, scaleFactor)); 4501 lea(ary2, Address(ary2, limit, Address::times_1)); 4502 negptr(limit); 4503 4504 bind(COMPARE_WIDE_VECTORS_16); 4505 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4506 if (expand_ary2) { 4507 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4508 } else { 4509 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4510 } 4511 pxor(vec1, vec2); 4512 4513 ptest(vec1, vec1); 4514 jcc(Assembler::notZero, FALSE_LABEL); 4515 addptr(limit, scaleIncr); 4516 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4517 4518 bind(COMPARE_TAIL); // limit is zero 4519 movl(limit, result); 4520 // Fallthru to tail compare 4521 } else if (UseSSE42Intrinsics) { 4522 // With SSE4.2, use double quad vector compare 4523 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4524 4525 // Compare 16-byte vectors 4526 andl(result, 0x0000000f); // tail count (in bytes) 4527 andl(limit, 0xfffffff0); // vector count (in bytes) 4528 jcc(Assembler::zero, COMPARE_TAIL); 4529 4530 lea(ary1, Address(ary1, limit, Address::times_1)); 4531 lea(ary2, Address(ary2, limit, Address::times_1)); 4532 negptr(limit); 4533 4534 bind(COMPARE_WIDE_VECTORS); 4535 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4536 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4537 pxor(vec1, vec2); 4538 4539 ptest(vec1, vec1); 4540 jcc(Assembler::notZero, FALSE_LABEL); 4541 addptr(limit, 16); 4542 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4543 4544 testl(result, result); 4545 jcc(Assembler::zero, TRUE_LABEL); 4546 4547 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4548 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4549 pxor(vec1, vec2); 4550 4551 ptest(vec1, vec1); 4552 jccb(Assembler::notZero, FALSE_LABEL); 4553 jmpb(TRUE_LABEL); 4554 4555 bind(COMPARE_TAIL); // limit is zero 4556 
movl(limit, result); 4557 // Fallthru to tail compare 4558 } 4559 4560 // Compare 4-byte vectors 4561 if (expand_ary2) { 4562 testl(result, result); 4563 jccb(Assembler::zero, TRUE_LABEL); 4564 } else { 4565 andl(limit, 0xfffffffc); // vector count (in bytes) 4566 jccb(Assembler::zero, COMPARE_CHAR); 4567 } 4568 4569 lea(ary1, Address(ary1, limit, scaleFactor)); 4570 lea(ary2, Address(ary2, limit, Address::times_1)); 4571 negptr(limit); 4572 4573 bind(COMPARE_VECTORS); 4574 if (expand_ary2) { 4575 // There are no "vector" operations for bytes to shorts 4576 movzbl(chr, Address(ary2, limit, Address::times_1)); 4577 cmpw(Address(ary1, limit, Address::times_2), chr); 4578 jccb(Assembler::notEqual, FALSE_LABEL); 4579 addptr(limit, 1); 4580 jcc(Assembler::notZero, COMPARE_VECTORS); 4581 jmp(TRUE_LABEL); 4582 } else { 4583 movl(chr, Address(ary1, limit, Address::times_1)); 4584 cmpl(chr, Address(ary2, limit, Address::times_1)); 4585 jccb(Assembler::notEqual, FALSE_LABEL); 4586 addptr(limit, 4); 4587 jcc(Assembler::notZero, COMPARE_VECTORS); 4588 } 4589 4590 // Compare trailing char (final 2 bytes), if any 4591 bind(COMPARE_CHAR); 4592 testl(result, 0x2); // tail char 4593 jccb(Assembler::zero, COMPARE_BYTE); 4594 load_unsigned_short(chr, Address(ary1, 0)); 4595 load_unsigned_short(limit, Address(ary2, 0)); 4596 cmpl(chr, limit); 4597 jccb(Assembler::notEqual, FALSE_LABEL); 4598 4599 if (is_array_equ && is_char) { 4600 bind(COMPARE_BYTE); 4601 } else { 4602 lea(ary1, Address(ary1, 2)); 4603 lea(ary2, Address(ary2, 2)); 4604 4605 bind(COMPARE_BYTE); 4606 testl(result, 0x1); // tail byte 4607 jccb(Assembler::zero, TRUE_LABEL); 4608 load_unsigned_byte(chr, Address(ary1, 0)); 4609 load_unsigned_byte(limit, Address(ary2, 0)); 4610 cmpl(chr, limit); 4611 jccb(Assembler::notEqual, FALSE_LABEL); 4612 } 4613 bind(TRUE_LABEL); 4614 movl(result, 1); // return true 4615 jmpb(DONE); 4616 4617 bind(FALSE_LABEL); 4618 xorl(result, result); // return false 4619 4620 // That's it 4621 bind(DONE); 4622 if (UseAVX >= 2) { 4623 // clean upper bits of YMM registers 4624 vpxor(vec1, vec1); 4625 vpxor(vec2, vec2); 4626 } 4627 } 4628 4629 #ifdef _LP64 4630 4631 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4632 #define __ masm. 
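  // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000, or
  // 0x8000000000000000 for 64-bit results) when the input is NaN or out of range.
  // convertF2I below compares against that sentinel and only then enters this
  // out-of-line stub, which spills the source value to the stack and calls the
  // matching StubRoutines fixup routine; the corrected, Java-specified result
  // (0 for NaN, MIN_VALUE/MAX_VALUE on overflow) is then popped into dst.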
4633 Register dst = stub.data<0>(); 4634 XMMRegister src = stub.data<1>(); 4635 address target = stub.data<2>(); 4636 __ bind(stub.entry()); 4637 __ subptr(rsp, 8); 4638 __ movdbl(Address(rsp), src); 4639 __ call(RuntimeAddress(target)); 4640 __ pop(dst); 4641 __ jmp(stub.continuation()); 4642 #undef __ 4643 } 4644 4645 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4646 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4647 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4648 4649 address slowpath_target; 4650 if (dst_bt == T_INT) { 4651 if (src_bt == T_FLOAT) { 4652 cvttss2sil(dst, src); 4653 cmpl(dst, 0x80000000); 4654 slowpath_target = StubRoutines::x86::f2i_fixup(); 4655 } else { 4656 cvttsd2sil(dst, src); 4657 cmpl(dst, 0x80000000); 4658 slowpath_target = StubRoutines::x86::d2i_fixup(); 4659 } 4660 } else { 4661 if (src_bt == T_FLOAT) { 4662 cvttss2siq(dst, src); 4663 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4664 slowpath_target = StubRoutines::x86::f2l_fixup(); 4665 } else { 4666 cvttsd2siq(dst, src); 4667 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4668 slowpath_target = StubRoutines::x86::d2l_fixup(); 4669 } 4670 } 4671 4672 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4673 jcc(Assembler::equal, stub->entry()); 4674 bind(stub->continuation()); 4675 } 4676 4677 #endif // _LP64 4678 4679 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4680 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4681 switch(ideal_opc) { 4682 case Op_LShiftVS: 4683 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4684 case Op_LShiftVI: 4685 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4686 case Op_LShiftVL: 4687 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4688 case Op_RShiftVS: 4689 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4690 case Op_RShiftVI: 4691 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4692 case Op_RShiftVL: 4693 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4694 case Op_URShiftVS: 4695 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4696 case Op_URShiftVI: 4697 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4698 case Op_URShiftVL: 4699 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4700 case Op_RotateRightV: 4701 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4702 case Op_RotateLeftV: 4703 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4704 default: 4705 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4706 break; 4707 } 4708 } 4709 4710 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4711 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4712 if (is_unsigned) { 4713 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4714 } else { 4715 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4716 } 4717 } 4718 4719 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4720 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4721 switch (elem_bt) { 4722 case T_BYTE: 4723 if (ideal_opc 
== Op_SaturatingAddV) { 4724 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4725 } else { 4726 assert(ideal_opc == Op_SaturatingSubV, ""); 4727 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4728 } 4729 break; 4730 case T_SHORT: 4731 if (ideal_opc == Op_SaturatingAddV) { 4732 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4733 } else { 4734 assert(ideal_opc == Op_SaturatingSubV, ""); 4735 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4736 } 4737 break; 4738 default: 4739 fatal("Unsupported type %s", type2name(elem_bt)); 4740 break; 4741 } 4742 } 4743 4744 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4745 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4746 switch (elem_bt) { 4747 case T_BYTE: 4748 if (ideal_opc == Op_SaturatingAddV) { 4749 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4750 } else { 4751 assert(ideal_opc == Op_SaturatingSubV, ""); 4752 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4753 } 4754 break; 4755 case T_SHORT: 4756 if (ideal_opc == Op_SaturatingAddV) { 4757 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4758 } else { 4759 assert(ideal_opc == Op_SaturatingSubV, ""); 4760 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4761 } 4762 break; 4763 default: 4764 fatal("Unsupported type %s", type2name(elem_bt)); 4765 break; 4766 } 4767 } 4768 4769 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4770 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4771 if (is_unsigned) { 4772 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4773 } else { 4774 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4775 } 4776 } 4777 4778 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4779 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4780 switch (elem_bt) { 4781 case T_BYTE: 4782 if (ideal_opc == Op_SaturatingAddV) { 4783 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4784 } else { 4785 assert(ideal_opc == Op_SaturatingSubV, ""); 4786 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4787 } 4788 break; 4789 case T_SHORT: 4790 if (ideal_opc == Op_SaturatingAddV) { 4791 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4792 } else { 4793 assert(ideal_opc == Op_SaturatingSubV, ""); 4794 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4795 } 4796 break; 4797 default: 4798 fatal("Unsupported type %s", type2name(elem_bt)); 4799 break; 4800 } 4801 } 4802 4803 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4804 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4805 switch (elem_bt) { 4806 case T_BYTE: 4807 if (ideal_opc == Op_SaturatingAddV) { 4808 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4809 } else { 4810 assert(ideal_opc == Op_SaturatingSubV, ""); 4811 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4812 } 4813 break; 4814 case T_SHORT: 4815 if (ideal_opc == Op_SaturatingAddV) { 4816 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4817 } else { 4818 assert(ideal_opc == Op_SaturatingSubV, ""); 4819 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4820 } 4821 break; 4822 default: 4823 fatal("Unsupported type %s", type2name(elem_bt)); 4824 break; 4825 } 4826 } 4827 4828 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4829 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4830 bool is_varshift) { 4831 switch (ideal_opc) { 4832 case Op_AddVB: 4833 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_AddVS: 4835 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4836 case Op_AddVI: 4837 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_AddVL: 4839 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_AddVF: 4841 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_AddVD: 4843 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_SubVB: 4845 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_SubVS: 4847 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_SubVI: 4849 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_SubVL: 4851 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_SubVF: 4853 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_SubVD: 4855 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_MulVS: 4857 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_MulVI: 4859 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_MulVL: 4861 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_MulVF: 4863 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_MulVD: 4865 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_DivVF: 4867 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_DivVD: 4869 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_SqrtVF: 4871 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_SqrtVD: 4873 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4874 case Op_AbsVB: 4875 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4876 case Op_AbsVS: 4877 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4878 case Op_AbsVI: 4879 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4880 case Op_AbsVL: 4881 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4882 case Op_FmaVF: 4883 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_FmaVD: 4885 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_VectorRearrange: 4887 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4888 case Op_LShiftVS: 4889 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4890 case Op_LShiftVI: 4891 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4892 case Op_LShiftVL: 4893 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4894 case Op_RShiftVS: 4895 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4896 case Op_RShiftVI: 4897 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4898 case Op_RShiftVL: 4899 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4900 case Op_URShiftVS: 4901 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4902 case Op_URShiftVI: 4903 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4904 case Op_URShiftVL: 4905 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4906 case Op_RotateLeftV: 4907 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4908 case Op_RotateRightV: 4909 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4910 case Op_MaxV: 4911 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_MinV: 4913 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_UMinV: 4915 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4916 case Op_UMaxV: 4917 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4918 case Op_XorV: 4919 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4920 case Op_OrV: 4921 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4922 case Op_AndV: 4923 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4924 default: 4925 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4926 break; 4927 } 4928 } 4929 4930 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4931 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4932 switch (ideal_opc) { 4933 case Op_AddVB: 4934 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4935 case Op_AddVS: 4936 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4937 case Op_AddVI: 4938 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4939 case Op_AddVL: 4940 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4941 case Op_AddVF: 4942 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4943 case Op_AddVD: 4944 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4945 case Op_SubVB: 4946 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4947 case Op_SubVS: 4948 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4949 case Op_SubVI: 4950 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4951 case Op_SubVL: 4952 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_SubVF: 4954 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_SubVD: 4956 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4957 case Op_MulVS: 4958 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_MulVI: 4960 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_MulVL: 4962 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_MulVF: 4964 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_MulVD: 4966 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_DivVF: 4968 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4969 case Op_DivVD: 4970 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4971 case Op_FmaVF: 4972 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4973 case Op_FmaVD: 4974 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4975 case Op_MaxV: 4976 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4977 case Op_MinV: 4978 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4979 case Op_UMaxV: 4980 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4981 case Op_UMinV: 4982 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4983 case Op_XorV: 4984 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4985 case Op_OrV: 4986 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4987 case Op_AndV: 4988 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4989 default: 4990 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4991 break; 4992 } 4993 } 4994 4995 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4996 KRegister src1, KRegister src2) { 4997 BasicType etype = T_ILLEGAL; 4998 switch(mask_len) { 4999 case 2: 5000 case 4: 5001 case 8: etype = T_BYTE; 
break; 5002 case 16: etype = T_SHORT; break; 5003 case 32: etype = T_INT; break; 5004 case 64: etype = T_LONG; break; 5005 default: fatal("Unsupported type"); break; 5006 } 5007 assert(etype != T_ILLEGAL, ""); 5008 switch(ideal_opc) { 5009 case Op_AndVMask: 5010 kand(etype, dst, src1, src2); break; 5011 case Op_OrVMask: 5012 kor(etype, dst, src1, src2); break; 5013 case Op_XorVMask: 5014 kxor(etype, dst, src1, src2); break; 5015 default: 5016 fatal("Unsupported masked operation"); break; 5017 } 5018 } 5019 5020 /* 5021 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5022 * If src is NaN, the result is 0. 5023 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5024 * the result is equal to the value of Integer.MIN_VALUE. 5025 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5026 * the result is equal to the value of Integer.MAX_VALUE. 5027 */ 5028 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5029 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5030 Register rscratch, AddressLiteral float_sign_flip, 5031 int vec_enc) { 5032 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5033 Label done; 5034 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5035 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5036 vptest(xtmp2, xtmp2, vec_enc); 5037 jccb(Assembler::equal, done); 5038 5039 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5040 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5041 5042 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5043 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5044 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5045 5046 // Recompute the mask for remaining special value. 5047 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5048 // Extract SRC values corresponding to TRUE mask lanes. 5049 vpand(xtmp4, xtmp2, src, vec_enc); 5050 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5051 // values are set. 
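// (Here xtmp2 is all-ones for the remaining special lanes and xtmp4 holds the raw
// src bits of those lanes, so the XOR below leaves ~src in xtmp3 for special lanes
// and zero elsewhere; its MSB is set exactly when the source lane is non-negative,
// which lets the final blend pick the Integer.MAX_VALUE pattern held in xtmp1.)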
5052 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5053 5054 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5055 bind(done); 5056 } 5057 5058 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5059 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5060 Register rscratch, AddressLiteral float_sign_flip, 5061 int vec_enc) { 5062 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5063 Label done; 5064 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5065 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5066 kortestwl(ktmp1, ktmp1); 5067 jccb(Assembler::equal, done); 5068 5069 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5070 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5071 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5072 5073 kxorwl(ktmp1, ktmp1, ktmp2); 5074 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5075 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5076 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5077 bind(done); 5078 } 5079 5080 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5081 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5082 Register rscratch, AddressLiteral double_sign_flip, 5083 int vec_enc) { 5084 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5085 5086 Label done; 5087 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5088 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5089 kortestwl(ktmp1, ktmp1); 5090 jccb(Assembler::equal, done); 5091 5092 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5093 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5094 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5095 5096 kxorwl(ktmp1, ktmp1, ktmp2); 5097 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5098 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5099 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5100 bind(done); 5101 } 5102 5103 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5104 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5105 Register rscratch, AddressLiteral float_sign_flip, 5106 int vec_enc) { 5107 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5108 Label done; 5109 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5110 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5111 kortestwl(ktmp1, ktmp1); 5112 jccb(Assembler::equal, done); 5113 5114 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5115 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5116 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5117 5118 kxorwl(ktmp1, ktmp1, ktmp2); 5119 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5120 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5121 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5122 bind(done); 5123 } 5124 5125 /* 5126 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5127 * If src is NaN, the result is 0. 5128 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5129 * the result is equal to the value of Long.MIN_VALUE. 5130 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5131 * the result is equal to the value of Long.MAX_VALUE. 
5132 */ 5133 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5134 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5135 Register rscratch, AddressLiteral double_sign_flip, 5136 int vec_enc) { 5137 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5138 5139 Label done; 5140 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5141 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5142 kortestwl(ktmp1, ktmp1); 5143 jccb(Assembler::equal, done); 5144 5145 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5146 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5147 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5148 5149 kxorwl(ktmp1, ktmp1, ktmp2); 5150 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5151 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5152 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5153 bind(done); 5154 } 5155 5156 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5157 XMMRegister xtmp, int index, int vec_enc) { 5158 assert(vec_enc < Assembler::AVX_512bit, ""); 5159 if (vec_enc == Assembler::AVX_256bit) { 5160 vextractf128_high(xtmp, src); 5161 vshufps(dst, src, xtmp, index, vec_enc); 5162 } else { 5163 vshufps(dst, src, zero, index, vec_enc); 5164 } 5165 } 5166 5167 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5168 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5169 AddressLiteral float_sign_flip, int src_vec_enc) { 5170 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5171 5172 Label done; 5173 // Compare the destination lanes with float_sign_flip 5174 // value to get mask for all special values. 5175 movdqu(xtmp1, float_sign_flip, rscratch); 5176 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5177 ptest(xtmp2, xtmp2); 5178 jccb(Assembler::equal, done); 5179 5180 // Flip float_sign_flip to get max integer value. 5181 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5182 pxor(xtmp1, xtmp4); 5183 5184 // Set destination lanes corresponding to unordered source lanes as zero. 5185 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5186 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5187 5188 // Shuffle mask vector and pack the lower double word from each quadword lane. 5189 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5190 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5191 5192 // Recompute the mask for remaining special value. 5193 pxor(xtmp2, xtmp3); 5194 // Extract mask corresponding to non-negative source lanes. 5195 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5196 5197 // Shuffle mask vector and pack the lower double word from each quadword lane. 5198 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5199 pand(xtmp3, xtmp2); 5200 5201 // Replace destination lanes holding the special value (0x80000000) with max int 5202 // if the corresponding source lane holds a +ve value.
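// (At this point xtmp1 holds 0x7FFFFFFF, i.e. Integer.MAX_VALUE, in every lane and
// xtmp3 has its MSB set only for those lanes.)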
5203 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5204 bind(done); 5205 } 5206 5207 5208 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5209 XMMRegister xtmp, Register rscratch, int vec_enc) { 5210 switch(to_elem_bt) { 5211 case T_SHORT: 5212 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5213 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5214 vpackusdw(dst, dst, zero, vec_enc); 5215 if (vec_enc == Assembler::AVX_256bit) { 5216 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5217 } 5218 break; 5219 case T_BYTE: 5220 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5221 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5222 vpackusdw(dst, dst, zero, vec_enc); 5223 if (vec_enc == Assembler::AVX_256bit) { 5224 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5225 } 5226 vpackuswb(dst, dst, zero, vec_enc); 5227 break; 5228 default: assert(false, "%s", type2name(to_elem_bt)); 5229 } 5230 } 5231 5232 /* 5233 * Algorithm for vector D2L and F2I conversions:- 5234 * a) Perform vector D2L/F2I cast. 5235 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5236 * It signifies that source value could be any of the special floating point 5237 * values(NaN,-Inf,Inf,Max,-Min). 5238 * c) Set destination to zero if source is NaN value. 5239 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5240 */ 5241 5242 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5243 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5244 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5245 int to_elem_sz = type2aelembytes(to_elem_bt); 5246 assert(to_elem_sz <= 4, ""); 5247 vcvttps2dq(dst, src, vec_enc); 5248 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5249 if (to_elem_sz < 4) { 5250 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5251 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5252 } 5253 } 5254 5255 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5256 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5257 Register rscratch, int vec_enc) { 5258 int to_elem_sz = type2aelembytes(to_elem_bt); 5259 assert(to_elem_sz <= 4, ""); 5260 vcvttps2dq(dst, src, vec_enc); 5261 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5262 switch(to_elem_bt) { 5263 case T_INT: 5264 break; 5265 case T_SHORT: 5266 evpmovdw(dst, dst, vec_enc); 5267 break; 5268 case T_BYTE: 5269 evpmovdb(dst, dst, vec_enc); 5270 break; 5271 default: assert(false, "%s", type2name(to_elem_bt)); 5272 } 5273 } 5274 5275 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5276 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5277 Register rscratch, int vec_enc) { 5278 evcvttps2qq(dst, src, vec_enc); 5279 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5280 } 5281 5282 // Handling for downcasting from double to integer or sub-word types on AVX2. 5283 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5284 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5285 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5286 int to_elem_sz = type2aelembytes(to_elem_bt); 5287 assert(to_elem_sz < 8, ""); 5288 vcvttpd2dq(dst, src, vec_enc); 5289 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5290 float_sign_flip, vec_enc); 5291 if (to_elem_sz < 4) { 5292 // xtmp4 holds all zero lanes. 5293 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5294 } 5295 } 5296 5297 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5298 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5299 KRegister ktmp2, AddressLiteral sign_flip, 5300 Register rscratch, int vec_enc) { 5301 if (VM_Version::supports_avx512dq()) { 5302 evcvttpd2qq(dst, src, vec_enc); 5303 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5304 switch(to_elem_bt) { 5305 case T_LONG: 5306 break; 5307 case T_INT: 5308 evpmovsqd(dst, dst, vec_enc); 5309 break; 5310 case T_SHORT: 5311 evpmovsqd(dst, dst, vec_enc); 5312 evpmovdw(dst, dst, vec_enc); 5313 break; 5314 case T_BYTE: 5315 evpmovsqd(dst, dst, vec_enc); 5316 evpmovdb(dst, dst, vec_enc); 5317 break; 5318 default: assert(false, "%s", type2name(to_elem_bt)); 5319 } 5320 } else { 5321 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5322 vcvttpd2dq(dst, src, vec_enc); 5323 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5324 switch(to_elem_bt) { 5325 case T_INT: 5326 break; 5327 case T_SHORT: 5328 evpmovdw(dst, dst, vec_enc); 5329 break; 5330 case T_BYTE: 5331 evpmovdb(dst, dst, vec_enc); 5332 break; 5333 default: assert(false, "%s", type2name(to_elem_bt)); 5334 } 5335 } 5336 } 5337 5338 #ifdef _LP64 5339 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5340 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5341 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5342 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5343 // and re-instantiate original MXCSR.RC mode after that. 5344 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5345 5346 mov64(tmp, julong_cast(0.5L)); 5347 evpbroadcastq(xtmp1, tmp, vec_enc); 5348 vaddpd(xtmp1, src , xtmp1, vec_enc); 5349 evcvtpd2qq(dst, xtmp1, vec_enc); 5350 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5351 double_sign_flip, vec_enc);; 5352 5353 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5354 } 5355 5356 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5357 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5358 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5359 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5360 // and re-instantiate original MXCSR.RC mode after that. 
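// (Worked example of why the rounding mode matters here: for src = -1.7f the sum
// below is -1.2f; converting it while rounding toward negative infinity yields -2,
// which matches Math.round(-1.7f), whereas the default round-to-nearest mode would
// produce -1.)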
5361 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5362 5363 movl(tmp, jint_cast(0.5)); 5364 movq(xtmp1, tmp); 5365 vbroadcastss(xtmp1, xtmp1, vec_enc); 5366 vaddps(xtmp1, src , xtmp1, vec_enc); 5367 vcvtps2dq(dst, xtmp1, vec_enc); 5368 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5369 float_sign_flip, vec_enc); 5370 5371 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5372 } 5373 5374 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5375 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5376 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5377 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5378 // and re-instantiate original MXCSR.RC mode after that. 5379 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5380 5381 movl(tmp, jint_cast(0.5)); 5382 movq(xtmp1, tmp); 5383 vbroadcastss(xtmp1, xtmp1, vec_enc); 5384 vaddps(xtmp1, src , xtmp1, vec_enc); 5385 vcvtps2dq(dst, xtmp1, vec_enc); 5386 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5387 5388 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5389 } 5390 #endif // _LP64 5391 5392 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5393 BasicType from_elem_bt, BasicType to_elem_bt) { 5394 switch (from_elem_bt) { 5395 case T_BYTE: 5396 switch (to_elem_bt) { 5397 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5398 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5399 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5400 default: ShouldNotReachHere(); 5401 } 5402 break; 5403 case T_SHORT: 5404 switch (to_elem_bt) { 5405 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5406 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5407 default: ShouldNotReachHere(); 5408 } 5409 break; 5410 case T_INT: 5411 assert(to_elem_bt == T_LONG, ""); 5412 vpmovzxdq(dst, src, vlen_enc); 5413 break; 5414 default: 5415 ShouldNotReachHere(); 5416 } 5417 } 5418 5419 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5420 BasicType from_elem_bt, BasicType to_elem_bt) { 5421 switch (from_elem_bt) { 5422 case T_BYTE: 5423 switch (to_elem_bt) { 5424 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5425 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5426 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5427 default: ShouldNotReachHere(); 5428 } 5429 break; 5430 case T_SHORT: 5431 switch (to_elem_bt) { 5432 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5433 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5434 default: ShouldNotReachHere(); 5435 } 5436 break; 5437 case T_INT: 5438 assert(to_elem_bt == T_LONG, ""); 5439 vpmovsxdq(dst, src, vlen_enc); 5440 break; 5441 default: 5442 ShouldNotReachHere(); 5443 } 5444 } 5445 5446 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5447 BasicType dst_bt, BasicType src_bt, int vlen) { 5448 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5449 assert(vlen_enc != AVX_512bit, ""); 5450 5451 int dst_bt_size = type2aelembytes(dst_bt); 5452 int src_bt_size = type2aelembytes(src_bt); 5453 if (dst_bt_size > src_bt_size) { 5454 switch (dst_bt_size / src_bt_size) { 5455 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5456 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5457 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5458 default: ShouldNotReachHere(); 5459 } 5460 } else { 5461 assert(dst_bt_size < src_bt_size, ""); 5462 switch (src_bt_size / dst_bt_size) { 5463 case 2: { 5464 if (vlen_enc == AVX_128bit) { 5465 vpacksswb(dst, src, src, vlen_enc); 5466 } else { 5467 vpacksswb(dst, src, src, vlen_enc); 5468 vpermq(dst, dst, 0x08, vlen_enc); 5469 } 5470 break; 5471 } 5472 case 4: { 5473 if (vlen_enc == AVX_128bit) { 5474 vpackssdw(dst, src, src, vlen_enc); 5475 vpacksswb(dst, dst, dst, vlen_enc); 5476 } else { 5477 vpackssdw(dst, src, src, vlen_enc); 5478 vpermq(dst, dst, 0x08, vlen_enc); 5479 vpacksswb(dst, dst, dst, AVX_128bit); 5480 } 5481 break; 5482 } 5483 case 8: { 5484 if (vlen_enc == AVX_128bit) { 5485 vpshufd(dst, src, 0x08, vlen_enc); 5486 vpackssdw(dst, dst, dst, vlen_enc); 5487 vpacksswb(dst, dst, dst, vlen_enc); 5488 } else { 5489 vpshufd(dst, src, 0x08, vlen_enc); 5490 vpermq(dst, dst, 0x08, vlen_enc); 5491 vpackssdw(dst, dst, dst, AVX_128bit); 5492 vpacksswb(dst, dst, dst, AVX_128bit); 5493 } 5494 break; 5495 } 5496 default: ShouldNotReachHere(); 5497 } 5498 } 5499 } 5500 5501 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5502 bool merge, BasicType bt, int vlen_enc) { 5503 if (bt == T_INT) { 5504 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5505 } else { 5506 assert(bt == T_LONG, ""); 5507 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5508 } 5509 } 5510 5511 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5512 bool merge, BasicType bt, int vlen_enc) { 5513 if (bt == T_INT) { 5514 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5515 } else { 5516 assert(bt == T_LONG, ""); 5517 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5518 } 5519 } 5520 5521 #ifdef _LP64 5522 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5523 Register rtmp2, XMMRegister xtmp, int mask_len, 5524 int vec_enc) { 5525 int index = 0; 5526 int vindex = 0; 5527 mov64(rtmp1, 0x0101010101010101L); 5528 pdepq(rtmp1, src, rtmp1); 5529 if (mask_len > 8) { 5530 movq(rtmp2, src); 5531 vpxor(xtmp, xtmp, xtmp, vec_enc); 5532 movq(xtmp, rtmp1); 5533 } 5534 movq(dst, rtmp1); 5535 5536 mask_len -= 8; 5537 while (mask_len > 0) { 5538 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5539 index++; 5540 if ((index % 2) == 0) { 5541 pxor(xtmp, xtmp); 5542 } 5543 mov64(rtmp1, 0x0101010101010101L); 5544 shrq(rtmp2, 8); 5545 pdepq(rtmp1, rtmp2, rtmp1); 5546 pinsrq(xtmp, rtmp1, index % 2); 5547 vindex = index / 2; 5548 if (vindex) { 5549 // Write entire 16 byte vector when both 64 bit 5550 // lanes are update to save redundant instructions. 
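// (xtmp is flushed to dst only once its upper 64-bit half, i.e. index % 2 == 1,
// has been filled; vindex = index / 2 selects the 128-bit lane of dst receiving it.)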
5551 if (index % 2) { 5552 vinsertf128(dst, dst, xtmp, vindex); 5553 } 5554 } else { 5555 vmovdqu(dst, xtmp); 5556 } 5557 mask_len -= 8; 5558 } 5559 } 5560 5561 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5562 switch(opc) { 5563 case Op_VectorMaskTrueCount: 5564 popcntq(dst, tmp); 5565 break; 5566 case Op_VectorMaskLastTrue: 5567 if (VM_Version::supports_lzcnt()) { 5568 lzcntq(tmp, tmp); 5569 movl(dst, 63); 5570 subl(dst, tmp); 5571 } else { 5572 movl(dst, -1); 5573 bsrq(tmp, tmp); 5574 cmov32(Assembler::notZero, dst, tmp); 5575 } 5576 break; 5577 case Op_VectorMaskFirstTrue: 5578 if (VM_Version::supports_bmi1()) { 5579 if (masklen < 32) { 5580 orl(tmp, 1 << masklen); 5581 tzcntl(dst, tmp); 5582 } else if (masklen == 32) { 5583 tzcntl(dst, tmp); 5584 } else { 5585 assert(masklen == 64, ""); 5586 tzcntq(dst, tmp); 5587 } 5588 } else { 5589 if (masklen < 32) { 5590 orl(tmp, 1 << masklen); 5591 bsfl(dst, tmp); 5592 } else { 5593 assert(masklen == 32 || masklen == 64, ""); 5594 movl(dst, masklen); 5595 if (masklen == 32) { 5596 bsfl(tmp, tmp); 5597 } else { 5598 bsfq(tmp, tmp); 5599 } 5600 cmov32(Assembler::notZero, dst, tmp); 5601 } 5602 } 5603 break; 5604 case Op_VectorMaskToLong: 5605 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5606 break; 5607 default: assert(false, "Unhandled mask operation"); 5608 } 5609 } 5610 5611 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5612 int masklen, int masksize, int vec_enc) { 5613 assert(VM_Version::supports_popcnt(), ""); 5614 5615 if(VM_Version::supports_avx512bw()) { 5616 kmovql(tmp, mask); 5617 } else { 5618 assert(masklen <= 16, ""); 5619 kmovwl(tmp, mask); 5620 } 5621 5622 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5623 // operations needs to be clipped. 5624 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5625 andq(tmp, (1 << masklen) - 1); 5626 } 5627 5628 vector_mask_operation_helper(opc, dst, tmp, masklen); 5629 } 5630 5631 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5632 Register tmp, int masklen, BasicType bt, int vec_enc) { 5633 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5634 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5635 assert(VM_Version::supports_popcnt(), ""); 5636 5637 bool need_clip = false; 5638 switch(bt) { 5639 case T_BOOLEAN: 5640 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5641 vpxor(xtmp, xtmp, xtmp, vec_enc); 5642 vpsubb(xtmp, xtmp, mask, vec_enc); 5643 vpmovmskb(tmp, xtmp, vec_enc); 5644 need_clip = masklen < 16; 5645 break; 5646 case T_BYTE: 5647 vpmovmskb(tmp, mask, vec_enc); 5648 need_clip = masklen < 16; 5649 break; 5650 case T_SHORT: 5651 vpacksswb(xtmp, mask, mask, vec_enc); 5652 if (masklen >= 16) { 5653 vpermpd(xtmp, xtmp, 8, vec_enc); 5654 } 5655 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5656 need_clip = masklen < 16; 5657 break; 5658 case T_INT: 5659 case T_FLOAT: 5660 vmovmskps(tmp, mask, vec_enc); 5661 need_clip = masklen < 4; 5662 break; 5663 case T_LONG: 5664 case T_DOUBLE: 5665 vmovmskpd(tmp, mask, vec_enc); 5666 need_clip = masklen < 2; 5667 break; 5668 default: assert(false, "Unhandled type, %s", type2name(bt)); 5669 } 5670 5671 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5672 // operations needs to be clipped. 
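// (For example, a 4-lane byte mask still yields 16 bits from vpmovmskb, so the
// upper 12 bits are cleared below with (1 << masklen) - 1 before counting.)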
5673 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5674 // need_clip implies masklen < 32 5675 andq(tmp, (1 << masklen) - 1); 5676 } 5677 5678 vector_mask_operation_helper(opc, dst, tmp, masklen); 5679 } 5680 5681 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5682 Register rtmp2, int mask_len) { 5683 kmov(rtmp1, src); 5684 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5685 mov64(rtmp2, -1L); 5686 pextq(rtmp2, rtmp2, rtmp1); 5687 kmov(dst, rtmp2); 5688 } 5689 5690 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5691 XMMRegister mask, Register rtmp, Register rscratch, 5692 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5693 int vec_enc) { 5694 assert(type2aelembytes(bt) >= 4, ""); 5695 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5696 address compress_perm_table = nullptr; 5697 address expand_perm_table = nullptr; 5698 if (type2aelembytes(bt) == 8) { 5699 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5700 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5701 vmovmskpd(rtmp, mask, vec_enc); 5702 } else { 5703 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5704 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5705 vmovmskps(rtmp, mask, vec_enc); 5706 } 5707 shlq(rtmp, 5); // for 32 byte permute row. 5708 if (opcode == Op_CompressV) { 5709 lea(rscratch, ExternalAddress(compress_perm_table)); 5710 } else { 5711 lea(rscratch, ExternalAddress(expand_perm_table)); 5712 } 5713 addptr(rtmp, rscratch); 5714 vmovdqu(permv, Address(rtmp)); 5715 vpermps(dst, permv, src, Assembler::AVX_256bit); 5716 vpxor(xtmp, xtmp, xtmp, vec_enc); 5717 // Blend the result with zero vector using permute mask, each column entry 5718 // in a permute table row contains either a valid permute index or a -1 (default) 5719 // value, this can potentially be used as a blending mask after 5720 // compressing/expanding the source vector lanes. 
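// (The -1 entries have their sign bit set, so the vblendvps below picks the zero
// vector in xtmp for those lanes and keeps the permuted source value elsewhere.)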
5721 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5722 } 5723 5724 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5725 bool merge, BasicType bt, int vec_enc) { 5726 if (opcode == Op_CompressV) { 5727 switch(bt) { 5728 case T_BYTE: 5729 evpcompressb(dst, mask, src, merge, vec_enc); 5730 break; 5731 case T_CHAR: 5732 case T_SHORT: 5733 evpcompressw(dst, mask, src, merge, vec_enc); 5734 break; 5735 case T_INT: 5736 evpcompressd(dst, mask, src, merge, vec_enc); 5737 break; 5738 case T_FLOAT: 5739 evcompressps(dst, mask, src, merge, vec_enc); 5740 break; 5741 case T_LONG: 5742 evpcompressq(dst, mask, src, merge, vec_enc); 5743 break; 5744 case T_DOUBLE: 5745 evcompresspd(dst, mask, src, merge, vec_enc); 5746 break; 5747 default: 5748 fatal("Unsupported type %s", type2name(bt)); 5749 break; 5750 } 5751 } else { 5752 assert(opcode == Op_ExpandV, ""); 5753 switch(bt) { 5754 case T_BYTE: 5755 evpexpandb(dst, mask, src, merge, vec_enc); 5756 break; 5757 case T_CHAR: 5758 case T_SHORT: 5759 evpexpandw(dst, mask, src, merge, vec_enc); 5760 break; 5761 case T_INT: 5762 evpexpandd(dst, mask, src, merge, vec_enc); 5763 break; 5764 case T_FLOAT: 5765 evexpandps(dst, mask, src, merge, vec_enc); 5766 break; 5767 case T_LONG: 5768 evpexpandq(dst, mask, src, merge, vec_enc); 5769 break; 5770 case T_DOUBLE: 5771 evexpandpd(dst, mask, src, merge, vec_enc); 5772 break; 5773 default: 5774 fatal("Unsupported type %s", type2name(bt)); 5775 break; 5776 } 5777 } 5778 } 5779 #endif 5780 5781 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5782 KRegister ktmp1, int vec_enc) { 5783 if (opcode == Op_SignumVD) { 5784 vsubpd(dst, zero, one, vec_enc); 5785 // if src < 0 ? -1 : 1 5786 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5787 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5788 // if src == NaN, -0.0 or 0.0 return src. 5789 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5790 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5791 } else { 5792 assert(opcode == Op_SignumVF, ""); 5793 vsubps(dst, zero, one, vec_enc); 5794 // if src < 0 ? -1 : 1 5795 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5796 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5797 // if src == NaN, -0.0 or 0.0 return src. 5798 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5799 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5800 } 5801 } 5802 5803 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5804 XMMRegister xtmp1, int vec_enc) { 5805 if (opcode == Op_SignumVD) { 5806 vsubpd(dst, zero, one, vec_enc); 5807 // if src < 0 ? -1 : 1 5808 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5809 // if src == NaN, -0.0 or 0.0 return src. 5810 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5811 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5812 } else { 5813 assert(opcode == Op_SignumVF, ""); 5814 vsubps(dst, zero, one, vec_enc); 5815 // if src < 0 ? -1 : 1 5816 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5817 // if src == NaN, -0.0 or 0.0 return src. 
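// (EQ_UQ is an unordered-or-equal compare, so the mask computed below is set for
// +0.0, -0.0 and NaN lanes, which then take src unchanged in the final blend.)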
5818 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5819 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5820 } 5821 } 5822 5823 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5824 if (VM_Version::supports_avx512bw()) { 5825 if (mask_len > 32) { 5826 kmovql(dst, src); 5827 } else { 5828 kmovdl(dst, src); 5829 if (mask_len != 32) { 5830 kshiftrdl(dst, dst, 32 - mask_len); 5831 } 5832 } 5833 } else { 5834 assert(mask_len <= 16, ""); 5835 kmovwl(dst, src); 5836 if (mask_len != 16) { 5837 kshiftrwl(dst, dst, 16 - mask_len); 5838 } 5839 } 5840 } 5841 5842 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5843 int lane_size = type2aelembytes(bt); 5844 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5845 if ((is_LP64 || lane_size < 8) && 5846 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5847 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5848 movptr(rtmp, imm32); 5849 switch(lane_size) { 5850 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5851 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5852 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5853 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5854 default : fatal("Unsupported lane size %d", lane_size); 5855 break; 5856 } 5857 } else { 5858 movptr(rtmp, imm32); 5859 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5860 switch(lane_size) { 5861 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5862 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5863 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5864 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5865 default : fatal("Unsupported lane size %d", lane_size); 5866 break; 5867 } 5868 } 5869 } 5870 5871 // 5872 // Following is a lookup table based popcount computation algorithm:- 5873 // Index Bit set count 5874 // [ 0000 -> 0, 5875 // 0001 -> 1, 5876 // 0010 -> 1, 5877 // 0011 -> 2, 5878 // 0100 -> 1, 5879 // 0101 -> 2, 5880 // 0110 -> 2, 5881 // 0111 -> 3, 5882 // 1000 -> 1, 5883 // 1001 -> 2, 5884 // 1010 -> 2, 5885 // 1011 -> 3, 5886 // 1100 -> 2, 5887 // 1101 -> 3, // 1110 -> 3, 5888 // 1111 -> 4 ] 5889 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as 5890 // shuffle indices for lookup table access. 5891 // b. Right shift each byte of vector lane by 4 positions. 5892 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as 5893 // shuffle indices for lookup table access. 5894 // d. Add the bitset count of upper and lower 4 bits of each byte. 5895 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5896 // count of all the bytes of a quadword. 5897 // f. Perform step e. for upper 128bit vector lane. 5898 // g. Pack the bitset count of quadwords back to double word. 5899 // h. Unpacking and packing operations are not needed for 64bit vector lane.
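//
// A scalar sketch of steps a-d for a single byte, for illustration only (the
// helper and table names are made up; the table values are the ones listed above):
//
//   static const uint8_t kNibblePopcount[16] =
//       { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   static inline int popcount_byte(uint8_t b) {
//     return kNibblePopcount[b & 0x0F] + kNibblePopcount[b >> 4];
//   }
//
// The vector code below performs the same two lookups with vpshufb and the final
// add with vpaddb across all byte lanes at once.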
5900 5901 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5902 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5903 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5904 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5905 vpsrlw(dst, src, 4, vec_enc); 5906 vpand(dst, dst, xtmp1, vec_enc); 5907 vpand(xtmp1, src, xtmp1, vec_enc); 5908 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5909 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5910 vpshufb(dst, xtmp2, dst, vec_enc); 5911 vpaddb(dst, dst, xtmp1, vec_enc); 5912 } 5913 5914 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5915 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5916 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5917 // Following code is as per steps e,f,g and h of above algorithm. 5918 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5919 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5920 vpsadbw(dst, dst, xtmp2, vec_enc); 5921 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5922 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5923 vpackuswb(dst, xtmp1, dst, vec_enc); 5924 } 5925 5926 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5927 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5928 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5929 // Add the popcount of upper and lower bytes of word. 5930 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5931 vpsrlw(dst, xtmp1, 8, vec_enc); 5932 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5933 vpaddw(dst, dst, xtmp1, vec_enc); 5934 } 5935 5936 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5937 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5938 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5939 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5940 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5941 } 5942 5943 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5944 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5945 switch(bt) { 5946 case T_LONG: 5947 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5948 break; 5949 case T_INT: 5950 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5951 break; 5952 case T_CHAR: 5953 case T_SHORT: 5954 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5955 break; 5956 case T_BYTE: 5957 case T_BOOLEAN: 5958 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5959 break; 5960 default: 5961 fatal("Unsupported type %s", type2name(bt)); 5962 break; 5963 } 5964 } 5965 5966 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5967 KRegister mask, bool merge, int vec_enc) { 5968 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5969 switch(bt) { 5970 case T_LONG: 5971 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5972 evpopcntq(dst, mask, src, merge, vec_enc); 5973 break; 5974 case T_INT: 5975 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5976 evpopcntd(dst, mask, src, merge, vec_enc); 5977 break; 5978 case T_CHAR: 5979 case T_SHORT: 5980 assert(VM_Version::supports_avx512_bitalg(), ""); 5981 evpopcntw(dst, mask, src, merge, vec_enc); 5982 break; 5983 case T_BYTE: 5984 case T_BOOLEAN: 5985 assert(VM_Version::supports_avx512_bitalg(), ""); 5986 evpopcntb(dst, mask, 
src, merge, vec_enc); 5987 break; 5988 default: 5989 fatal("Unsupported type %s", type2name(bt)); 5990 break; 5991 } 5992 } 5993 5994 #ifndef _LP64 5995 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5996 assert(VM_Version::supports_avx512bw(), ""); 5997 kmovdl(tmp, src); 5998 kunpckdql(dst, tmp, tmp); 5999 } 6000 #endif 6001 6002 // Bit reversal algorithm first reverses the bits of each byte followed by 6003 // a byte level reversal for multi-byte primitive types (short/int/long). 6004 // Algorithm performs a lookup table access to get reverse bit sequence 6005 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6006 // is obtained by swapping the reverse bit sequences of upper and lower 6007 // nibble of a byte. 6008 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6009 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6010 if (VM_Version::supports_avx512vlbw()) { 6011 6012 // Get the reverse bit sequence of lower nibble of each byte. 6013 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6014 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6015 evpandq(dst, xtmp2, src, vec_enc); 6016 vpshufb(dst, xtmp1, dst, vec_enc); 6017 vpsllq(dst, dst, 4, vec_enc); 6018 6019 // Get the reverse bit sequence of upper nibble of each byte. 6020 vpandn(xtmp2, xtmp2, src, vec_enc); 6021 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6022 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6023 6024 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6025 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6026 evporq(xtmp2, dst, xtmp2, vec_enc); 6027 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6028 6029 } else if(vec_enc == Assembler::AVX_512bit) { 6030 // Shift based bit reversal. 6031 assert(bt == T_LONG || bt == T_INT, ""); 6032 6033 // Swap lower and upper nibble of each byte. 6034 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6035 6036 // Swap two least and most significant bits of each nibble. 6037 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6038 6039 // Swap adjacent pair of bits. 6040 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6041 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6042 6043 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6044 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6045 } else { 6046 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6047 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6048 6049 // Get the reverse bit sequence of lower nibble of each byte. 6050 vpand(dst, xtmp2, src, vec_enc); 6051 vpshufb(dst, xtmp1, dst, vec_enc); 6052 vpsllq(dst, dst, 4, vec_enc); 6053 6054 // Get the reverse bit sequence of upper nibble of each byte. 6055 vpandn(xtmp2, xtmp2, src, vec_enc); 6056 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6057 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6058 6059 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6060 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
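// (Worked example for one byte, 0b11010010: the lower nibble 0010 reverses to 0100
// and is shifted into the upper half, the upper nibble 1101 reverses to 1011, and
// the OR below combines them into 0b01001011, the bit-reversed byte.)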
6061 vpor(xtmp2, dst, xtmp2, vec_enc); 6062 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6063 } 6064 } 6065 6066 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6067 XMMRegister xtmp, Register rscratch) { 6068 assert(VM_Version::supports_gfni(), ""); 6069 assert(rscratch != noreg || always_reachable(mask), "missing"); 6070 6071 // Galois field instruction based bit reversal based on following algorithm. 6072 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6073 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6074 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6075 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6076 } 6077 6078 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6079 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6080 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6081 evpandq(dst, xtmp1, src, vec_enc); 6082 vpsllq(dst, dst, nbits, vec_enc); 6083 vpandn(xtmp1, xtmp1, src, vec_enc); 6084 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6085 evporq(dst, dst, xtmp1, vec_enc); 6086 } 6087 6088 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6089 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6090 // Shift based bit reversal. 6091 assert(VM_Version::supports_evex(), ""); 6092 switch(bt) { 6093 case T_LONG: 6094 // Swap upper and lower double word of each quad word. 6095 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6096 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6097 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6098 break; 6099 case T_INT: 6100 // Swap upper and lower word of each double word. 6101 evprord(xtmp1, k0, src, 16, true, vec_enc); 6102 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6103 break; 6104 case T_CHAR: 6105 case T_SHORT: 6106 // Swap upper and lower byte of each word. 6107 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6108 break; 6109 case T_BYTE: 6110 evmovdquq(dst, k0, src, true, vec_enc); 6111 break; 6112 default: 6113 fatal("Unsupported type %s", type2name(bt)); 6114 break; 6115 } 6116 } 6117 6118 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6119 if (bt == T_BYTE) { 6120 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6121 evmovdquq(dst, k0, src, true, vec_enc); 6122 } else { 6123 vmovdqu(dst, src); 6124 } 6125 return; 6126 } 6127 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6128 // pre-computed shuffle indices. 
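// (For T_INT, for example, the shuffle indices are expected to be 3,2,1,0, 7,6,5,4,
// and so on, so that vpshufb swaps the four bytes inside every 32-bit element; the
// short and long masks do the same at 2-byte and 8-byte granularity.)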
6129 switch(bt) { 6130 case T_LONG: 6131 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6132 break; 6133 case T_INT: 6134 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6135 break; 6136 case T_CHAR: 6137 case T_SHORT: 6138 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6139 break; 6140 default: 6141 fatal("Unsupported type %s", type2name(bt)); 6142 break; 6143 } 6144 vpshufb(dst, src, dst, vec_enc); 6145 } 6146 6147 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6148 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6149 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6150 assert(is_integral_type(bt), ""); 6151 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6152 assert(VM_Version::supports_avx512cd(), ""); 6153 switch(bt) { 6154 case T_LONG: 6155 evplzcntq(dst, ktmp, src, merge, vec_enc); 6156 break; 6157 case T_INT: 6158 evplzcntd(dst, ktmp, src, merge, vec_enc); 6159 break; 6160 case T_SHORT: 6161 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6162 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6163 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6164 vpunpckhwd(dst, xtmp1, src, vec_enc); 6165 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6166 vpackusdw(dst, xtmp2, dst, vec_enc); 6167 break; 6168 case T_BYTE: 6169 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6170 // accessing the lookup table. 6171 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6172 // accessing the lookup table. 6173 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6174 assert(VM_Version::supports_avx512bw(), ""); 6175 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6176 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6177 vpand(xtmp2, dst, src, vec_enc); 6178 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6179 vpsrlw(xtmp3, src, 4, vec_enc); 6180 vpand(xtmp3, dst, xtmp3, vec_enc); 6181 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6182 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6183 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6184 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6185 break; 6186 default: 6187 fatal("Unsupported type %s", type2name(bt)); 6188 break; 6189 } 6190 } 6191 6192 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6193 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6194 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6195 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6196 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6197 // accessing the lookup table. 6198 vpand(dst, xtmp2, src, vec_enc); 6199 vpshufb(dst, xtmp1, dst, vec_enc); 6200 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6201 // accessing the lookup table. 6202 vpsrlw(xtmp3, src, 4, vec_enc); 6203 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6204 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6205 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
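// (Scalar equivalent, where lz4() is the per-nibble count from the lookup table:
//    clz8(b) = (b >> 4) != 0 ? lz4(b >> 4) : 4 + lz4(b & 0x0F)
//  e.g. b = 0x0B has a zero upper nibble, giving 4 + 0 = 4 leading zeros.)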
6206 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6207 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6208 vpaddb(dst, dst, xtmp2, vec_enc); 6209 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6210 } 6211 6212 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6213 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6214 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6215 // Add zero counts of lower byte and upper byte of a word if 6216 // upper byte holds a zero value. 6217 vpsrlw(xtmp3, src, 8, vec_enc); 6218 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6219 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6220 vpsllw(xtmp2, dst, 8, vec_enc); 6221 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6222 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6223 vpsrlw(dst, dst, 8, vec_enc); 6224 } 6225 6226 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6227 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6228 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6229 // hence biased exponent can be used to compute leading zero count as per 6230 // following formula:- 6231 // LZCNT = 32 - (biased_exp - 127) 6232 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6233 6234 // Broadcast 0xFF 6235 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6236 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6237 6238 // Extract biased exponent. 6239 vcvtdq2ps(dst, src, vec_enc); 6240 vpsrld(dst, dst, 23, vec_enc); 6241 vpand(dst, dst, xtmp1, vec_enc); 6242 6243 // Broadcast 127. 6244 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6245 // Exponent = biased_exp - 127 6246 vpsubd(dst, dst, xtmp1, vec_enc); 6247 6248 // Exponent = Exponent + 1 6249 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6250 vpaddd(dst, dst, xtmp3, vec_enc); 6251 6252 // Replace -ve exponent with zero, exponent is -ve when src 6253 // lane contains a zero value. 6254 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6255 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6256 6257 // Rematerialize broadcast 32. 6258 vpslld(xtmp1, xtmp3, 5, vec_enc); 6259 // Exponent is 32 if corresponding source lane contains max_int value. 6260 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6261 // LZCNT = 32 - exponent 6262 vpsubd(dst, xtmp1, dst, vec_enc); 6263 6264 // Replace LZCNT with a value 1 if corresponding source lane 6265 // contains max_int value. 6266 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6267 6268 // Replace biased_exp with 0 if source lane value is less than zero. 6269 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6270 vblendvps(dst, dst, xtmp2, src, vec_enc); 6271 } 6272 6273 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6274 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6275 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6276 // Add zero counts of lower word and upper word of a double word if 6277 // upper word holds a zero value. 6278 vpsrld(xtmp3, src, 16, vec_enc); 6279 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6280 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6281 vpslld(xtmp2, dst, 16, vec_enc); 6282 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6283 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6284 vpsrld(dst, dst, 16, vec_enc); 6285 // Add zero counts of lower doubleword and upper doubleword of a 6286 // quadword if upper doubleword holds a zero value. 
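// (Scalar equivalent of this final combining step:
//    clz64(x) = (x >> 32) != 0 ? clz32(x >> 32) : 32 + clz32((uint32_t)x).)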
6287 vpsrlq(xtmp3, src, 32, vec_enc); 6288 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6289 vpsllq(xtmp2, dst, 32, vec_enc); 6290 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6291 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6292 vpsrlq(dst, dst, 32, vec_enc); 6293 } 6294 6295 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6296 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6297 Register rtmp, int vec_enc) { 6298 assert(is_integral_type(bt), "unexpected type"); 6299 assert(vec_enc < Assembler::AVX_512bit, ""); 6300 switch(bt) { 6301 case T_LONG: 6302 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6303 break; 6304 case T_INT: 6305 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6306 break; 6307 case T_SHORT: 6308 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6309 break; 6310 case T_BYTE: 6311 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6312 break; 6313 default: 6314 fatal("Unsupported type %s", type2name(bt)); 6315 break; 6316 } 6317 } 6318 6319 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6320 switch(bt) { 6321 case T_BYTE: 6322 vpsubb(dst, src1, src2, vec_enc); 6323 break; 6324 case T_SHORT: 6325 vpsubw(dst, src1, src2, vec_enc); 6326 break; 6327 case T_INT: 6328 vpsubd(dst, src1, src2, vec_enc); 6329 break; 6330 case T_LONG: 6331 vpsubq(dst, src1, src2, vec_enc); 6332 break; 6333 default: 6334 fatal("Unsupported type %s", type2name(bt)); 6335 break; 6336 } 6337 } 6338 6339 // Trailing zero count computation is based on leading zero count operation as per 6340 // following equation. All AVX3 targets support AVX512CD feature which offers 6341 // direct vector instruction to compute leading zero count. 
6342 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6343 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6344 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6345 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6346 assert(is_integral_type(bt), "");
6347 // xtmp = -1
6348 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6349 // xtmp = xtmp + src
6350 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6351 // xtmp = xtmp & ~src
6352 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6353 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6354 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6355 vpsub(bt, dst, xtmp4, dst, vec_enc);
6356 }
6357 
6358 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation
6359 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6360 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6361 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6362 assert(is_integral_type(bt), "");
6363 // xtmp = 0
6364 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6365 // xtmp = 0 - src
6366 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6367 // xtmp = xtmp | src
6368 vpor(xtmp3, xtmp3, src, vec_enc);
6369 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6370 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6371 vpsub(bt, dst, xtmp1, dst, vec_enc);
6372 }
6373 
6374 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6375 Label done;
6376 Label neg_divisor_fastpath;
6377 cmpl(divisor, 0);
6378 jccb(Assembler::less, neg_divisor_fastpath);
6379 xorl(rdx, rdx);
6380 divl(divisor);
6381 jmpb(done);
6382 bind(neg_divisor_fastpath);
6383 // Fastpath for divisor < 0:
6384 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6385 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6386 movl(rdx, rax);
6387 subl(rdx, divisor);
6388 if (VM_Version::supports_bmi1()) {
6389 andnl(rax, rdx, rax);
6390 } else {
6391 notl(rdx);
6392 andl(rax, rdx);
6393 }
6394 shrl(rax, 31);
6395 bind(done);
6396 }
6397 
6398 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6399 Label done;
6400 Label neg_divisor_fastpath;
6401 cmpl(divisor, 0);
6402 jccb(Assembler::less, neg_divisor_fastpath);
6403 xorl(rdx, rdx);
6404 divl(divisor);
6405 jmpb(done);
6406 bind(neg_divisor_fastpath);
6407 // Fastpath when divisor < 0:
6408 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6409 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6410 movl(rdx, rax);
6411 subl(rax, divisor);
6412 if (VM_Version::supports_bmi1()) {
6413 andnl(rax, rax, rdx);
6414 } else {
6415 notl(rax);
6416 andl(rax, rdx);
6417 }
6418 sarl(rax, 31);
6419 andl(rax, divisor);
6420 subl(rdx, rax);
6421 bind(done);
6422 }
6423 
6424 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6425 Label done;
6426 Label neg_divisor_fastpath;
6427 
6428 cmpl(divisor, 0);
6429 jccb(Assembler::less, neg_divisor_fastpath);
6430 xorl(rdx, rdx);
6431 divl(divisor);
6432 jmpb(done);
6433 bind(neg_divisor_fastpath);
6434 // Fastpath for divisor < 0:
6435 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6436 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6437 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6438 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6439 movl(rdx, rax); 6440 subl(rax, divisor); 6441 if (VM_Version::supports_bmi1()) { 6442 andnl(rax, rax, rdx); 6443 } else { 6444 notl(rax); 6445 andl(rax, rdx); 6446 } 6447 movl(tmp, rax); 6448 shrl(rax, 31); // quotient 6449 sarl(tmp, 31); 6450 andl(tmp, divisor); 6451 subl(rdx, tmp); // remainder 6452 bind(done); 6453 } 6454 6455 #ifdef _LP64 6456 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6457 XMMRegister xtmp2, Register rtmp) { 6458 if(VM_Version::supports_gfni()) { 6459 // Galois field instruction based bit reversal based on following algorithm. 6460 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6461 mov64(rtmp, 0x8040201008040201L); 6462 movq(xtmp1, src); 6463 movq(xtmp2, rtmp); 6464 gf2p8affineqb(xtmp1, xtmp2, 0); 6465 movq(dst, xtmp1); 6466 } else { 6467 // Swap even and odd numbered bits. 6468 movl(rtmp, src); 6469 andl(rtmp, 0x55555555); 6470 shll(rtmp, 1); 6471 movl(dst, src); 6472 andl(dst, 0xAAAAAAAA); 6473 shrl(dst, 1); 6474 orl(dst, rtmp); 6475 6476 // Swap LSB and MSB 2 bits of each nibble. 6477 movl(rtmp, dst); 6478 andl(rtmp, 0x33333333); 6479 shll(rtmp, 2); 6480 andl(dst, 0xCCCCCCCC); 6481 shrl(dst, 2); 6482 orl(dst, rtmp); 6483 6484 // Swap LSB and MSB 4 bits of each byte. 6485 movl(rtmp, dst); 6486 andl(rtmp, 0x0F0F0F0F); 6487 shll(rtmp, 4); 6488 andl(dst, 0xF0F0F0F0); 6489 shrl(dst, 4); 6490 orl(dst, rtmp); 6491 } 6492 bswapl(dst); 6493 } 6494 6495 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6496 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6497 if(VM_Version::supports_gfni()) { 6498 // Galois field instruction based bit reversal based on following algorithm. 6499 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6500 mov64(rtmp1, 0x8040201008040201L); 6501 movq(xtmp1, src); 6502 movq(xtmp2, rtmp1); 6503 gf2p8affineqb(xtmp1, xtmp2, 0); 6504 movq(dst, xtmp1); 6505 } else { 6506 // Swap even and odd numbered bits. 6507 movq(rtmp1, src); 6508 mov64(rtmp2, 0x5555555555555555L); 6509 andq(rtmp1, rtmp2); 6510 shlq(rtmp1, 1); 6511 movq(dst, src); 6512 notq(rtmp2); 6513 andq(dst, rtmp2); 6514 shrq(dst, 1); 6515 orq(dst, rtmp1); 6516 6517 // Swap LSB and MSB 2 bits of each nibble. 6518 movq(rtmp1, dst); 6519 mov64(rtmp2, 0x3333333333333333L); 6520 andq(rtmp1, rtmp2); 6521 shlq(rtmp1, 2); 6522 notq(rtmp2); 6523 andq(dst, rtmp2); 6524 shrq(dst, 2); 6525 orq(dst, rtmp1); 6526 6527 // Swap LSB and MSB 4 bits of each byte. 
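// Editor's aside (illustrative only, not emitted code): the mask/shift/or sequence in this
// non-GFNI path, together with the final bswapq below, is the classic scalar bit reversal.
// A sketch of the 64-bit equivalent (the 32-bit path above is the same with narrower masks):
//
//   static inline uint64_t reverse_bits64(uint64_t v) {
//     v = ((v & 0x5555555555555555ULL) << 1) | ((v >> 1) & 0x5555555555555555ULL); // swap adjacent bits
//     v = ((v & 0x3333333333333333ULL) << 2) | ((v >> 2) & 0x3333333333333333ULL); // swap 2-bit pairs
//     v = ((v & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL); // swap nibbles
//     return __builtin_bswap64(v);  // reverse byte order (GCC/Clang builtin, stands in for bswapq)
//   }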
6528 movq(rtmp1, dst); 6529 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6530 andq(rtmp1, rtmp2); 6531 shlq(rtmp1, 4); 6532 notq(rtmp2); 6533 andq(dst, rtmp2); 6534 shrq(dst, 4); 6535 orq(dst, rtmp1); 6536 } 6537 bswapq(dst); 6538 } 6539 6540 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6541 Label done; 6542 Label neg_divisor_fastpath; 6543 cmpq(divisor, 0); 6544 jccb(Assembler::less, neg_divisor_fastpath); 6545 xorl(rdx, rdx); 6546 divq(divisor); 6547 jmpb(done); 6548 bind(neg_divisor_fastpath); 6549 // Fastpath for divisor < 0: 6550 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6551 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6552 movq(rdx, rax); 6553 subq(rdx, divisor); 6554 if (VM_Version::supports_bmi1()) { 6555 andnq(rax, rdx, rax); 6556 } else { 6557 notq(rdx); 6558 andq(rax, rdx); 6559 } 6560 shrq(rax, 63); 6561 bind(done); 6562 } 6563 6564 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6565 Label done; 6566 Label neg_divisor_fastpath; 6567 cmpq(divisor, 0); 6568 jccb(Assembler::less, neg_divisor_fastpath); 6569 xorq(rdx, rdx); 6570 divq(divisor); 6571 jmp(done); 6572 bind(neg_divisor_fastpath); 6573 // Fastpath when divisor < 0: 6574 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6575 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6576 movq(rdx, rax); 6577 subq(rax, divisor); 6578 if (VM_Version::supports_bmi1()) { 6579 andnq(rax, rax, rdx); 6580 } else { 6581 notq(rax); 6582 andq(rax, rdx); 6583 } 6584 sarq(rax, 63); 6585 andq(rax, divisor); 6586 subq(rdx, rax); 6587 bind(done); 6588 } 6589 6590 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6591 Label done; 6592 Label neg_divisor_fastpath; 6593 cmpq(divisor, 0); 6594 jccb(Assembler::less, neg_divisor_fastpath); 6595 xorq(rdx, rdx); 6596 divq(divisor); 6597 jmp(done); 6598 bind(neg_divisor_fastpath); 6599 // Fastpath for divisor < 0: 6600 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6601 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6602 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6603 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6604 movq(rdx, rax); 6605 subq(rax, divisor); 6606 if (VM_Version::supports_bmi1()) { 6607 andnq(rax, rax, rdx); 6608 } else { 6609 notq(rax); 6610 andq(rax, rdx); 6611 } 6612 movq(tmp, rax); 6613 shrq(rax, 63); // quotient 6614 sarq(tmp, 63); 6615 andq(tmp, divisor); 6616 subq(rdx, tmp); // remainder 6617 bind(done); 6618 } 6619 #endif 6620 6621 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6622 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6623 int vlen_enc) { 6624 assert(VM_Version::supports_avx512bw(), ""); 6625 // Byte shuffles are inlane operations and indices are determined using 6626 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6627 // normalized to index range 0-15. This makes sure that all the multiples 6628 // of an index value are placed at same relative position in 128 bit 6629 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6630 // will be 16th element in their respective 128 bit lanes. 
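// Editor's aside (illustrative only, not emitted code): a scalar model of what the four
// masked-shuffle passes below compute, assuming the shuffle indices have already been
// normalized to the range 0..63 for a 64-byte vector (hypothetical helper, <cstdint> types):
//
//   void rearrange_bytes_model(uint8_t dst[64], const uint8_t shuffle[64], const uint8_t src[64]) {
//     for (int block = 0; block < 4; block++) {        // one pass per 16-byte source block
//       for (int i = 0; i < 64; i++) {
//         int idx = shuffle[i];                        // assumed 0 <= idx < 64
//         if (idx / 16 == block) {                     // mask: index selects this block
//           dst[i] = src[block * 16 + (idx & 0x0F)];   // in-lane lookup of the broadcast block
//         }
//       }
//     }
//   }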
6631 movl(rtmp, 16);
6632 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6633 
6634 // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6635 // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6636 // the original shuffle indices and move the shuffled lanes corresponding to the true
6637 // mask to the destination vector.
6638 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6639 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6640 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6641 
6642 // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
6643 // broadcasting the second 128 bit lane.
6644 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6645 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6646 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6647 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6648 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6649 
6650 // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
6651 // broadcasting the third 128 bit lane.
6652 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6653 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6654 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6655 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6656 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6657 
6658 // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
6659 // broadcasting the fourth 128 bit lane.
6660 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6661 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6662 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6663 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6664 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6665 }
6666 
6667 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6668 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6669 if (vlen_enc == AVX_128bit) {
6670 vpermilps(dst, src, shuffle, vlen_enc);
6671 } else if (bt == T_INT) {
6672 vpermd(dst, shuffle, src, vlen_enc);
6673 } else {
6674 assert(bt == T_FLOAT, "");
6675 vpermps(dst, shuffle, src, vlen_enc);
6676 }
6677 }
6678 
6679 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6680 switch(elem_bt) {
6681 case T_BYTE:
6682 if (ideal_opc == Op_SaturatingAddV) {
6683 vpaddsb(dst, src1, src2, vlen_enc);
6684 } else {
6685 assert(ideal_opc == Op_SaturatingSubV, "");
6686 vpsubsb(dst, src1, src2, vlen_enc);
6687 }
6688 break;
6689 case T_SHORT:
6690 if (ideal_opc == Op_SaturatingAddV) {
6691 vpaddsw(dst, src1, src2, vlen_enc);
6692 } else {
6693 assert(ideal_opc == Op_SaturatingSubV, "");
6694 vpsubsw(dst, src1, src2, vlen_enc);
6695 }
6696 break;
6697 default:
6698 fatal("Unsupported type %s", type2name(elem_bt));
6699 break;
6700 }
6701 }
6702 
6703 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6704 switch(elem_bt) {
6705 case T_BYTE:
6706 if (ideal_opc == Op_SaturatingAddV) {
6707 vpaddusb(dst, src1, src2, vlen_enc);
6708 } else {
6709 assert(ideal_opc == Op_SaturatingSubV, "");
6710 vpsubusb(dst, src1, src2, vlen_enc);
6711 }
6712 break;
6713 case T_SHORT:
6714 if (ideal_opc == Op_SaturatingAddV) {
6715 vpaddusw(dst, src1, src2, vlen_enc);
6716 } else {
6717 assert(ideal_opc == Op_SaturatingSubV, "");
6718 vpsubusw(dst, src1, src2, vlen_enc);
6719 }
6720 break;
6721 default:
6722 fatal("Unsupported type %s", type2name(elem_bt));
6723 break;
6724 }
6725 }
6726 
6727 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6728 XMMRegister src2, KRegister ktmp, int vlen_enc) {
6729 // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6730 // overflow_mask = Inp1 <u Inp2
6731 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6732 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6733 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6734 }
6735 
6736 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6737 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6738 // Emulate the unsigned comparison using a signed comparison:
6739 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6740 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6741 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6742 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6743 
6744 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6745 
6746 // Res = INP1 - INP2 (non-commutative and non-associative)
6747 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6748 // Res = Mask ? Zero : Res
6749 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6750 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6751 }
6752 
6753 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6754 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6755 // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6756 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6757 // Res = Signed Add INP1, INP2
6758 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6759 // T1 = SRC1 | SRC2
6760 vpor(xtmp1, src1, src2, vlen_enc);
6761 // Max_Unsigned = -1
6762 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6763 // Unsigned compare: Mask = Res <u T1
6764 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6765 // res = Mask ? Max_Unsigned : Res
6766 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6767 }
6768 
6769 //
6770 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6771 // unsigned addition operation:
6772 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6773 //
6774 // We empirically determined its semantic equivalence to the following reduced expression
6775 // overflow_mask = (a + b) <u (a | b)
6776 //
6777 // and also verified it through the Alive2 solver.
6778 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6779 //
6780 
6781 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6782 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6783 // Res = Signed Add INP1, INP2
6784 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6785 // Compute T1 = INP1 | INP2
6786 vpor(xtmp3, src1, src2, vlen_enc);
6787 // Generate the minimum signed value.
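// Editor's aside (illustrative only, not emitted code): a scalar sketch of the two tricks
// used in this AVX path, namely the reduced overflow test (a + b) <u (a | b) discussed above
// and the emulation of an unsigned compare by biasing both operands with MIN_VALUE (which
// just flips the sign bit):
//
//   static inline bool lt_unsigned_via_signed(uint32_t x, uint32_t y) {
//     return (int32_t)(x ^ 0x80000000u) < (int32_t)(y ^ 0x80000000u);      // x <u y
//   }
//   static inline uint32_t sat_add_unsigned32(uint32_t a, uint32_t b) {
//     uint32_t sum = a + b;                                                // wrapping add
//     return lt_unsigned_via_signed(sum, a | b) ? 0xFFFFFFFFu : sum;       // overflow => saturate to max
//   }
//
// The next instructions materialize MIN_VALUE and apply the same bias lane-wise.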
6788 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6789 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6790 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6791 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6792 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6793 // Compute overflow detection mask = Res<1> <s T1 6794 if (elem_bt == T_INT) { 6795 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6796 } else { 6797 assert(elem_bt == T_LONG, ""); 6798 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6799 } 6800 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6801 } 6802 6803 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6804 int vlen_enc, bool xtmp2_hold_M1) { 6805 if (VM_Version::supports_avx512dq()) { 6806 evpmovq2m(ktmp, src, vlen_enc); 6807 } else { 6808 assert(VM_Version::supports_evex(), ""); 6809 if (!xtmp2_hold_M1) { 6810 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6811 } 6812 evpsraq(xtmp1, src, 63, vlen_enc); 6813 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6814 } 6815 } 6816 6817 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6818 int vlen_enc, bool xtmp2_hold_M1) { 6819 if (VM_Version::supports_avx512dq()) { 6820 evpmovd2m(ktmp, src, vlen_enc); 6821 } else { 6822 assert(VM_Version::supports_evex(), ""); 6823 if (!xtmp2_hold_M1) { 6824 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6825 } 6826 vpsrad(xtmp1, src, 31, vlen_enc); 6827 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6828 } 6829 } 6830 6831 6832 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6833 if (elem_bt == T_LONG) { 6834 if (VM_Version::supports_evex()) { 6835 evpsraq(dst, src, 63, vlen_enc); 6836 } else { 6837 vpsrad(dst, src, 31, vlen_enc); 6838 vpshufd(dst, dst, 0xF5, vlen_enc); 6839 } 6840 } else { 6841 assert(elem_bt == T_INT, ""); 6842 vpsrad(dst, src, 31, vlen_enc); 6843 } 6844 } 6845 6846 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6847 if (compute_allones) { 6848 if (vlen_enc == Assembler::AVX_512bit) { 6849 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6850 } else { 6851 vpcmpeqq(allones, allones, allones, vlen_enc); 6852 } 6853 } 6854 if (elem_bt == T_LONG) { 6855 vpsrlq(dst, allones, 1, vlen_enc); 6856 } else { 6857 assert(elem_bt == T_INT, ""); 6858 vpsrld(dst, allones, 1, vlen_enc); 6859 } 6860 } 6861 6862 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6863 if (compute_allones) { 6864 if (vlen_enc == Assembler::AVX_512bit) { 6865 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6866 } else { 6867 vpcmpeqq(allones, allones, allones, vlen_enc); 6868 } 6869 } 6870 if (elem_bt == T_LONG) { 6871 vpsllq(dst, allones, 63, vlen_enc); 6872 } else { 6873 assert(elem_bt == T_INT, ""); 6874 vpslld(dst, allones, 31, vlen_enc); 6875 } 6876 } 6877 6878 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6879 Assembler::ComparisonPredicate cond, int vlen_enc) { 6880 switch(elem_bt) { 6881 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6882 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6883 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6884 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6885 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 6886 } 6887 } 6888 6889 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6890 switch(elem_bt) { 6891 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break; 6892 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break; 6893 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break; 6894 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break; 6895 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6896 } 6897 } 6898 6899 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1, 6900 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) { 6901 if (elem_bt == T_LONG) { 6902 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6903 } else { 6904 assert(elem_bt == T_INT, ""); 6905 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6906 } 6907 } 6908 6909 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6910 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6911 KRegister ktmp1, KRegister ktmp2, int vlen_enc) { 6912 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6913 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness. 6914 // Overflow detection based on Hacker's delight section 2-13. 6915 if (ideal_opc == Op_SaturatingAddV) { 6916 // res = src1 + src2 6917 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6918 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6919 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6920 vpxor(xtmp1, dst, src1, vlen_enc); 6921 vpxor(xtmp2, dst, src2, vlen_enc); 6922 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6923 } else { 6924 assert(ideal_opc == Op_SaturatingSubV, ""); 6925 // res = src1 - src2 6926 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6927 // Overflow occurs when both inputs have opposite polarity and 6928 // result polarity does not comply with first input polarity. 6929 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6930 vpxor(xtmp1, src1, src2, vlen_enc); 6931 vpxor(xtmp2, dst, src1, vlen_enc); 6932 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6933 } 6934 6935 // Compute overflow detection mask. 6936 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc); 6937 // Note: xtmp1 hold -1 in all its lanes after above call. 6938 6939 // Compute mask based on first input polarity. 6940 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true); 6941 6942 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6943 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6944 6945 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to 6946 // set bits in first input polarity mask holds a min value. 6947 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc); 6948 // Blend destination lanes with saturated values using overflow detection mask. 
6949 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6950 }
6951 
6952 
6953 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6954 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6955 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6956 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6957 // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6958 // Overflow detection is based on Hacker's Delight section 2-13.
6959 if (ideal_opc == Op_SaturatingAddV) {
6960 // res = src1 + src2
6961 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6962 // Overflow occurs if the result polarity differs from that of the equal-polarity inputs.
6963 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6964 vpxor(xtmp1, dst, src1, vlen_enc);
6965 vpxor(xtmp2, dst, src2, vlen_enc);
6966 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6967 } else {
6968 assert(ideal_opc == Op_SaturatingSubV, "");
6969 // res = src1 - src2
6970 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6971 // Overflow occurs when the inputs have opposite polarity and the
6972 // result polarity differs from the first input's polarity.
6973 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6974 vpxor(xtmp1, src1, src2, vlen_enc);
6975 vpxor(xtmp2, dst, src1, vlen_enc);
6976 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6977 }
6978 
6979 // Sign-extend to compute the overflow detection mask.
6980 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6981 
6982 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6983 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6984 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6985 
6986 // Compose the saturating min/max vector using the first input polarity mask.
6987 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6988 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6989 
6990 // Blend the result with the saturating vector using the overflow detection mask.
6991 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6992 } 6993 6994 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6995 switch(elem_bt) { 6996 case T_BYTE: 6997 if (ideal_opc == Op_SaturatingAddV) { 6998 vpaddsb(dst, src1, src2, vlen_enc); 6999 } else { 7000 assert(ideal_opc == Op_SaturatingSubV, ""); 7001 vpsubsb(dst, src1, src2, vlen_enc); 7002 } 7003 break; 7004 case T_SHORT: 7005 if (ideal_opc == Op_SaturatingAddV) { 7006 vpaddsw(dst, src1, src2, vlen_enc); 7007 } else { 7008 assert(ideal_opc == Op_SaturatingSubV, ""); 7009 vpsubsw(dst, src1, src2, vlen_enc); 7010 } 7011 break; 7012 default: 7013 fatal("Unsupported type %s", type2name(elem_bt)); 7014 break; 7015 } 7016 } 7017 7018 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7019 switch(elem_bt) { 7020 case T_BYTE: 7021 if (ideal_opc == Op_SaturatingAddV) { 7022 vpaddusb(dst, src1, src2, vlen_enc); 7023 } else { 7024 assert(ideal_opc == Op_SaturatingSubV, ""); 7025 vpsubusb(dst, src1, src2, vlen_enc); 7026 } 7027 break; 7028 case T_SHORT: 7029 if (ideal_opc == Op_SaturatingAddV) { 7030 vpaddusw(dst, src1, src2, vlen_enc); 7031 } else { 7032 assert(ideal_opc == Op_SaturatingSubV, ""); 7033 vpsubusw(dst, src1, src2, vlen_enc); 7034 } 7035 break; 7036 default: 7037 fatal("Unsupported type %s", type2name(elem_bt)); 7038 break; 7039 } 7040 } 7041 7042 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7043 XMMRegister src2, int vlen_enc) { 7044 switch(elem_bt) { 7045 case T_BYTE: 7046 evpermi2b(dst, src1, src2, vlen_enc); 7047 break; 7048 case T_SHORT: 7049 evpermi2w(dst, src1, src2, vlen_enc); 7050 break; 7051 case T_INT: 7052 evpermi2d(dst, src1, src2, vlen_enc); 7053 break; 7054 case T_LONG: 7055 evpermi2q(dst, src1, src2, vlen_enc); 7056 break; 7057 case T_FLOAT: 7058 evpermi2ps(dst, src1, src2, vlen_enc); 7059 break; 7060 case T_DOUBLE: 7061 evpermi2pd(dst, src1, src2, vlen_enc); 7062 break; 7063 default: 7064 fatal("Unsupported type %s", type2name(elem_bt)); 7065 break; 7066 } 7067 } 7068 7069 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7070 if (is_unsigned) { 7071 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7072 } else { 7073 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7074 } 7075 } 7076 7077 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7078 if (is_unsigned) { 7079 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7080 } else { 7081 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7082 } 7083 }
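// Editor's aside (illustrative only, not emitted code): the unsigned division helpers above
// (udivI/umodI/udivmodI and, on 64-bit, udivL/umodL/udivmodL) take a Hacker's Delight 9-3
// fast path when the divisor has its sign bit set; the quotient can then only be 0 or 1.
// A scalar sketch of that fast path for the 64-bit case:
//
//   static inline uint64_t udiv_q_neg_divisor(uint64_t dividend, uint64_t divisor) {
//     // 1 iff dividend >= divisor (valid only when divisor >= 2^63)
//     return (dividend & ~(dividend - divisor)) >> 63;
//   }
//   static inline uint64_t udiv_r_neg_divisor(uint64_t dividend, uint64_t divisor) {
//     // mask is all-ones iff dividend >= divisor; mirrors the sarq/andq/subq sequence
//     uint64_t mask = (uint64_t)((int64_t)(dividend & ~(dividend - divisor)) >> 63);
//     return dividend - (mask & divisor);
//   }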