1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 53 54 // WARNING: Initial instruction MUST be 5 bytes or longer so that 55 // NativeJump::patch_verified_entry will be able to patch out the entry 56 // code safely. The push to verify stack depth is ok at 5 bytes, 57 // the frame allocation can be either 3 or 6 bytes. So if we don't do 58 // stack bang then we must use the 6 byte frame allocation even if 59 // we have no frame. :-( 60 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 61 62 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 63 // Remove word for return addr 64 framesize -= wordSize; 65 stack_bang_size -= wordSize; 66 67 // Calls to C2R adapters often do not accept exceptional returns. 68 // We require that their callers must bang for them. But be careful, because 69 // some VM calls (such as call site linkage) can use several kilobytes of 70 // stack. But the stack safety zone should account for that. 71 // See bugs 4446381, 4468289, 4497237. 72 if (stack_bang_size > 0) { 73 generate_stack_overflow_check(stack_bang_size); 74 75 // We always push rbp, so that on return to interpreter rbp, will be 76 // restored correctly and we can correct the stack. 77 push(rbp); 78 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
79 if (PreserveFramePointer) { 80 mov(rbp, rsp); 81 } 82 // Remove word for ebp 83 framesize -= wordSize; 84 85 // Create frame 86 if (framesize) { 87 subptr(rsp, framesize); 88 } 89 } else { 90 // Create frame (force generation of a 4 byte immediate value) 91 subptr_imm32(rsp, framesize); 92 93 // Save RBP register now. 94 framesize -= wordSize; 95 movptr(Address(rsp, framesize), rbp); 96 // Save caller's stack pointer into RBP if the frame pointer is preserved. 97 if (PreserveFramePointer) { 98 movptr(rbp, rsp); 99 if (framesize > 0) { 100 addptr(rbp, framesize); 101 } 102 } 103 } 104 105 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 106 framesize -= wordSize; 107 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 108 } 109 110 #ifndef _LP64 111 // If method sets FPU control word do it now 112 if (fp_mode_24b) { 113 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 114 } 115 if (UseSSE >= 2 && VerifyFPU) { 116 verify_FPU(0, "FPU stack must be clean on entry"); 117 } 118 #endif 119 120 #ifdef ASSERT 121 if (VerifyStackAtCalls) { 122 Label L; 123 push(rax); 124 mov(rax, rsp); 125 andptr(rax, StackAlignmentInBytes-1); 126 cmpptr(rax, StackAlignmentInBytes-wordSize); 127 pop(rax); 128 jcc(Assembler::equal, L); 129 STOP("Stack is not properly aligned!"); 130 bind(L); 131 } 132 #endif 133 134 if (!is_stub) { 135 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 136 #ifdef _LP64 137 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 138 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 139 Label dummy_slow_path; 140 Label dummy_continuation; 141 Label* slow_path = &dummy_slow_path; 142 Label* continuation = &dummy_continuation; 143 if (!Compile::current()->output()->in_scratch_emit_size()) { 144 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 145 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 146 Compile::current()->output()->add_stub(stub); 147 slow_path = &stub->entry(); 148 continuation = &stub->continuation(); 149 } 150 bs->nmethod_entry_barrier(this, slow_path, continuation); 151 } 152 #else 153 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 154 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 155 #endif 156 } 157 } 158 159 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 160 switch (vlen_in_bytes) { 161 case 4: // fall-through 162 case 8: // fall-through 163 case 16: return Assembler::AVX_128bit; 164 case 32: return Assembler::AVX_256bit; 165 case 64: return Assembler::AVX_512bit; 166 167 default: { 168 ShouldNotReachHere(); 169 return Assembler::AVX_NoVec; 170 } 171 } 172 } 173 174 // fast_lock and fast_unlock used by C2 175 176 // Because the transitions from emitted code to the runtime 177 // monitorenter/exit helper stubs are so slow it's critical that 178 // we inline both the stack-locking fast path and the inflated fast path. 179 // 180 // See also: cmpFastLock and cmpFastUnlock. 181 // 182 // What follows is a specialized inline transliteration of the code 183 // in enter() and exit(). If we're concerned about I$ bloat another 184 // option would be to emit TrySlowEnter and TrySlowExit methods 185 // at startup-time. These methods would accept arguments as 186 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 187 // indications in the icc.ZFlag. 
fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP, since
// the helper routines would be called from multiple synchronization sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore, the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternatively, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Updates tmpReg
    jcc(Assembler::equal, COUNT);  // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
#else
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10, which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                  // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                       // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);   // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
#ifdef _LP64
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
#endif
  }
  xorl(tmpReg, tmpReg);                 // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX (IllegalMonitorStateException).
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
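// Illustrative sketch (documentation only; this plain C++ is not emitted or
// compiled, and the helper names below are hypothetical): the ZFlag protocol
// that fast_lock and fast_unlock share corresponds to code of the form
//
//   bool locked = try_fast_lock(obj, box);       // "ZF == 1"  <=>  true
//   if (!locked) {
//     slow_path_monitor_enter(obj, box);         // "ZF == 0" -> runtime slow path
//   }
//   ...
//   bool unlocked = try_fast_unlock(obj, box);   // same protocol on exit
//   if (!unlocked) {
//     slow_path_monitor_exit(obj, box);
//   }
//
// C2's cmpFastLock and cmpFastUnlock nodes test ZF and branch to the slow-path
// call much like the if-statements above.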
389 390 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 391 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 392 assert(boxReg == rax, ""); 393 assert_different_registers(objReg, boxReg, tmpReg); 394 395 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 396 397 if (LockingMode == LM_LEGACY) { 398 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 399 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 400 } 401 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 402 if (LockingMode != LM_MONITOR) { 403 testptr(tmpReg, markWord::monitor_value); // Inflated? 404 jcc(Assembler::zero, Stacked); 405 } 406 407 // It's inflated. 408 409 #ifndef _LP64 410 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 411 orl(boxReg, 1); // set ICC.ZF=0 to indicate failure 412 jmpb(DONE_LABEL); 413 #else 414 // Despite our balanced locking property we still check that m->_owner == Self 415 // as java routines or native JNI code called by this thread might 416 // have released the lock. 417 // Refer to the comments in synchronizer.cpp for how we might encode extra 418 // state in _succ so we can avoid fetching EntryList|cxq. 419 // 420 // If there's no contention try a 1-0 exit. That is, exit without 421 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 422 // we detect and recover from the race that the 1-0 exit admits. 423 // 424 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 425 // before it STs null into _owner, releasing the lock. Updates 426 // to data protected by the critical section must be visible before 427 // we drop the lock (and thus before any other thread could acquire 428 // the lock and observe the fields protected by the lock). 429 // IA32's memory-model is SPO, so STs are ordered with respect to 430 // each other and there's no need for an explicit barrier (fence). 431 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 432 Label LSuccess, LNotRecursive; 433 434 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 435 jccb(Assembler::equal, LNotRecursive); 436 437 // Recursive inflated unlock 438 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 439 jmpb(LSuccess); 440 441 bind(LNotRecursive); 442 443 // Set owner to null. 444 // Release to satisfy the JMM 445 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 446 // We need a full fence after clearing owner to avoid stranding. 447 // StoreLoad achieves this. 448 membar(StoreLoad); 449 450 // Check if the entry lists are empty (EntryList first - by convention). 451 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 452 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 453 jccb(Assembler::zero, LSuccess); // If so we are done. 454 455 // Check if there is a successor. 456 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 457 jccb(Assembler::notZero, LSuccess); // If so we are done. 458 459 // Save the monitor pointer in the current thread, so we can try to 460 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 
461 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 462 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 463 464 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 465 jmpb (DONE_LABEL); 466 467 bind (LSuccess); 468 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 469 jmpb (DONE_LABEL); 470 #endif // _LP64 471 472 if (LockingMode == LM_LEGACY) { 473 bind (Stacked); 474 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 475 lock(); 476 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 477 // Intentional fall-thru into DONE_LABEL 478 } 479 480 bind(DONE_LABEL); 481 482 // ZFlag == 1 count in fast path 483 // ZFlag == 0 count in slow path 484 jccb(Assembler::notZero, NO_COUNT); 485 486 bind(COUNT); 487 488 if (LockingMode == LM_LEGACY) { 489 // Count monitors in fast path 490 #ifdef _LP64 491 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 492 #endif 493 } 494 495 xorl(tmpReg, tmpReg); // Set ZF == 1 496 497 bind(NO_COUNT); 498 } 499 500 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 501 Register t, Register thread) { 502 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 503 assert(rax_reg == rax, "Used for CAS"); 504 assert_different_registers(obj, box, rax_reg, t, thread); 505 506 // Handle inflated monitor. 507 Label inflated; 508 // Finish fast lock successfully. ZF value is irrelevant. 509 Label locked; 510 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 511 Label slow_path; 512 513 if (UseObjectMonitorTable) { 514 // Clear cache in case fast locking succeeds. 515 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 516 } 517 518 if (DiagnoseSyncOnValueBasedClasses != 0) { 519 load_klass(rax_reg, obj, t); 520 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 521 jcc(Assembler::notZero, slow_path); 522 } 523 524 const Register mark = t; 525 526 { // Lightweight Lock 527 528 Label push; 529 530 const Register top = UseObjectMonitorTable ? rax_reg : box; 531 532 // Load the mark. 533 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 534 535 // Prefetch top. 536 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 537 538 // Check for monitor (0b10). 539 testptr(mark, markWord::monitor_value); 540 jcc(Assembler::notZero, inflated); 541 542 // Check if lock-stack is full. 543 cmpl(top, LockStack::end_offset() - 1); 544 jcc(Assembler::greater, slow_path); 545 546 // Check if recursive. 547 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 548 jccb(Assembler::equal, push); 549 550 // Try to lock. Transition lock bits 0b01 => 0b00 551 movptr(rax_reg, mark); 552 orptr(rax_reg, markWord::unlocked_value); 553 andptr(mark, ~(int32_t)markWord::unlocked_value); 554 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 555 jcc(Assembler::notEqual, slow_path); 556 557 if (UseObjectMonitorTable) { 558 // Need to reload top, clobbered by CAS. 559 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 560 } 561 bind(push); 562 // After successful lock, push object on lock-stack. 563 movptr(Address(thread, top), obj); 564 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 565 jmpb(locked); 566 } 567 568 { // Handle inflated monitor. 569 bind(inflated); 570 571 #ifndef _LP64 572 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 
573 orl(box, 1); // set ICC.ZF=0 to indicate failure 574 jmpb(slow_path); 575 #else 576 const Register monitor = t; 577 578 if (!UseObjectMonitorTable) { 579 assert(mark == monitor, "should be the same here"); 580 } else { 581 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 582 // Fetch ObjectMonitor* from the cache or take the slow-path. 583 Label monitor_found; 584 585 // Load cache address 586 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 587 588 const int num_unrolled = 2; 589 for (int i = 0; i < num_unrolled; i++) { 590 cmpptr(obj, Address(t)); 591 jccb(Assembler::equal, monitor_found); 592 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 593 } 594 595 Label loop; 596 597 // Search for obj in cache. 598 bind(loop); 599 600 // Check for match. 601 cmpptr(obj, Address(t)); 602 jccb(Assembler::equal, monitor_found); 603 604 // Search until null encountered, guaranteed _null_sentinel at end. 605 cmpptr(Address(t), 1); 606 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 607 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 608 jmpb(loop); 609 610 // Cache hit. 611 bind(monitor_found); 612 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 613 } 614 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 615 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 616 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 617 618 Label monitor_locked; 619 // Lock the monitor. 620 621 if (UseObjectMonitorTable) { 622 // Cache the monitor for unlock before trashing box. On failure to acquire 623 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 624 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 625 } 626 627 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 628 xorptr(rax_reg, rax_reg); 629 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 630 lock(); cmpxchgptr(box, owner_address); 631 jccb(Assembler::equal, monitor_locked); 632 633 // Check if recursive. 634 cmpptr(box, rax_reg); 635 jccb(Assembler::notEqual, slow_path); 636 637 // Recursive. 638 increment(recursions_address); 639 640 bind(monitor_locked); 641 #endif // _LP64 642 } 643 644 bind(locked); 645 // Set ZF = 1 646 xorl(rax_reg, rax_reg); 647 648 #ifdef ASSERT 649 // Check that locked label is reached with ZF set. 650 Label zf_correct; 651 Label zf_bad_zero; 652 jcc(Assembler::zero, zf_correct); 653 jmp(zf_bad_zero); 654 #endif 655 656 bind(slow_path); 657 #ifdef ASSERT 658 // Check that slow_path label is reached with ZF not set. 659 jcc(Assembler::notZero, zf_correct); 660 stop("Fast Lock ZF != 0"); 661 bind(zf_bad_zero); 662 stop("Fast Lock ZF != 1"); 663 bind(zf_correct); 664 #endif 665 // C2 uses the value of ZF to determine the continuation. 666 } 667 668 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 669 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 670 assert(reg_rax == rax, "Used for CAS"); 671 assert_different_registers(obj, reg_rax, t); 672 673 // Handle inflated monitor. 674 Label inflated, inflated_check_lock_stack; 675 // Finish fast unlock successfully. MUST jump with ZF == 1 676 Label unlocked, slow_path; 677 678 const Register mark = t; 679 const Register monitor = t; 680 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 681 const Register box = reg_rax; 682 683 Label dummy; 684 C2FastUnlockLightweightStub* stub = nullptr; 685 686 if (!Compile::current()->output()->in_scratch_emit_size()) { 687 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 688 Compile::current()->output()->add_stub(stub); 689 } 690 691 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 692 693 { // Lightweight Unlock 694 695 // Load top. 696 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 697 698 if (!UseObjectMonitorTable) { 699 // Prefetch mark. 700 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 701 } 702 703 // Check if obj is top of lock-stack. 704 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 705 // Top of lock stack was not obj. Must be monitor. 706 jcc(Assembler::notEqual, inflated_check_lock_stack); 707 708 // Pop lock-stack. 709 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 710 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 711 712 // Check if recursive. 713 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 714 jcc(Assembler::equal, unlocked); 715 716 // We elide the monitor check, let the CAS fail instead. 717 718 if (UseObjectMonitorTable) { 719 // Load mark. 720 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 721 } 722 723 // Try to unlock. Transition lock bits 0b00 => 0b01 724 movptr(reg_rax, mark); 725 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 726 orptr(mark, markWord::unlocked_value); 727 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 728 jcc(Assembler::notEqual, push_and_slow_path); 729 jmp(unlocked); 730 } 731 732 733 { // Handle inflated monitor. 734 bind(inflated_check_lock_stack); 735 #ifdef ASSERT 736 Label check_done; 737 subl(top, oopSize); 738 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 739 jcc(Assembler::below, check_done); 740 cmpptr(obj, Address(thread, top)); 741 jccb(Assembler::notEqual, inflated_check_lock_stack); 742 stop("Fast Unlock lock on stack"); 743 bind(check_done); 744 if (UseObjectMonitorTable) { 745 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 746 } 747 testptr(mark, markWord::monitor_value); 748 jccb(Assembler::notZero, inflated); 749 stop("Fast Unlock not monitor"); 750 #endif 751 752 bind(inflated); 753 754 #ifndef _LP64 755 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 756 orl(t, 1); // set ICC.ZF=0 to indicate failure 757 jmpb(slow_path); 758 #else 759 if (!UseObjectMonitorTable) { 760 assert(mark == monitor, "should be the same here"); 761 } else { 762 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 763 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 764 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 765 cmpptr(monitor, alignof(ObjectMonitor*)); 766 jcc(Assembler::below, slow_path); 767 } 768 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 
0 : checked_cast<int>(markWord::monitor_value)); 769 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 770 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 771 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 772 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 773 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 774 775 Label recursive; 776 777 // Check if recursive. 778 cmpptr(recursions_address, 0); 779 jccb(Assembler::notZero, recursive); 780 781 // Set owner to null. 782 // Release to satisfy the JMM 783 movptr(owner_address, NULL_WORD); 784 // We need a full fence after clearing owner to avoid stranding. 785 // StoreLoad achieves this. 786 membar(StoreLoad); 787 788 // Check if the entry lists are empty (EntryList first - by convention). 789 movptr(reg_rax, EntryList_address); 790 orptr(reg_rax, cxq_address); 791 jccb(Assembler::zero, unlocked); // If so we are done. 792 793 // Check if there is a successor. 794 cmpptr(succ_address, NULL_WORD); 795 jccb(Assembler::notZero, unlocked); // If so we are done. 796 797 // Save the monitor pointer in the current thread, so we can try to 798 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 799 if (!UseObjectMonitorTable) { 800 andptr(monitor, ~(int32_t)markWord::monitor_value); 801 } 802 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 803 804 orl(t, 1); // Fast Unlock ZF = 0 805 jmpb(slow_path); 806 807 // Recursive unlock. 808 bind(recursive); 809 decrement(recursions_address); 810 #endif // _LP64 811 } 812 813 bind(unlocked); 814 xorl(t, t); // Fast Unlock ZF = 1 815 816 #ifdef ASSERT 817 // Check that unlocked label is reached with ZF set. 818 Label zf_correct; 819 Label zf_bad_zero; 820 jcc(Assembler::zero, zf_correct); 821 jmp(zf_bad_zero); 822 #endif 823 824 bind(slow_path); 825 if (stub != nullptr) { 826 bind(stub->slow_path_continuation()); 827 } 828 #ifdef ASSERT 829 // Check that stub->continuation() label is reached with ZF not set. 830 jcc(Assembler::notZero, zf_correct); 831 stop("Fast Unlock ZF != 0"); 832 bind(zf_bad_zero); 833 stop("Fast Unlock ZF != 1"); 834 bind(zf_correct); 835 #endif 836 // C2 uses the value of ZF to determine the continuation. 
837 } 838 839 //------------------------------------------------------------------------------------------- 840 // Generic instructions support for use in .ad files C2 code generation 841 842 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 843 if (dst != src) { 844 movdqu(dst, src); 845 } 846 if (opcode == Op_AbsVD) { 847 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 848 } else { 849 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 850 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 851 } 852 } 853 854 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 855 if (opcode == Op_AbsVD) { 856 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 857 } else { 858 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 859 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 860 } 861 } 862 863 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 864 if (dst != src) { 865 movdqu(dst, src); 866 } 867 if (opcode == Op_AbsVF) { 868 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 869 } else { 870 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 871 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 872 } 873 } 874 875 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 876 if (opcode == Op_AbsVF) { 877 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 878 } else { 879 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 880 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 881 } 882 } 883 884 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 885 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 886 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 887 888 if (opcode == Op_MinV) { 889 if (elem_bt == T_BYTE) { 890 pminsb(dst, src); 891 } else if (elem_bt == T_SHORT) { 892 pminsw(dst, src); 893 } else if (elem_bt == T_INT) { 894 pminsd(dst, src); 895 } else { 896 assert(elem_bt == T_LONG, "required"); 897 assert(tmp == xmm0, "required"); 898 assert_different_registers(dst, src, tmp); 899 movdqu(xmm0, dst); 900 pcmpgtq(xmm0, src); 901 blendvpd(dst, src); // xmm0 as mask 902 } 903 } else { // opcode == Op_MaxV 904 if (elem_bt == T_BYTE) { 905 pmaxsb(dst, src); 906 } else if (elem_bt == T_SHORT) { 907 pmaxsw(dst, src); 908 } else if (elem_bt == T_INT) { 909 pmaxsd(dst, src); 910 } else { 911 assert(elem_bt == T_LONG, "required"); 912 assert(tmp == xmm0, "required"); 913 assert_different_registers(dst, src, tmp); 914 movdqu(xmm0, src); 915 pcmpgtq(xmm0, dst); 916 blendvpd(dst, src); // xmm0 as mask 917 } 918 } 919 } 920 921 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 922 XMMRegister src1, Address src2, int vlen_enc) { 923 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 924 if (opcode == Op_UMinV) { 925 switch(elem_bt) { 926 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 927 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 928 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 929 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 930 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 931 } 932 } else { 933 assert(opcode == Op_UMaxV, "required"); 934 switch(elem_bt) { 935 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 936 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 937 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 938 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 939 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 940 } 941 } 942 } 943 944 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 945 // For optimality, leverage a full vector width of 512 bits 946 // for operations over smaller vector sizes on AVX512 targets. 947 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 948 if (opcode == Op_UMaxV) { 949 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 950 } else { 951 assert(opcode == Op_UMinV, "required"); 952 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 953 } 954 } else { 955 // T1 = -1 956 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 957 // T1 = -1 << 63 958 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 959 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 960 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 961 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 962 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 963 // Mask = T2 > T1 964 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 965 if (opcode == Op_UMaxV) { 966 // Res = Mask ? Src2 : Src1 967 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 968 } else { 969 // Res = Mask ? Src1 : Src2 970 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 971 } 972 } 973 } 974 975 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 976 XMMRegister src1, XMMRegister src2, int vlen_enc) { 977 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 978 if (opcode == Op_UMinV) { 979 switch(elem_bt) { 980 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 981 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 982 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 983 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 984 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 985 } 986 } else { 987 assert(opcode == Op_UMaxV, "required"); 988 switch(elem_bt) { 989 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 990 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 991 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 992 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 993 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 994 } 995 } 996 } 997 998 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 999 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1000 int vlen_enc) { 1001 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1002 1003 if (opcode == Op_MinV) { 1004 if (elem_bt == T_BYTE) { 1005 vpminsb(dst, src1, src2, vlen_enc); 1006 } else if (elem_bt == T_SHORT) { 1007 vpminsw(dst, src1, src2, vlen_enc); 1008 } else if (elem_bt == T_INT) { 1009 vpminsd(dst, src1, src2, vlen_enc); 1010 } else { 1011 assert(elem_bt == T_LONG, "required"); 1012 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1013 vpminsq(dst, src1, src2, vlen_enc); 1014 } else { 1015 assert_different_registers(dst, src1, src2); 1016 vpcmpgtq(dst, src1, src2, vlen_enc); 1017 vblendvpd(dst, src1, src2, dst, vlen_enc); 1018 } 1019 } 1020 } else { 
// opcode == Op_MaxV 1021 if (elem_bt == T_BYTE) { 1022 vpmaxsb(dst, src1, src2, vlen_enc); 1023 } else if (elem_bt == T_SHORT) { 1024 vpmaxsw(dst, src1, src2, vlen_enc); 1025 } else if (elem_bt == T_INT) { 1026 vpmaxsd(dst, src1, src2, vlen_enc); 1027 } else { 1028 assert(elem_bt == T_LONG, "required"); 1029 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1030 vpmaxsq(dst, src1, src2, vlen_enc); 1031 } else { 1032 assert_different_registers(dst, src1, src2); 1033 vpcmpgtq(dst, src1, src2, vlen_enc); 1034 vblendvpd(dst, src2, src1, dst, vlen_enc); 1035 } 1036 } 1037 } 1038 } 1039 1040 // Float/Double min max 1041 1042 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1043 XMMRegister dst, XMMRegister a, XMMRegister b, 1044 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1045 int vlen_enc) { 1046 assert(UseAVX > 0, "required"); 1047 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1048 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1049 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1050 assert_different_registers(a, tmp, atmp, btmp); 1051 assert_different_registers(b, tmp, atmp, btmp); 1052 1053 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1054 bool is_double_word = is_double_word_type(elem_bt); 1055 1056 /* Note on 'non-obvious' assembly sequence: 1057 * 1058 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1059 * and Java on how they handle floats: 1060 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1061 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1062 * 1063 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1064 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1065 * (only useful when signs differ, noop otherwise) 1066 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1067 1068 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1069 * btmp = (b < +0.0) ? a : b 1070 * atmp = (b < +0.0) ? b : a 1071 * Tmp = Max_Float(atmp , btmp) 1072 * Res = (atmp == NaN) ? 
atmp : Tmp 1073 */ 1074 1075 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1076 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1077 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1078 XMMRegister mask; 1079 1080 if (!is_double_word && is_min) { 1081 mask = a; 1082 vblend = &MacroAssembler::vblendvps; 1083 vmaxmin = &MacroAssembler::vminps; 1084 vcmp = &MacroAssembler::vcmpps; 1085 } else if (!is_double_word && !is_min) { 1086 mask = b; 1087 vblend = &MacroAssembler::vblendvps; 1088 vmaxmin = &MacroAssembler::vmaxps; 1089 vcmp = &MacroAssembler::vcmpps; 1090 } else if (is_double_word && is_min) { 1091 mask = a; 1092 vblend = &MacroAssembler::vblendvpd; 1093 vmaxmin = &MacroAssembler::vminpd; 1094 vcmp = &MacroAssembler::vcmppd; 1095 } else { 1096 assert(is_double_word && !is_min, "sanity"); 1097 mask = b; 1098 vblend = &MacroAssembler::vblendvpd; 1099 vmaxmin = &MacroAssembler::vmaxpd; 1100 vcmp = &MacroAssembler::vcmppd; 1101 } 1102 1103 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1104 XMMRegister maxmin, scratch; 1105 if (dst == btmp) { 1106 maxmin = btmp; 1107 scratch = tmp; 1108 } else { 1109 maxmin = tmp; 1110 scratch = btmp; 1111 } 1112 1113 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1114 if (precompute_mask && !is_double_word) { 1115 vpsrad(tmp, mask, 32, vlen_enc); 1116 mask = tmp; 1117 } else if (precompute_mask && is_double_word) { 1118 vpxor(tmp, tmp, tmp, vlen_enc); 1119 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1120 mask = tmp; 1121 } 1122 1123 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1124 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1125 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1126 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1127 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1128 } 1129 1130 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1131 XMMRegister dst, XMMRegister a, XMMRegister b, 1132 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1133 int vlen_enc) { 1134 assert(UseAVX > 2, "required"); 1135 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1136 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1137 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1138 assert_different_registers(dst, a, atmp, btmp); 1139 assert_different_registers(dst, b, atmp, btmp); 1140 1141 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1142 bool is_double_word = is_double_word_type(elem_bt); 1143 bool merge = true; 1144 1145 if (!is_double_word && is_min) { 1146 evpmovd2m(ktmp, a, vlen_enc); 1147 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1148 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1149 vminps(dst, atmp, btmp, vlen_enc); 1150 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1151 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1152 } else if (!is_double_word && !is_min) { 1153 evpmovd2m(ktmp, b, vlen_enc); 1154 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1155 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1156 vmaxps(dst, atmp, btmp, vlen_enc); 1157 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1158 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1159 } else if (is_double_word && is_min) { 1160 evpmovq2m(ktmp, a, vlen_enc); 1161 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1162 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1163 
vminpd(dst, atmp, btmp, vlen_enc); 1164 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1165 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1166 } else { 1167 assert(is_double_word && !is_min, "sanity"); 1168 evpmovq2m(ktmp, b, vlen_enc); 1169 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1170 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1171 vmaxpd(dst, atmp, btmp, vlen_enc); 1172 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1173 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1174 } 1175 } 1176 1177 // Float/Double signum 1178 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1179 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1180 1181 Label DONE_LABEL; 1182 1183 if (opcode == Op_SignumF) { 1184 assert(UseSSE > 0, "required"); 1185 ucomiss(dst, zero); 1186 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1187 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1188 movflt(dst, one); 1189 jcc(Assembler::above, DONE_LABEL); 1190 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1191 } else if (opcode == Op_SignumD) { 1192 assert(UseSSE > 1, "required"); 1193 ucomisd(dst, zero); 1194 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1195 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1196 movdbl(dst, one); 1197 jcc(Assembler::above, DONE_LABEL); 1198 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1199 } 1200 1201 bind(DONE_LABEL); 1202 } 1203 1204 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1205 if (sign) { 1206 pmovsxbw(dst, src); 1207 } else { 1208 pmovzxbw(dst, src); 1209 } 1210 } 1211 1212 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1213 if (sign) { 1214 vpmovsxbw(dst, src, vector_len); 1215 } else { 1216 vpmovzxbw(dst, src, vector_len); 1217 } 1218 } 1219 1220 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1221 if (sign) { 1222 vpmovsxbd(dst, src, vector_len); 1223 } else { 1224 vpmovzxbd(dst, src, vector_len); 1225 } 1226 } 1227 1228 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1229 if (sign) { 1230 vpmovsxwd(dst, src, vector_len); 1231 } else { 1232 vpmovzxwd(dst, src, vector_len); 1233 } 1234 } 1235 1236 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1237 int shift, int vector_len) { 1238 if (opcode == Op_RotateLeftV) { 1239 if (etype == T_INT) { 1240 evprold(dst, src, shift, vector_len); 1241 } else { 1242 assert(etype == T_LONG, "expected type T_LONG"); 1243 evprolq(dst, src, shift, vector_len); 1244 } 1245 } else { 1246 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1247 if (etype == T_INT) { 1248 evprord(dst, src, shift, vector_len); 1249 } else { 1250 assert(etype == T_LONG, "expected type T_LONG"); 1251 evprorq(dst, src, shift, vector_len); 1252 } 1253 } 1254 } 1255 1256 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1257 XMMRegister shift, int vector_len) { 1258 if (opcode == Op_RotateLeftV) { 1259 if (etype == T_INT) { 1260 evprolvd(dst, src, shift, vector_len); 1261 } else { 1262 assert(etype == 
T_LONG, "expected type T_LONG"); 1263 evprolvq(dst, src, shift, vector_len); 1264 } 1265 } else { 1266 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1267 if (etype == T_INT) { 1268 evprorvd(dst, src, shift, vector_len); 1269 } else { 1270 assert(etype == T_LONG, "expected type T_LONG"); 1271 evprorvq(dst, src, shift, vector_len); 1272 } 1273 } 1274 } 1275 1276 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1277 if (opcode == Op_RShiftVI) { 1278 psrad(dst, shift); 1279 } else if (opcode == Op_LShiftVI) { 1280 pslld(dst, shift); 1281 } else { 1282 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1283 psrld(dst, shift); 1284 } 1285 } 1286 1287 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1288 switch (opcode) { 1289 case Op_RShiftVI: psrad(dst, shift); break; 1290 case Op_LShiftVI: pslld(dst, shift); break; 1291 case Op_URShiftVI: psrld(dst, shift); break; 1292 1293 default: assert(false, "%s", NodeClassNames[opcode]); 1294 } 1295 } 1296 1297 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1298 if (opcode == Op_RShiftVI) { 1299 vpsrad(dst, nds, shift, vector_len); 1300 } else if (opcode == Op_LShiftVI) { 1301 vpslld(dst, nds, shift, vector_len); 1302 } else { 1303 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1304 vpsrld(dst, nds, shift, vector_len); 1305 } 1306 } 1307 1308 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1309 switch (opcode) { 1310 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1311 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1312 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1313 1314 default: assert(false, "%s", NodeClassNames[opcode]); 1315 } 1316 } 1317 1318 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1319 switch (opcode) { 1320 case Op_RShiftVB: // fall-through 1321 case Op_RShiftVS: psraw(dst, shift); break; 1322 1323 case Op_LShiftVB: // fall-through 1324 case Op_LShiftVS: psllw(dst, shift); break; 1325 1326 case Op_URShiftVS: // fall-through 1327 case Op_URShiftVB: psrlw(dst, shift); break; 1328 1329 default: assert(false, "%s", NodeClassNames[opcode]); 1330 } 1331 } 1332 1333 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1334 switch (opcode) { 1335 case Op_RShiftVB: // fall-through 1336 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1337 1338 case Op_LShiftVB: // fall-through 1339 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1340 1341 case Op_URShiftVS: // fall-through 1342 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1343 1344 default: assert(false, "%s", NodeClassNames[opcode]); 1345 } 1346 } 1347 1348 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1349 switch (opcode) { 1350 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1351 case Op_LShiftVL: psllq(dst, shift); break; 1352 case Op_URShiftVL: psrlq(dst, shift); break; 1353 1354 default: assert(false, "%s", NodeClassNames[opcode]); 1355 } 1356 } 1357 1358 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1359 if (opcode == Op_RShiftVL) { 1360 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1361 } else if (opcode == Op_LShiftVL) { 1362 
psllq(dst, shift); 1363 } else { 1364 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1365 psrlq(dst, shift); 1366 } 1367 } 1368 1369 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1370 switch (opcode) { 1371 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1372 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1373 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1374 1375 default: assert(false, "%s", NodeClassNames[opcode]); 1376 } 1377 } 1378 1379 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1380 if (opcode == Op_RShiftVL) { 1381 evpsraq(dst, nds, shift, vector_len); 1382 } else if (opcode == Op_LShiftVL) { 1383 vpsllq(dst, nds, shift, vector_len); 1384 } else { 1385 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1386 vpsrlq(dst, nds, shift, vector_len); 1387 } 1388 } 1389 1390 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1391 switch (opcode) { 1392 case Op_RShiftVB: // fall-through 1393 case Op_RShiftVS: // fall-through 1394 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1395 1396 case Op_LShiftVB: // fall-through 1397 case Op_LShiftVS: // fall-through 1398 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1399 1400 case Op_URShiftVB: // fall-through 1401 case Op_URShiftVS: // fall-through 1402 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1403 1404 default: assert(false, "%s", NodeClassNames[opcode]); 1405 } 1406 } 1407 1408 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1409 switch (opcode) { 1410 case Op_RShiftVB: // fall-through 1411 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1412 1413 case Op_LShiftVB: // fall-through 1414 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1415 1416 case Op_URShiftVB: // fall-through 1417 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1418 1419 default: assert(false, "%s", NodeClassNames[opcode]); 1420 } 1421 } 1422 1423 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1424 assert(UseAVX >= 2, "required"); 1425 switch (opcode) { 1426 case Op_RShiftVL: { 1427 if (UseAVX > 2) { 1428 assert(tmp == xnoreg, "not used"); 1429 if (!VM_Version::supports_avx512vl()) { 1430 vlen_enc = Assembler::AVX_512bit; 1431 } 1432 evpsravq(dst, src, shift, vlen_enc); 1433 } else { 1434 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1435 vpsrlvq(dst, src, shift, vlen_enc); 1436 vpsrlvq(tmp, tmp, shift, vlen_enc); 1437 vpxor(dst, dst, tmp, vlen_enc); 1438 vpsubq(dst, dst, tmp, vlen_enc); 1439 } 1440 break; 1441 } 1442 case Op_LShiftVL: { 1443 assert(tmp == xnoreg, "not used"); 1444 vpsllvq(dst, src, shift, vlen_enc); 1445 break; 1446 } 1447 case Op_URShiftVL: { 1448 assert(tmp == xnoreg, "not used"); 1449 vpsrlvq(dst, src, shift, vlen_enc); 1450 break; 1451 } 1452 default: assert(false, "%s", NodeClassNames[opcode]); 1453 } 1454 } 1455 1456 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1457 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1458 assert(opcode == Op_LShiftVB || 1459 opcode == Op_RShiftVB || 1460 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1461 bool sign = (opcode != Op_URShiftVB); 1462 assert(vector_len == 0, "required"); 1463 vextendbd(sign, dst, src, 1); 1464 vpmovzxbd(vtmp, shift, 1); 1465 varshiftd(opcode, dst, dst, vtmp, 1); 1466 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1467 vextracti128_high(vtmp, dst); 1468 vpackusdw(dst, dst, vtmp, 0); 1469 } 1470 1471 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1472 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1473 assert(opcode == Op_LShiftVB || 1474 opcode == Op_RShiftVB || 1475 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1476 bool sign = (opcode != Op_URShiftVB); 1477 int ext_vector_len = vector_len + 1; 1478 vextendbw(sign, dst, src, ext_vector_len); 1479 vpmovzxbw(vtmp, shift, ext_vector_len); 1480 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1481 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1482 if (vector_len == 0) { 1483 vextracti128_high(vtmp, dst); 1484 vpackuswb(dst, dst, vtmp, vector_len); 1485 } else { 1486 vextracti64x4_high(vtmp, dst); 1487 vpackuswb(dst, dst, vtmp, vector_len); 1488 vpermq(dst, dst, 0xD8, vector_len); 1489 } 1490 } 1491 1492 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1493 switch(typ) { 1494 case T_BYTE: 1495 pinsrb(dst, val, idx); 1496 break; 1497 case T_SHORT: 1498 pinsrw(dst, val, idx); 1499 break; 1500 case T_INT: 1501 pinsrd(dst, val, idx); 1502 break; 1503 case T_LONG: 1504 pinsrq(dst, val, idx); 1505 break; 1506 default: 1507 assert(false,"Should not reach here."); 1508 break; 1509 } 1510 } 1511 1512 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1513 switch(typ) { 1514 case T_BYTE: 1515 vpinsrb(dst, src, val, idx); 1516 break; 1517 case T_SHORT: 1518 vpinsrw(dst, src, val, idx); 1519 break; 1520 case T_INT: 1521 vpinsrd(dst, src, val, idx); 1522 break; 1523 case T_LONG: 1524 vpinsrq(dst, src, val, idx); 1525 break; 1526 default: 1527 assert(false,"Should not reach here."); 1528 break; 1529 } 1530 } 1531 1532 #ifdef _LP64 1533 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1534 XMMRegister dst, Register base, 1535 Register idx_base, 1536 Register offset, Register mask, 1537 Register mask_idx, Register rtmp, 1538 int vlen_enc) { 1539 vpxor(dst, dst, dst, vlen_enc); 1540 if (elem_bt == T_SHORT) { 1541 for (int i = 0; i < 4; i++) { 1542 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1543 Label skip_load; 1544 btq(mask, mask_idx); 1545 jccb(Assembler::carryClear, skip_load); 1546 movl(rtmp, Address(idx_base, i * 4)); 1547 if (offset != noreg) { 1548 addl(rtmp, offset); 1549 } 1550 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1551 bind(skip_load); 1552 incq(mask_idx); 1553 } 1554 } else { 1555 assert(elem_bt == T_BYTE, ""); 1556 for (int i = 0; i < 8; i++) { 1557 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1558 Label skip_load; 1559 btq(mask, mask_idx); 1560 jccb(Assembler::carryClear, skip_load); 1561 movl(rtmp, Address(idx_base, i * 4)); 1562 if (offset != noreg) { 1563 addl(rtmp, offset); 1564 } 1565 pinsrb(dst, Address(base, rtmp), i); 1566 bind(skip_load); 1567 incq(mask_idx); 1568 } 1569 } 1570 } 1571 #endif // _LP64 1572 1573 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1574 Register base, Register idx_base, 1575 Register offset, Register rtmp, 1576 int vlen_enc) { 1577 vpxor(dst, dst, dst, vlen_enc); 1578 if (elem_bt == T_SHORT) { 1579 for (int i = 0; i < 4; i++) { 1580 // dst[i] = src[offset + idx_base[i]] 1581 movl(rtmp, Address(idx_base, i * 4)); 1582 if (offset != noreg) { 1583 addl(rtmp, offset); 1584 } 1585 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1586 } 1587 } else { 1588 assert(elem_bt == T_BYTE, ""); 1589 for (int i = 0; i < 8; i++) { 1590 // dst[i] = src[offset + idx_base[i]] 1591 movl(rtmp, Address(idx_base, i * 4)); 1592 if (offset != noreg) { 1593 addl(rtmp, offset); 1594 } 1595 pinsrb(dst, Address(base, rtmp), i); 1596 } 1597 } 1598 } 1599 1600 /* 1601 * Gather using hybrid algorithm, first partially unroll scalar loop 1602 * to accumulate values from gather indices into a quad-word(64bit) slice. 1603 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1604 * permutation to place the slice into appropriate vector lane 1605 * locations in destination vector. Following pseudo code describes the 1606 * algorithm in detail: 1607 * 1608 * DST_VEC = ZERO_VEC 1609 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1610 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1611 * FOREACH_ITER: 1612 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1613 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1614 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1615 * PERM_INDEX = PERM_INDEX - TWO_VEC 1616 * 1617 * With each iteration, doubleword permute indices (0,1) corresponding 1618 * to gathered quadword gets right shifted by two lane positions. 1619 * 1620 */ 1621 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1622 Register base, Register idx_base, 1623 Register offset, Register mask, 1624 XMMRegister xtmp1, XMMRegister xtmp2, 1625 XMMRegister temp_dst, Register rtmp, 1626 Register mask_idx, Register length, 1627 int vector_len, int vlen_enc) { 1628 Label GATHER8_LOOP; 1629 assert(is_subword_type(elem_ty), ""); 1630 movl(length, vector_len); 1631 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1632 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1633 vallones(xtmp2, vlen_enc); 1634 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1635 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1636 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1637 1638 bind(GATHER8_LOOP); 1639 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1640 if (mask == noreg) { 1641 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1642 } else { 1643 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1644 } 1645 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1646 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1647 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1648 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1649 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1650 vpor(dst, dst, temp_dst, vlen_enc); 1651 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1652 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1653 jcc(Assembler::notEqual, GATHER8_LOOP); 1654 } 1655 1656 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1657 switch(typ) { 1658 case T_INT: 1659 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1660 break; 1661 case T_FLOAT: 1662 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1663 break; 1664 case T_LONG: 1665 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1666 break; 1667 case T_DOUBLE: 1668 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1669 break; 1670 default: 1671 assert(false,"Should not reach here."); 1672 break; 1673 } 1674 } 1675 1676 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1677 switch(typ) { 1678 case T_INT: 1679 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1680 break; 1681 case T_FLOAT: 1682 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1683 break; 1684 case T_LONG: 1685 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1686 break; 1687 case T_DOUBLE: 1688 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1689 break; 1690 default: 1691 assert(false,"Should not reach here."); 1692 break; 1693 } 1694 } 1695 1696 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1697 switch(typ) { 1698 case T_INT: 1699 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1700 break; 1701 case T_FLOAT: 1702 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1703 break; 1704 case T_LONG: 1705 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1706 break; 1707 case T_DOUBLE: 1708 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1709 break; 1710 default: 1711 assert(false,"Should not reach here."); 1712 break; 1713 } 1714 } 1715 1716 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1717 if (vlen_in_bytes <= 16) { 1718 pxor (dst, dst); 1719 psubb(dst, src); 1720 switch (elem_bt) { 1721 case T_BYTE: /* nothing to do */ break; 1722 case T_SHORT: pmovsxbw(dst, dst); break; 1723 case T_INT: pmovsxbd(dst, dst); break; 1724 case T_FLOAT: pmovsxbd(dst, dst); break; 1725 case T_LONG: pmovsxbq(dst, dst); break; 1726 case T_DOUBLE: pmovsxbq(dst, dst); break; 1727 1728 default: assert(false, "%s", type2name(elem_bt)); 1729 } 1730 } else { 1731 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1732 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1733 1734 vpxor (dst, dst, dst, vlen_enc); 1735 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1736 1737 switch (elem_bt) { 1738 case T_BYTE: /* nothing to do */ break; 1739 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1740 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1741 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1742 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1743 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1744 1745 default: assert(false, "%s", type2name(elem_bt)); 1746 } 1747 } 1748 } 1749 1750 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1751 if (novlbwdq) { 1752 vpmovsxbd(xtmp, src, vlen_enc); 1753 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1754 Assembler::eq, true, vlen_enc, noreg); 1755 } else { 1756 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1757 vpsubb(xtmp, xtmp, src, vlen_enc); 1758 evpmovb2m(dst, xtmp, vlen_enc); 1759 } 1760 } 1761 1762 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1763 if (is_integral_type(bt)) { 1764 switch (vlen_in_bytes) { 1765 case 4: movdl(dst, src); break; 1766 case 8: movq(dst, src); break; 1767 case 16: movdqu(dst, src); break; 1768 case 32: vmovdqu(dst, src); break; 1769 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1770 default: ShouldNotReachHere(); 1771 } 1772 } else { 1773 switch (vlen_in_bytes) { 1774 case 4: movflt(dst, src); break; 1775 case 8: movdbl(dst, src); break; 1776 case 16: movups(dst, src); break; 1777 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1778 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1779 default: ShouldNotReachHere(); 1780 } 1781 } 1782 } 1783 1784 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1785 assert(rscratch != noreg || always_reachable(src), "missing"); 1786 1787 if (reachable(src)) { 1788 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1789 } else { 1790 lea(rscratch, src); 1791 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1792 } 1793 } 1794 1795 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1796 int vlen_enc = vector_length_encoding(vlen); 1797 if (VM_Version::supports_avx()) { 1798 if (bt == T_LONG) { 1799 if (VM_Version::supports_avx2()) { 1800 vpbroadcastq(dst, src, vlen_enc); 1801 } else { 1802 vmovddup(dst, src, vlen_enc); 1803 } 1804 } else if (bt == T_DOUBLE) { 1805 if (vlen_enc != Assembler::AVX_128bit) { 1806 vbroadcastsd(dst, src, vlen_enc, noreg); 1807 } else { 1808 vmovddup(dst, src, vlen_enc); 1809 } 1810 } else { 1811 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1812 vpbroadcastd(dst, src, vlen_enc); 1813 } else { 1814 vbroadcastss(dst, src, vlen_enc); 1815 } 1816 } 1817 } else if (VM_Version::supports_sse3()) { 1818 movddup(dst, src); 1819 } else { 1820 load_vector(bt, dst, src, vlen); 1821 } 1822 } 1823 1824 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1825 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1826 int offset = exact_log2(type2aelembytes(bt)) << 6; 1827 if (is_floating_point_type(bt)) { 1828 offset += 128; 1829 } 1830 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1831 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1832 } 1833 1834 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
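//
// The reductions below share one overall shape: the upper half of the
// vector is folded onto the lower half, the width is halved until a single
// lane remains, and that lane is combined with the incoming scalar
// accumulator (src1) and moved to the destination register. The integral
// variants fold with the element-wise operation via reduce_operation_256/128
// (the int/short add reductions largely use horizontal adds, phaddd/phaddw,
// instead), and the byte/short variants widen to int before the final
// combine. Ordered FP reductions (reduceF/reduceD) instead walk the lanes
// strictly in element order with scalar addss/mulss (addsd/mulsd); only the
// unordered* variants use the halving pattern. Rough sketch of the halving
// pattern (illustrative pseudo code only; OP, acc and N are just notation
// for the reduction operation, the lane array and the lane count):
//
//   acc[0 .. N-1] = src2
//   for (width = N; width > 1; width = width / 2)
//     for (i = 0; i < width / 2; i++)
//       acc[i] = OP(acc[i], acc[i + width / 2])
//   dst = OP(acc[0], src1)   // integral reductions; the FP variants take no src1
//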
1835 1836 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1837 int vector_len = Assembler::AVX_128bit; 1838 1839 switch (opcode) { 1840 case Op_AndReductionV: pand(dst, src); break; 1841 case Op_OrReductionV: por (dst, src); break; 1842 case Op_XorReductionV: pxor(dst, src); break; 1843 case Op_MinReductionV: 1844 switch (typ) { 1845 case T_BYTE: pminsb(dst, src); break; 1846 case T_SHORT: pminsw(dst, src); break; 1847 case T_INT: pminsd(dst, src); break; 1848 case T_LONG: assert(UseAVX > 2, "required"); 1849 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1850 default: assert(false, "wrong type"); 1851 } 1852 break; 1853 case Op_MaxReductionV: 1854 switch (typ) { 1855 case T_BYTE: pmaxsb(dst, src); break; 1856 case T_SHORT: pmaxsw(dst, src); break; 1857 case T_INT: pmaxsd(dst, src); break; 1858 case T_LONG: assert(UseAVX > 2, "required"); 1859 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1860 default: assert(false, "wrong type"); 1861 } 1862 break; 1863 case Op_AddReductionVF: addss(dst, src); break; 1864 case Op_AddReductionVD: addsd(dst, src); break; 1865 case Op_AddReductionVI: 1866 switch (typ) { 1867 case T_BYTE: paddb(dst, src); break; 1868 case T_SHORT: paddw(dst, src); break; 1869 case T_INT: paddd(dst, src); break; 1870 default: assert(false, "wrong type"); 1871 } 1872 break; 1873 case Op_AddReductionVL: paddq(dst, src); break; 1874 case Op_MulReductionVF: mulss(dst, src); break; 1875 case Op_MulReductionVD: mulsd(dst, src); break; 1876 case Op_MulReductionVI: 1877 switch (typ) { 1878 case T_SHORT: pmullw(dst, src); break; 1879 case T_INT: pmulld(dst, src); break; 1880 default: assert(false, "wrong type"); 1881 } 1882 break; 1883 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1884 evpmullq(dst, dst, src, vector_len); break; 1885 default: assert(false, "wrong opcode"); 1886 } 1887 } 1888 1889 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1890 switch (opcode) { 1891 case Op_AddReductionVF: addps(dst, src); break; 1892 case Op_AddReductionVD: addpd(dst, src); break; 1893 case Op_MulReductionVF: mulps(dst, src); break; 1894 case Op_MulReductionVD: mulpd(dst, src); break; 1895 default: assert(false, "%s", NodeClassNames[opcode]); 1896 } 1897 } 1898 1899 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1900 int vector_len = Assembler::AVX_256bit; 1901 1902 switch (opcode) { 1903 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1904 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1905 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1906 case Op_MinReductionV: 1907 switch (typ) { 1908 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1909 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1910 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1911 case T_LONG: assert(UseAVX > 2, "required"); 1912 vpminsq(dst, src1, src2, vector_len); break; 1913 default: assert(false, "wrong type"); 1914 } 1915 break; 1916 case Op_MaxReductionV: 1917 switch (typ) { 1918 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1919 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1920 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1921 case T_LONG: assert(UseAVX > 2, "required"); 1922 vpmaxsq(dst, src1, src2, vector_len); break; 1923 default: assert(false, "wrong type"); 1924 } 
1925 break; 1926 case Op_AddReductionVI: 1927 switch (typ) { 1928 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1929 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1930 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1931 default: assert(false, "wrong type"); 1932 } 1933 break; 1934 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1935 case Op_MulReductionVI: 1936 switch (typ) { 1937 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1938 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1939 default: assert(false, "wrong type"); 1940 } 1941 break; 1942 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1943 default: assert(false, "wrong opcode"); 1944 } 1945 } 1946 1947 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1948 int vector_len = Assembler::AVX_256bit; 1949 1950 switch (opcode) { 1951 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1952 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1953 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1954 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1955 default: assert(false, "%s", NodeClassNames[opcode]); 1956 } 1957 } 1958 1959 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1960 XMMRegister dst, XMMRegister src, 1961 XMMRegister vtmp1, XMMRegister vtmp2) { 1962 switch (opcode) { 1963 case Op_AddReductionVF: 1964 case Op_MulReductionVF: 1965 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1966 break; 1967 1968 case Op_AddReductionVD: 1969 case Op_MulReductionVD: 1970 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1971 break; 1972 1973 default: assert(false, "wrong opcode"); 1974 } 1975 } 1976 1977 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1978 XMMRegister dst, XMMRegister src, 1979 XMMRegister vtmp1, XMMRegister vtmp2) { 1980 switch (opcode) { 1981 case Op_AddReductionVF: 1982 case Op_MulReductionVF: 1983 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1984 break; 1985 1986 case Op_AddReductionVD: 1987 case Op_MulReductionVD: 1988 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1989 break; 1990 1991 default: assert(false, "%s", NodeClassNames[opcode]); 1992 } 1993 } 1994 1995 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1996 Register dst, Register src1, XMMRegister src2, 1997 XMMRegister vtmp1, XMMRegister vtmp2) { 1998 switch (vlen) { 1999 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2000 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2001 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2002 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2003 2004 default: assert(false, "wrong vector length"); 2005 } 2006 } 2007 2008 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2009 Register dst, Register src1, XMMRegister src2, 2010 XMMRegister vtmp1, XMMRegister vtmp2) { 2011 switch (vlen) { 2012 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2013 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2014 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2015 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2016 2017 default: assert(false, "wrong vector length"); 2018 } 2019 } 2020 2021 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2022 Register dst, Register src1, XMMRegister src2, 
2023 XMMRegister vtmp1, XMMRegister vtmp2) { 2024 switch (vlen) { 2025 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2026 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2027 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2028 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2029 2030 default: assert(false, "wrong vector length"); 2031 } 2032 } 2033 2034 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2035 Register dst, Register src1, XMMRegister src2, 2036 XMMRegister vtmp1, XMMRegister vtmp2) { 2037 switch (vlen) { 2038 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2039 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2040 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2041 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2042 2043 default: assert(false, "wrong vector length"); 2044 } 2045 } 2046 2047 #ifdef _LP64 2048 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2049 Register dst, Register src1, XMMRegister src2, 2050 XMMRegister vtmp1, XMMRegister vtmp2) { 2051 switch (vlen) { 2052 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2053 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2054 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2055 2056 default: assert(false, "wrong vector length"); 2057 } 2058 } 2059 #endif // _LP64 2060 2061 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2062 switch (vlen) { 2063 case 2: 2064 assert(vtmp2 == xnoreg, ""); 2065 reduce2F(opcode, dst, src, vtmp1); 2066 break; 2067 case 4: 2068 assert(vtmp2 == xnoreg, ""); 2069 reduce4F(opcode, dst, src, vtmp1); 2070 break; 2071 case 8: 2072 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2073 break; 2074 case 16: 2075 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2076 break; 2077 default: assert(false, "wrong vector length"); 2078 } 2079 } 2080 2081 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2082 switch (vlen) { 2083 case 2: 2084 assert(vtmp2 == xnoreg, ""); 2085 reduce2D(opcode, dst, src, vtmp1); 2086 break; 2087 case 4: 2088 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2089 break; 2090 case 8: 2091 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2092 break; 2093 default: assert(false, "wrong vector length"); 2094 } 2095 } 2096 2097 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2098 switch (vlen) { 2099 case 2: 2100 assert(vtmp1 == xnoreg, ""); 2101 assert(vtmp2 == xnoreg, ""); 2102 unorderedReduce2F(opcode, dst, src); 2103 break; 2104 case 4: 2105 assert(vtmp2 == xnoreg, ""); 2106 unorderedReduce4F(opcode, dst, src, vtmp1); 2107 break; 2108 case 8: 2109 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2110 break; 2111 case 16: 2112 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2113 break; 2114 default: assert(false, "wrong vector length"); 2115 } 2116 } 2117 2118 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2119 switch (vlen) { 2120 case 2: 2121 assert(vtmp1 == xnoreg, ""); 2122 assert(vtmp2 == xnoreg, ""); 2123 unorderedReduce2D(opcode, dst, src); 2124 break; 2125 case 4: 2126 assert(vtmp2 == xnoreg, ""); 2127 unorderedReduce4D(opcode, dst, src, vtmp1); 2128 break; 2129 case 8: 
2130 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2131 break; 2132 default: assert(false, "wrong vector length"); 2133 } 2134 } 2135 2136 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2137 if (opcode == Op_AddReductionVI) { 2138 if (vtmp1 != src2) { 2139 movdqu(vtmp1, src2); 2140 } 2141 phaddd(vtmp1, vtmp1); 2142 } else { 2143 pshufd(vtmp1, src2, 0x1); 2144 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2145 } 2146 movdl(vtmp2, src1); 2147 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2148 movdl(dst, vtmp1); 2149 } 2150 2151 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2152 if (opcode == Op_AddReductionVI) { 2153 if (vtmp1 != src2) { 2154 movdqu(vtmp1, src2); 2155 } 2156 phaddd(vtmp1, src2); 2157 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2158 } else { 2159 pshufd(vtmp2, src2, 0xE); 2160 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2161 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2162 } 2163 } 2164 2165 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2166 if (opcode == Op_AddReductionVI) { 2167 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2168 vextracti128_high(vtmp2, vtmp1); 2169 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2170 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2171 } else { 2172 vextracti128_high(vtmp1, src2); 2173 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2174 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2175 } 2176 } 2177 2178 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2179 vextracti64x4_high(vtmp2, src2); 2180 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2181 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2182 } 2183 2184 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2185 pshufd(vtmp2, src2, 0x1); 2186 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2187 movdqu(vtmp1, vtmp2); 2188 psrldq(vtmp1, 2); 2189 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2190 movdqu(vtmp2, vtmp1); 2191 psrldq(vtmp2, 1); 2192 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2193 movdl(vtmp2, src1); 2194 pmovsxbd(vtmp1, vtmp1); 2195 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2196 pextrb(dst, vtmp1, 0x0); 2197 movsbl(dst, dst); 2198 } 2199 2200 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2201 pshufd(vtmp1, src2, 0xE); 2202 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2203 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2204 } 2205 2206 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2207 vextracti128_high(vtmp2, src2); 2208 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2209 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2210 } 2211 2212 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2213 vextracti64x4_high(vtmp1, src2); 2214 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2215 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2216 } 2217 2218 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2219 pmovsxbw(vtmp2, src2); 2220 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2221 } 2222 2223 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2224 if (UseAVX > 1) { 2225 int vector_len = Assembler::AVX_256bit; 2226 vpmovsxbw(vtmp1, src2, vector_len); 2227 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2228 } else { 2229 pmovsxbw(vtmp2, src2); 2230 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2231 pshufd(vtmp2, src2, 0x1); 2232 pmovsxbw(vtmp2, src2); 2233 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2234 } 2235 } 2236 2237 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2238 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2239 int vector_len = Assembler::AVX_512bit; 2240 vpmovsxbw(vtmp1, src2, vector_len); 2241 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2242 } else { 2243 assert(UseAVX >= 2,"Should not reach here."); 2244 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2245 vextracti128_high(vtmp2, src2); 2246 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2247 } 2248 } 2249 2250 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2251 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2252 vextracti64x4_high(vtmp2, src2); 2253 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2254 } 2255 2256 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2257 if (opcode == Op_AddReductionVI) { 2258 if (vtmp1 != src2) { 2259 movdqu(vtmp1, src2); 2260 } 2261 phaddw(vtmp1, vtmp1); 2262 phaddw(vtmp1, vtmp1); 2263 } else { 2264 pshufd(vtmp2, src2, 0x1); 2265 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2266 movdqu(vtmp1, vtmp2); 2267 psrldq(vtmp1, 2); 2268 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2269 } 2270 movdl(vtmp2, src1); 2271 pmovsxwd(vtmp1, vtmp1); 2272 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2273 pextrw(dst, vtmp1, 0x0); 2274 movswl(dst, dst); 2275 } 2276 2277 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2278 if (opcode == Op_AddReductionVI) { 2279 if (vtmp1 != src2) { 2280 movdqu(vtmp1, src2); 2281 } 2282 phaddw(vtmp1, src2); 2283 } else { 2284 pshufd(vtmp1, src2, 0xE); 2285 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2286 } 2287 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2288 } 2289 2290 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2291 if (opcode == Op_AddReductionVI) { 2292 int vector_len = Assembler::AVX_256bit; 2293 vphaddw(vtmp2, src2, src2, vector_len); 2294 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2295 } else { 2296 vextracti128_high(vtmp2, src2); 2297 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2298 } 2299 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2300 } 2301 2302 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2303 int vector_len = Assembler::AVX_256bit; 2304 vextracti64x4_high(vtmp1, src2); 2305 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2306 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2307 } 2308 2309 #ifdef _LP64 2310 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2311 pshufd(vtmp2, src2, 0xE); 2312 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2313 movdq(vtmp1, src1); 2314 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2315 movdq(dst, vtmp1); 2316 } 2317 2318 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2319 vextracti128_high(vtmp1, src2); 2320 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2321 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2322 } 2323 2324 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2325 vextracti64x4_high(vtmp2, src2); 2326 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2327 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2328 } 2329 2330 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2331 mov64(temp, -1L); 2332 bzhiq(temp, temp, len); 2333 kmovql(dst, temp); 2334 } 2335 #endif // _LP64 2336 2337 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2338 reduce_operation_128(T_FLOAT, opcode, dst, src); 2339 pshufd(vtmp, src, 0x1); 2340 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2341 } 2342 2343 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2344 reduce2F(opcode, dst, src, vtmp); 2345 pshufd(vtmp, src, 0x2); 2346 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2347 pshufd(vtmp, src, 0x3); 2348 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2349 } 2350 2351 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2352 reduce4F(opcode, dst, src, vtmp2); 2353 vextractf128_high(vtmp2, src); 2354 reduce4F(opcode, dst, vtmp2, vtmp1); 2355 } 2356 2357 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2358 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2359 vextracti64x4_high(vtmp1, src); 2360 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2361 } 2362 2363 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2364 pshufd(dst, src, 0x1); 2365 reduce_operation_128(T_FLOAT, opcode, dst, src); 2366 } 2367 2368 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2369 pshufd(vtmp, src, 0xE); 2370 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2371 unorderedReduce2F(opcode, dst, vtmp); 2372 } 2373 2374 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2375 vextractf128_high(vtmp1, src); 2376 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2377 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2378 } 2379 2380 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2381 vextractf64x4_high(vtmp2, src); 2382 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2383 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2384 } 2385 2386 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2387 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2388 pshufd(vtmp, src, 0xE); 2389 
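// 0xE moves src's upper double (element 1, dwords 2-3) into the low half of
// vtmp so the scalar combine below can fold it into dst.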
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2390 } 2391 2392 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2393 reduce2D(opcode, dst, src, vtmp2); 2394 vextractf128_high(vtmp2, src); 2395 reduce2D(opcode, dst, vtmp2, vtmp1); 2396 } 2397 2398 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2399 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2400 vextracti64x4_high(vtmp1, src); 2401 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2402 } 2403 2404 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2405 pshufd(dst, src, 0xE); 2406 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2407 } 2408 2409 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2410 vextractf128_high(vtmp, src); 2411 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2412 unorderedReduce2D(opcode, dst, vtmp); 2413 } 2414 2415 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2416 vextractf64x4_high(vtmp2, src); 2417 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2418 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2419 } 2420 2421 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2422 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2423 } 2424 2425 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2426 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2427 } 2428 2429 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2430 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2431 } 2432 2433 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2434 int vec_enc) { 2435 switch(elem_bt) { 2436 case T_INT: 2437 case T_FLOAT: 2438 vmaskmovps(dst, src, mask, vec_enc); 2439 break; 2440 case T_LONG: 2441 case T_DOUBLE: 2442 vmaskmovpd(dst, src, mask, vec_enc); 2443 break; 2444 default: 2445 fatal("Unsupported type %s", type2name(elem_bt)); 2446 break; 2447 } 2448 } 2449 2450 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2451 int vec_enc) { 2452 switch(elem_bt) { 2453 case T_INT: 2454 case T_FLOAT: 2455 vmaskmovps(dst, src, mask, vec_enc); 2456 break; 2457 case T_LONG: 2458 case T_DOUBLE: 2459 vmaskmovpd(dst, src, mask, vec_enc); 2460 break; 2461 default: 2462 fatal("Unsupported type %s", type2name(elem_bt)); 2463 break; 2464 } 2465 } 2466 2467 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2468 XMMRegister dst, XMMRegister src, 2469 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2470 XMMRegister xmm_0, XMMRegister xmm_1) { 2471 const int permconst[] = {1, 14}; 2472 XMMRegister wsrc = src; 2473 XMMRegister wdst = xmm_0; 2474 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2475 2476 int vlen_enc = Assembler::AVX_128bit; 2477 if (vlen == 16) { 2478 vlen_enc = Assembler::AVX_256bit; 2479 } 2480 2481 for (int i = log2(vlen) - 1; i >=0; i--) { 2482 if (i == 0 && !is_dst_valid) { 2483 wdst = dst; 2484 } 2485 if (i == 3) { 2486 vextracti64x4_high(wtmp, wsrc); 2487 } else if (i == 2) { 2488 vextracti128_high(wtmp, wsrc); 2489 } else { // i = [0,1] 2490 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2491 } 2492 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2493 wsrc = wdst; 2494 vlen_enc = Assembler::AVX_128bit; 2495 } 2496 if (is_dst_valid) { 2497 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2498 } 2499 } 2500 2501 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2502 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2503 XMMRegister xmm_0, XMMRegister xmm_1) { 2504 XMMRegister wsrc = src; 2505 XMMRegister wdst = xmm_0; 2506 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2507 int vlen_enc = Assembler::AVX_128bit; 2508 if (vlen == 8) { 2509 vlen_enc = Assembler::AVX_256bit; 2510 } 2511 for (int i = log2(vlen) - 1; i >=0; i--) { 2512 if (i == 0 && !is_dst_valid) { 2513 wdst = dst; 2514 } 2515 if (i == 1) { 2516 vextracti128_high(wtmp, wsrc); 2517 } else if (i == 2) { 2518 vextracti64x4_high(wtmp, wsrc); 2519 } else { 2520 assert(i == 0, "%d", i); 2521 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2522 } 2523 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2524 wsrc = wdst; 2525 vlen_enc = Assembler::AVX_128bit; 2526 } 2527 if (is_dst_valid) { 2528 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2529 } 2530 } 2531 2532 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2533 switch (bt) { 2534 case T_BYTE: pextrb(dst, src, idx); break; 2535 case T_SHORT: pextrw(dst, src, idx); break; 2536 case T_INT: pextrd(dst, src, idx); break; 2537 case T_LONG: pextrq(dst, src, idx); break; 2538 2539 default: 2540 assert(false,"Should not reach here."); 2541 break; 2542 } 2543 } 2544 2545 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2546 int esize = type2aelembytes(typ); 2547 int elem_per_lane = 16/esize; 2548 int lane = elemindex / elem_per_lane; 2549 int eindex = elemindex % elem_per_lane; 2550 2551 if (lane >= 2) { 2552 assert(UseAVX > 2, "required"); 2553 vextractf32x4(dst, src, lane & 3); 2554 return dst; 2555 } else if (lane > 0) { 2556 assert(UseAVX > 0, "required"); 2557 vextractf128(dst, src, lane); 2558 return dst; 2559 } else { 2560 return src; 2561 } 2562 } 2563 2564 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2565 if (typ == T_BYTE) { 2566 movsbl(dst, dst); 2567 } else if (typ == T_SHORT) { 2568 movswl(dst, dst); 2569 } 2570 } 2571 2572 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2573 int esize = type2aelembytes(typ); 2574 int elem_per_lane = 16/esize; 2575 int eindex = elemindex % elem_per_lane; 2576 assert(is_integral_type(typ),"required"); 2577 2578 if (eindex == 0) { 2579 if (typ == T_LONG) { 2580 movq(dst, src); 2581 } else { 2582 movdl(dst, src); 2583 movsxl(typ, dst); 2584 } 2585 } else { 2586 extract(typ, dst, src, eindex); 2587 movsxl(typ, dst); 2588 } 2589 } 2590 2591 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
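// Bring the requested float/double down to element 0 of dst (shufps/psrldq
// within the 128-bit lane), then clear the bits above it: floats are masked
// with vector_32_bit_mask below, doubles are truncated by the movq.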
2592 int esize = type2aelembytes(typ); 2593 int elem_per_lane = 16/esize; 2594 int eindex = elemindex % elem_per_lane; 2595 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2596 2597 if (eindex == 0) { 2598 movq(dst, src); 2599 } else { 2600 if (typ == T_FLOAT) { 2601 if (UseAVX == 0) { 2602 movdqu(dst, src); 2603 shufps(dst, dst, eindex); 2604 } else { 2605 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2606 } 2607 } else { 2608 if (UseAVX == 0) { 2609 movdqu(dst, src); 2610 psrldq(dst, eindex*esize); 2611 } else { 2612 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2613 } 2614 movq(dst, dst); 2615 } 2616 } 2617 // Zero upper bits 2618 if (typ == T_FLOAT) { 2619 if (UseAVX == 0) { 2620 assert(vtmp != xnoreg, "required."); 2621 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2622 pand(dst, vtmp); 2623 } else { 2624 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2625 } 2626 } 2627 } 2628 2629 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2630 switch(typ) { 2631 case T_BYTE: 2632 case T_BOOLEAN: 2633 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2634 break; 2635 case T_SHORT: 2636 case T_CHAR: 2637 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2638 break; 2639 case T_INT: 2640 case T_FLOAT: 2641 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2642 break; 2643 case T_LONG: 2644 case T_DOUBLE: 2645 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2646 break; 2647 default: 2648 assert(false,"Should not reach here."); 2649 break; 2650 } 2651 } 2652 2653 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2654 assert(rscratch != noreg || always_reachable(src2), "missing"); 2655 2656 switch(typ) { 2657 case T_BOOLEAN: 2658 case T_BYTE: 2659 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2660 break; 2661 case T_CHAR: 2662 case T_SHORT: 2663 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2664 break; 2665 case T_INT: 2666 case T_FLOAT: 2667 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2668 break; 2669 case T_LONG: 2670 case T_DOUBLE: 2671 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2672 break; 2673 default: 2674 assert(false,"Should not reach here."); 2675 break; 2676 } 2677 } 2678 2679 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2680 switch(typ) { 2681 case T_BYTE: 2682 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2683 break; 2684 case T_SHORT: 2685 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2686 break; 2687 case T_INT: 2688 case T_FLOAT: 2689 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2690 break; 2691 case T_LONG: 2692 case T_DOUBLE: 2693 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2694 break; 2695 default: 2696 assert(false,"Should not reach here."); 2697 break; 2698 } 2699 } 2700 2701 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2702 
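// Set the condition flags from a packed test of src1 against src2: vtestps
// when the elements are 4 bytes or wider and AVX is available, ptest/vptest
// otherwise. For vectors shorter than 16 bytes the low part of src1 is first
// duplicated into vtmp so the unused upper bytes cannot affect the flags.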
assert(vlen_in_bytes <= 32, ""); 2703 int esize = type2aelembytes(bt); 2704 if (vlen_in_bytes == 32) { 2705 assert(vtmp == xnoreg, "required."); 2706 if (esize >= 4) { 2707 vtestps(src1, src2, AVX_256bit); 2708 } else { 2709 vptest(src1, src2, AVX_256bit); 2710 } 2711 return; 2712 } 2713 if (vlen_in_bytes < 16) { 2714 // Duplicate the lower part to fill the whole register, 2715 // Don't need to do so for src2 2716 assert(vtmp != xnoreg, "required"); 2717 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2718 pshufd(vtmp, src1, shuffle_imm); 2719 } else { 2720 assert(vtmp == xnoreg, "required"); 2721 vtmp = src1; 2722 } 2723 if (esize >= 4 && VM_Version::supports_avx()) { 2724 vtestps(vtmp, src2, AVX_128bit); 2725 } else { 2726 ptest(vtmp, src2); 2727 } 2728 } 2729 2730 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2731 #ifdef ASSERT 2732 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2733 bool is_bw_supported = VM_Version::supports_avx512bw(); 2734 if (is_bw && !is_bw_supported) { 2735 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2736 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2737 "XMM register should be 0-15"); 2738 } 2739 #endif // ASSERT 2740 switch (elem_bt) { 2741 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2742 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2743 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2744 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2745 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2746 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2747 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2748 } 2749 } 2750 2751 #ifdef _LP64 2752 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2753 assert(UseAVX >= 2, "required"); 2754 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2755 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2756 if ((UseAVX > 2) && 2757 (!is_bw || VM_Version::supports_avx512bw()) && 2758 (!is_vl || VM_Version::supports_avx512vl())) { 2759 switch (elem_bt) { 2760 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2761 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2762 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2763 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2764 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2765 } 2766 } else { 2767 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2768 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2769 switch (elem_bt) { 2770 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2771 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2772 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2773 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2774 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2775 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2776 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2777 } 2778 } 2779 } 2780 #endif 2781 2782 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2783 switch (to_elem_bt) { 2784 case T_SHORT: 2785 vpmovsxbw(dst, src, vlen_enc); 2786 break; 2787 case T_INT: 2788 
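// T_INT: sign-extend each byte lane straight to a 32-bit lane.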
vpmovsxbd(dst, src, vlen_enc); 2789 break; 2790 case T_FLOAT: 2791 vpmovsxbd(dst, src, vlen_enc); 2792 vcvtdq2ps(dst, dst, vlen_enc); 2793 break; 2794 case T_LONG: 2795 vpmovsxbq(dst, src, vlen_enc); 2796 break; 2797 case T_DOUBLE: { 2798 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2799 vpmovsxbd(dst, src, mid_vlen_enc); 2800 vcvtdq2pd(dst, dst, vlen_enc); 2801 break; 2802 } 2803 default: 2804 fatal("Unsupported type %s", type2name(to_elem_bt)); 2805 break; 2806 } 2807 } 2808 2809 //------------------------------------------------------------------------------------------- 2810 2811 // IndexOf for constant substrings with size >= 8 chars 2812 // which don't need to be loaded through stack. 2813 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2814 Register cnt1, Register cnt2, 2815 int int_cnt2, Register result, 2816 XMMRegister vec, Register tmp, 2817 int ae) { 2818 ShortBranchVerifier sbv(this); 2819 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2820 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2821 2822 // This method uses the pcmpestri instruction with bound registers 2823 // inputs: 2824 // xmm - substring 2825 // rax - substring length (elements count) 2826 // mem - scanned string 2827 // rdx - string length (elements count) 2828 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2829 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2830 // outputs: 2831 // rcx - matched index in string 2832 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2833 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2834 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2835 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2836 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2837 2838 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2839 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2840 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2841 2842 // Note, inline_string_indexOf() generates checks: 2843 // if (substr.count > string.count) return -1; 2844 // if (substr.count == 0) return 0; 2845 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2846 2847 // Load substring. 2848 if (ae == StrIntrinsicNode::UL) { 2849 pmovzxbw(vec, Address(str2, 0)); 2850 } else { 2851 movdqu(vec, Address(str2, 0)); 2852 } 2853 movl(cnt2, int_cnt2); 2854 movptr(result, str1); // string addr 2855 2856 if (int_cnt2 > stride) { 2857 jmpb(SCAN_TO_SUBSTR); 2858 2859 // Reload substr for rescan, this code 2860 // is executed only for large substrings (> 8 chars) 2861 bind(RELOAD_SUBSTR); 2862 if (ae == StrIntrinsicNode::UL) { 2863 pmovzxbw(vec, Address(str2, 0)); 2864 } else { 2865 movdqu(vec, Address(str2, 0)); 2866 } 2867 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2868 2869 bind(RELOAD_STR); 2870 // We came here after the beginning of the substring was 2871 // matched but the rest of it was not so we need to search 2872 // again. Start from the next element after the previous match. 2873 2874 // cnt2 is number of substring reminding elements and 2875 // cnt1 is number of string reminding elements when cmp failed. 
2876 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2877 subl(cnt1, cnt2); 2878 addl(cnt1, int_cnt2); 2879 movl(cnt2, int_cnt2); // Now restore cnt2 2880 2881 decrementl(cnt1); // Shift to next element 2882 cmpl(cnt1, cnt2); 2883 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2884 2885 addptr(result, (1<<scale1)); 2886 2887 } // (int_cnt2 > 8) 2888 2889 // Scan string for start of substr in 16-byte vectors 2890 bind(SCAN_TO_SUBSTR); 2891 pcmpestri(vec, Address(result, 0), mode); 2892 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2893 subl(cnt1, stride); 2894 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2895 cmpl(cnt1, cnt2); 2896 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2897 addptr(result, 16); 2898 jmpb(SCAN_TO_SUBSTR); 2899 2900 // Found a potential substr 2901 bind(FOUND_CANDIDATE); 2902 // Matched whole vector if first element matched (tmp(rcx) == 0). 2903 if (int_cnt2 == stride) { 2904 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2905 } else { // int_cnt2 > 8 2906 jccb(Assembler::overflow, FOUND_SUBSTR); 2907 } 2908 // After pcmpestri tmp(rcx) contains matched element index 2909 // Compute start addr of substr 2910 lea(result, Address(result, tmp, scale1)); 2911 2912 // Make sure string is still long enough 2913 subl(cnt1, tmp); 2914 cmpl(cnt1, cnt2); 2915 if (int_cnt2 == stride) { 2916 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2917 } else { // int_cnt2 > 8 2918 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2919 } 2920 // Left less then substring. 2921 2922 bind(RET_NOT_FOUND); 2923 movl(result, -1); 2924 jmp(EXIT); 2925 2926 if (int_cnt2 > stride) { 2927 // This code is optimized for the case when whole substring 2928 // is matched if its head is matched. 2929 bind(MATCH_SUBSTR_HEAD); 2930 pcmpestri(vec, Address(result, 0), mode); 2931 // Reload only string if does not match 2932 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2933 2934 Label CONT_SCAN_SUBSTR; 2935 // Compare the rest of substring (> 8 chars). 2936 bind(FOUND_SUBSTR); 2937 // First 8 chars are already matched. 
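// Set cnt2 = -(int_cnt2 - stride): a negative element index relative to the
// substring tail; the scan loop below advances it by 'stride' toward zero.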
2938 negptr(cnt2); 2939 addptr(cnt2, stride); 2940 2941 bind(SCAN_SUBSTR); 2942 subl(cnt1, stride); 2943 cmpl(cnt2, -stride); // Do not read beyond substring 2944 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2945 // Back-up strings to avoid reading beyond substring: 2946 // cnt1 = cnt1 - cnt2 + 8 2947 addl(cnt1, cnt2); // cnt2 is negative 2948 addl(cnt1, stride); 2949 movl(cnt2, stride); negptr(cnt2); 2950 bind(CONT_SCAN_SUBSTR); 2951 if (int_cnt2 < (int)G) { 2952 int tail_off1 = int_cnt2<<scale1; 2953 int tail_off2 = int_cnt2<<scale2; 2954 if (ae == StrIntrinsicNode::UL) { 2955 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2956 } else { 2957 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2958 } 2959 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2960 } else { 2961 // calculate index in register to avoid integer overflow (int_cnt2*2) 2962 movl(tmp, int_cnt2); 2963 addptr(tmp, cnt2); 2964 if (ae == StrIntrinsicNode::UL) { 2965 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2966 } else { 2967 movdqu(vec, Address(str2, tmp, scale2, 0)); 2968 } 2969 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2970 } 2971 // Need to reload strings pointers if not matched whole vector 2972 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2973 addptr(cnt2, stride); 2974 jcc(Assembler::negative, SCAN_SUBSTR); 2975 // Fall through if found full substring 2976 2977 } // (int_cnt2 > 8) 2978 2979 bind(RET_FOUND); 2980 // Found result if we matched full small substring. 2981 // Compute substr offset 2982 subptr(result, str1); 2983 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2984 shrl(result, 1); // index 2985 } 2986 bind(EXIT); 2987 2988 } // string_indexofC8 2989 2990 // Small strings are loaded through stack if they cross page boundary. 2991 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2992 Register cnt1, Register cnt2, 2993 int int_cnt2, Register result, 2994 XMMRegister vec, Register tmp, 2995 int ae) { 2996 ShortBranchVerifier sbv(this); 2997 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2998 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2999 3000 // 3001 // int_cnt2 is length of small (< 8 chars) constant substring 3002 // or (-1) for non constant substring in which case its length 3003 // is in cnt2 register. 3004 // 3005 // Note, inline_string_indexOf() generates checks: 3006 // if (substr.count > string.count) return -1; 3007 // if (substr.count == 0) return 0; 3008 // 3009 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3010 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3011 // This method uses the pcmpestri instruction with bound registers 3012 // inputs: 3013 // xmm - substring 3014 // rax - substring length (elements count) 3015 // mem - scanned string 3016 // rdx - string length (elements count) 3017 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3018 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3019 // outputs: 3020 // rcx - matched index in string 3021 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3022 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3023 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3024 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3025 3026 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3027 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3028 FOUND_CANDIDATE; 3029 3030 { //======================================================== 3031 // We don't know where these strings are located 3032 // and we can't read beyond them. Load them through stack. 3033 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3034 3035 movptr(tmp, rsp); // save old SP 3036 3037 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3038 if (int_cnt2 == (1>>scale2)) { // One byte 3039 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3040 load_unsigned_byte(result, Address(str2, 0)); 3041 movdl(vec, result); // move 32 bits 3042 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3043 // Not enough header space in 32-bit VM: 12+3 = 15. 3044 movl(result, Address(str2, -1)); 3045 shrl(result, 8); 3046 movdl(vec, result); // move 32 bits 3047 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3048 load_unsigned_short(result, Address(str2, 0)); 3049 movdl(vec, result); // move 32 bits 3050 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3051 movdl(vec, Address(str2, 0)); // move 32 bits 3052 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3053 movq(vec, Address(str2, 0)); // move 64 bits 3054 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3055 // Array header size is 12 bytes in 32-bit VM 3056 // + 6 bytes for 3 chars == 18 bytes, 3057 // enough space to load vec and shift. 3058 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3059 if (ae == StrIntrinsicNode::UL) { 3060 int tail_off = int_cnt2-8; 3061 pmovzxbw(vec, Address(str2, tail_off)); 3062 psrldq(vec, -2*tail_off); 3063 } 3064 else { 3065 int tail_off = int_cnt2*(1<<scale2); 3066 movdqu(vec, Address(str2, tail_off-16)); 3067 psrldq(vec, 16-tail_off); 3068 } 3069 } 3070 } else { // not constant substring 3071 cmpl(cnt2, stride); 3072 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3073 3074 // We can read beyond string if srt+16 does not cross page boundary 3075 // since heaps are aligned and mapped by pages. 3076 assert(os::vm_page_size() < (int)G, "default page should be small"); 3077 movl(result, str2); // We need only low 32 bits 3078 andl(result, ((int)os::vm_page_size()-1)); 3079 cmpl(result, ((int)os::vm_page_size()-16)); 3080 jccb(Assembler::belowEqual, CHECK_STR); 3081 3082 // Move small strings to stack to allow load 16 bytes into vec. 3083 subptr(rsp, 16); 3084 int stk_offset = wordSize-(1<<scale2); 3085 push(cnt2); 3086 3087 bind(COPY_SUBSTR); 3088 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3089 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3090 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3091 } else if (ae == StrIntrinsicNode::UU) { 3092 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3093 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3094 } 3095 decrement(cnt2); 3096 jccb(Assembler::notZero, COPY_SUBSTR); 3097 3098 pop(cnt2); 3099 movptr(str2, rsp); // New substring address 3100 } // non constant 3101 3102 bind(CHECK_STR); 3103 cmpl(cnt1, stride); 3104 jccb(Assembler::aboveEqual, BIG_STRINGS); 3105 3106 // Check cross page boundary. 
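// If a 16-byte load at str1 could run past the end of its page (the address
// falls within the last 16 bytes of the page), copy the string to the stack
// first, mirroring the substring path above.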
3107 movl(result, str1); // We need only low 32 bits 3108 andl(result, ((int)os::vm_page_size()-1)); 3109 cmpl(result, ((int)os::vm_page_size()-16)); 3110 jccb(Assembler::belowEqual, BIG_STRINGS); 3111 3112 subptr(rsp, 16); 3113 int stk_offset = -(1<<scale1); 3114 if (int_cnt2 < 0) { // not constant 3115 push(cnt2); 3116 stk_offset += wordSize; 3117 } 3118 movl(cnt2, cnt1); 3119 3120 bind(COPY_STR); 3121 if (ae == StrIntrinsicNode::LL) { 3122 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3123 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3124 } else { 3125 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3126 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3127 } 3128 decrement(cnt2); 3129 jccb(Assembler::notZero, COPY_STR); 3130 3131 if (int_cnt2 < 0) { // not constant 3132 pop(cnt2); 3133 } 3134 movptr(str1, rsp); // New string address 3135 3136 bind(BIG_STRINGS); 3137 // Load substring. 3138 if (int_cnt2 < 0) { // -1 3139 if (ae == StrIntrinsicNode::UL) { 3140 pmovzxbw(vec, Address(str2, 0)); 3141 } else { 3142 movdqu(vec, Address(str2, 0)); 3143 } 3144 push(cnt2); // substr count 3145 push(str2); // substr addr 3146 push(str1); // string addr 3147 } else { 3148 // Small (< 8 chars) constant substrings are loaded already. 3149 movl(cnt2, int_cnt2); 3150 } 3151 push(tmp); // original SP 3152 3153 } // Finished loading 3154 3155 //======================================================== 3156 // Start search 3157 // 3158 3159 movptr(result, str1); // string addr 3160 3161 if (int_cnt2 < 0) { // Only for non constant substring 3162 jmpb(SCAN_TO_SUBSTR); 3163 3164 // SP saved at sp+0 3165 // String saved at sp+1*wordSize 3166 // Substr saved at sp+2*wordSize 3167 // Substr count saved at sp+3*wordSize 3168 3169 // Reload substr for rescan; this code 3170 // is executed only for large substrings (> 8 chars) 3171 bind(RELOAD_SUBSTR); 3172 movptr(str2, Address(rsp, 2*wordSize)); 3173 movl(cnt2, Address(rsp, 3*wordSize)); 3174 if (ae == StrIntrinsicNode::UL) { 3175 pmovzxbw(vec, Address(str2, 0)); 3176 } else { 3177 movdqu(vec, Address(str2, 0)); 3178 } 3179 // We came here after the beginning of the substring was 3180 // matched but the rest of it was not, so we need to search 3181 // again. Start from the next element after the previous match. 3182 subptr(str1, result); // Restore counter 3183 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3184 shrl(str1, 1); 3185 } 3186 addl(cnt1, str1); 3187 decrementl(cnt1); // Shift to next element 3188 cmpl(cnt1, cnt2); 3189 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3190 3191 addptr(result, (1<<scale1)); 3192 } // non constant 3193 3194 // Scan string for start of substr in 16-byte vectors 3195 bind(SCAN_TO_SUBSTR); 3196 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3197 pcmpestri(vec, Address(result, 0), mode); 3198 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3199 subl(cnt1, stride); 3200 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3201 cmpl(cnt1, cnt2); 3202 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3203 addptr(result, 16); 3204 3205 bind(ADJUST_STR); 3206 cmpl(cnt1, stride); // Do not read beyond string 3207 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3208 // Back-up string to avoid reading beyond string.
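// The lea below repositions 'result' so that the final 16-byte window ends exactly at the last element of the string (result += cnt1*element_size - 16), and cnt1 is reset to a full stride for the rescan of that window.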
3209 lea(result, Address(result, cnt1, scale1, -16)); 3210 movl(cnt1, stride); 3211 jmpb(SCAN_TO_SUBSTR); 3212 3213 // Found a potential substr 3214 bind(FOUND_CANDIDATE); 3215 // After pcmpestri tmp(rcx) contains matched element index 3216 3217 // Make sure string is still long enough 3218 subl(cnt1, tmp); 3219 cmpl(cnt1, cnt2); 3220 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3221 // Left less than substring. 3222 3223 bind(RET_NOT_FOUND); 3224 movl(result, -1); 3225 jmp(CLEANUP); 3226 3227 bind(FOUND_SUBSTR); 3228 // Compute start addr of substr 3229 lea(result, Address(result, tmp, scale1)); 3230 if (int_cnt2 > 0) { // Constant substring 3231 // Repeat search for small substring (< 8 chars) 3232 // from new point without reloading substring. 3233 // Have to check that we don't read beyond string. 3234 cmpl(tmp, stride-int_cnt2); 3235 jccb(Assembler::greater, ADJUST_STR); 3236 // Fall through if matched whole substring. 3237 } else { // non constant 3238 assert(int_cnt2 == -1, "should be != 0"); 3239 3240 addl(tmp, cnt2); 3241 // Found result if we matched whole substring. 3242 cmpl(tmp, stride); 3243 jcc(Assembler::lessEqual, RET_FOUND); 3244 3245 // Repeat search for small substring (<= 8 chars) 3246 // from new point 'str1' without reloading substring. 3247 cmpl(cnt2, stride); 3248 // Have to check that we don't read beyond string. 3249 jccb(Assembler::lessEqual, ADJUST_STR); 3250 3251 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3252 // Compare the rest of substring (> 8 chars). 3253 movptr(str1, result); 3254 3255 cmpl(tmp, cnt2); 3256 // First 8 chars are already matched. 3257 jccb(Assembler::equal, CHECK_NEXT); 3258 3259 bind(SCAN_SUBSTR); 3260 pcmpestri(vec, Address(str1, 0), mode); 3261 // Need to reload string pointers if not matched whole vector 3262 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3263 3264 bind(CHECK_NEXT); 3265 subl(cnt2, stride); 3266 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3267 addptr(str1, 16); 3268 if (ae == StrIntrinsicNode::UL) { 3269 addptr(str2, 8); 3270 } else { 3271 addptr(str2, 16); 3272 } 3273 subl(cnt1, stride); 3274 cmpl(cnt2, stride); // Do not read beyond substring 3275 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3276 // Back-up strings to avoid reading beyond substring.
3277 3278 if (ae == StrIntrinsicNode::UL) { 3279 lea(str2, Address(str2, cnt2, scale2, -8)); 3280 lea(str1, Address(str1, cnt2, scale1, -16)); 3281 } else { 3282 lea(str2, Address(str2, cnt2, scale2, -16)); 3283 lea(str1, Address(str1, cnt2, scale1, -16)); 3284 } 3285 subl(cnt1, cnt2); 3286 movl(cnt2, stride); 3287 addl(cnt1, stride); 3288 bind(CONT_SCAN_SUBSTR); 3289 if (ae == StrIntrinsicNode::UL) { 3290 pmovzxbw(vec, Address(str2, 0)); 3291 } else { 3292 movdqu(vec, Address(str2, 0)); 3293 } 3294 jmp(SCAN_SUBSTR); 3295 3296 bind(RET_FOUND_LONG); 3297 movptr(str1, Address(rsp, wordSize)); 3298 } // non constant 3299 3300 bind(RET_FOUND); 3301 // Compute substr offset 3302 subptr(result, str1); 3303 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3304 shrl(result, 1); // index 3305 } 3306 bind(CLEANUP); 3307 pop(rsp); // restore SP 3308 3309 } // string_indexof 3310 3311 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3312 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3313 ShortBranchVerifier sbv(this); 3314 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3315 3316 int stride = 8; 3317 3318 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3319 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3320 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3321 FOUND_SEQ_CHAR, DONE_LABEL; 3322 3323 movptr(result, str1); 3324 if (UseAVX >= 2) { 3325 cmpl(cnt1, stride); 3326 jcc(Assembler::less, SCAN_TO_CHAR); 3327 cmpl(cnt1, 2*stride); 3328 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3329 movdl(vec1, ch); 3330 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3331 vpxor(vec2, vec2); 3332 movl(tmp, cnt1); 3333 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3334 andl(cnt1,0x0000000F); //tail count (in chars) 3335 3336 bind(SCAN_TO_16_CHAR_LOOP); 3337 vmovdqu(vec3, Address(result, 0)); 3338 vpcmpeqw(vec3, vec3, vec1, 1); 3339 vptest(vec2, vec3); 3340 jcc(Assembler::carryClear, FOUND_CHAR); 3341 addptr(result, 32); 3342 subl(tmp, 2*stride); 3343 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3344 jmp(SCAN_TO_8_CHAR); 3345 bind(SCAN_TO_8_CHAR_INIT); 3346 movdl(vec1, ch); 3347 pshuflw(vec1, vec1, 0x00); 3348 pshufd(vec1, vec1, 0); 3349 pxor(vec2, vec2); 3350 } 3351 bind(SCAN_TO_8_CHAR); 3352 cmpl(cnt1, stride); 3353 jcc(Assembler::less, SCAN_TO_CHAR); 3354 if (UseAVX < 2) { 3355 movdl(vec1, ch); 3356 pshuflw(vec1, vec1, 0x00); 3357 pshufd(vec1, vec1, 0); 3358 pxor(vec2, vec2); 3359 } 3360 movl(tmp, cnt1); 3361 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3362 andl(cnt1,0x00000007); //tail count (in chars) 3363 3364 bind(SCAN_TO_8_CHAR_LOOP); 3365 movdqu(vec3, Address(result, 0)); 3366 pcmpeqw(vec3, vec1); 3367 ptest(vec2, vec3); 3368 jcc(Assembler::carryClear, FOUND_CHAR); 3369 addptr(result, 16); 3370 subl(tmp, stride); 3371 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3372 bind(SCAN_TO_CHAR); 3373 testl(cnt1, cnt1); 3374 jcc(Assembler::zero, RET_NOT_FOUND); 3375 bind(SCAN_TO_CHAR_LOOP); 3376 load_unsigned_short(tmp, Address(result, 0)); 3377 cmpl(ch, tmp); 3378 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3379 addptr(result, 2); 3380 subl(cnt1, 1); 3381 jccb(Assembler::zero, RET_NOT_FOUND); 3382 jmp(SCAN_TO_CHAR_LOOP); 3383 3384 bind(RET_NOT_FOUND); 3385 movl(result, -1); 3386 jmpb(DONE_LABEL); 3387 3388 bind(FOUND_CHAR); 3389 if (UseAVX >= 2) { 3390 vpmovmskb(tmp, vec3); 3391 } else { 3392 pmovmskb(tmp, vec3); 3393 } 3394 bsfl(ch, tmp); 3395 addptr(result, ch); 3396 3397 bind(FOUND_SEQ_CHAR); 3398 
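// Convert the pointer to the matching char back into an index: subtract the original string base, then divide the byte offset by two (UTF-16 elements).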
subptr(result, str1); 3399 shrl(result, 1); 3400 3401 bind(DONE_LABEL); 3402 } // string_indexof_char 3403 3404 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3405 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3406 ShortBranchVerifier sbv(this); 3407 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3408 3409 int stride = 16; 3410 3411 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3412 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3413 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3414 FOUND_SEQ_CHAR, DONE_LABEL; 3415 3416 movptr(result, str1); 3417 if (UseAVX >= 2) { 3418 cmpl(cnt1, stride); 3419 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3420 cmpl(cnt1, stride*2); 3421 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3422 movdl(vec1, ch); 3423 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3424 vpxor(vec2, vec2); 3425 movl(tmp, cnt1); 3426 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3427 andl(cnt1,0x0000001F); //tail count (in chars) 3428 3429 bind(SCAN_TO_32_CHAR_LOOP); 3430 vmovdqu(vec3, Address(result, 0)); 3431 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3432 vptest(vec2, vec3); 3433 jcc(Assembler::carryClear, FOUND_CHAR); 3434 addptr(result, 32); 3435 subl(tmp, stride*2); 3436 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3437 jmp(SCAN_TO_16_CHAR); 3438 3439 bind(SCAN_TO_16_CHAR_INIT); 3440 movdl(vec1, ch); 3441 pxor(vec2, vec2); 3442 pshufb(vec1, vec2); 3443 } 3444 3445 bind(SCAN_TO_16_CHAR); 3446 cmpl(cnt1, stride); 3447 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3448 if (UseAVX < 2) { 3449 movdl(vec1, ch); 3450 pxor(vec2, vec2); 3451 pshufb(vec1, vec2); 3452 } 3453 movl(tmp, cnt1); 3454 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3455 andl(cnt1,0x0000000F); //tail count (in bytes) 3456 3457 bind(SCAN_TO_16_CHAR_LOOP); 3458 movdqu(vec3, Address(result, 0)); 3459 pcmpeqb(vec3, vec1); 3460 ptest(vec2, vec3); 3461 jcc(Assembler::carryClear, FOUND_CHAR); 3462 addptr(result, 16); 3463 subl(tmp, stride); 3464 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
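// Fewer than 16 bytes remain at this point; fall through to the byte-at-a-time tail loop below.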
3465 3466 bind(SCAN_TO_CHAR_INIT); 3467 testl(cnt1, cnt1); 3468 jcc(Assembler::zero, RET_NOT_FOUND); 3469 bind(SCAN_TO_CHAR_LOOP); 3470 load_unsigned_byte(tmp, Address(result, 0)); 3471 cmpl(ch, tmp); 3472 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3473 addptr(result, 1); 3474 subl(cnt1, 1); 3475 jccb(Assembler::zero, RET_NOT_FOUND); 3476 jmp(SCAN_TO_CHAR_LOOP); 3477 3478 bind(RET_NOT_FOUND); 3479 movl(result, -1); 3480 jmpb(DONE_LABEL); 3481 3482 bind(FOUND_CHAR); 3483 if (UseAVX >= 2) { 3484 vpmovmskb(tmp, vec3); 3485 } else { 3486 pmovmskb(tmp, vec3); 3487 } 3488 bsfl(ch, tmp); 3489 addptr(result, ch); 3490 3491 bind(FOUND_SEQ_CHAR); 3492 subptr(result, str1); 3493 3494 bind(DONE_LABEL); 3495 } // stringL_indexof_char 3496 3497 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3498 switch (eltype) { 3499 case T_BOOLEAN: return sizeof(jboolean); 3500 case T_BYTE: return sizeof(jbyte); 3501 case T_SHORT: return sizeof(jshort); 3502 case T_CHAR: return sizeof(jchar); 3503 case T_INT: return sizeof(jint); 3504 default: 3505 ShouldNotReachHere(); 3506 return -1; 3507 } 3508 } 3509 3510 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3511 switch (eltype) { 3512 // T_BOOLEAN used as surrogate for unsigned byte 3513 case T_BOOLEAN: movzbl(dst, src); break; 3514 case T_BYTE: movsbl(dst, src); break; 3515 case T_SHORT: movswl(dst, src); break; 3516 case T_CHAR: movzwl(dst, src); break; 3517 case T_INT: movl(dst, src); break; 3518 default: 3519 ShouldNotReachHere(); 3520 } 3521 } 3522 3523 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3524 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3525 } 3526 3527 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3528 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3529 } 3530 3531 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3532 const int vlen = Assembler::AVX_256bit; 3533 switch (eltype) { 3534 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3535 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3536 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3537 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3538 case T_INT: 3539 // do nothing 3540 break; 3541 default: 3542 ShouldNotReachHere(); 3543 } 3544 } 3545 3546 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3547 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3548 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3549 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3550 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3551 BasicType eltype) { 3552 ShortBranchVerifier sbv(this); 3553 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3554 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3555 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3556 3557 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3558 SHORT_UNROLLED_LOOP_EXIT, 3559 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3560 UNROLLED_VECTOR_LOOP_BEGIN, 3561 END; 3562 switch (eltype) { 3563 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3564 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3565 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3566 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3567 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3568 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3569 } 3570 3571 // For "renaming" for readibility of the code 3572 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3573 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3574 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3575 3576 const int elsize = arrays_hashcode_elsize(eltype); 3577 3578 /* 3579 if (cnt1 >= 2) { 3580 if (cnt1 >= 32) { 3581 UNROLLED VECTOR LOOP 3582 } 3583 UNROLLED SCALAR LOOP 3584 } 3585 SINGLE SCALAR 3586 */ 3587 3588 cmpl(cnt1, 32); 3589 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3590 3591 // cnt1 >= 32 && generate_vectorized_loop 3592 xorl(index, index); 3593 3594 // vresult = IntVector.zero(I256); 3595 for (int idx = 0; idx < 4; idx++) { 3596 vpxor(vresult[idx], vresult[idx]); 3597 } 3598 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3599 Register bound = tmp2; 3600 Register next = tmp3; 3601 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3602 movl(next, Address(tmp2, 0)); 3603 movdl(vnext, next); 3604 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3605 3606 // index = 0; 3607 // bound = cnt1 & ~(32 - 1); 3608 movl(bound, cnt1); 3609 andl(bound, ~(32 - 1)); 3610 // for (; index < bound; index += 32) { 3611 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3612 // result *= next; 3613 imull(result, next); 3614 // loop fission to upfront the cost of fetching from memory, OOO execution 3615 // can then hopefully do a better job of prefetching 3616 for (int idx = 0; idx < 4; idx++) { 3617 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3618 } 3619 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3620 for (int idx = 0; idx < 4; idx++) { 3621 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3622 arrays_hashcode_elvcast(vtmp[idx], eltype); 3623 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3624 } 3625 // index += 32; 3626 addl(index, 32); 3627 // index < bound; 3628 cmpl(index, bound); 3629 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3630 // } 3631 3632 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3633 subl(cnt1, bound); 3634 // release bound 3635 3636 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3637 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3638 for (int idx = 0; idx < 4; idx++) { 3639 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT); 3640 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3641 } 3642 // result += vresult.reduceLanes(ADD); 3643 for (int idx = 0; idx < 4; idx++) { 3644 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3645 } 3646 3647 // } else if (cnt1 < 32) { 3648 3649 bind(SHORT_UNROLLED_BEGIN); 3650 // int i = 1; 3651 movl(index, 1); 3652 cmpl(index, cnt1); 3653 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3654 3655 // for (; i < cnt1 ; i += 2) { 3656 bind(SHORT_UNROLLED_LOOP_BEGIN); 3657 movl(tmp3, 961); 3658 imull(result, tmp3); 
3659 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3660 movl(tmp3, tmp2); 3661 shll(tmp3, 5); 3662 subl(tmp3, tmp2); 3663 addl(result, tmp3); 3664 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3665 addl(result, tmp3); 3666 addl(index, 2); 3667 cmpl(index, cnt1); 3668 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3669 3670 // } 3671 // if (i >= cnt1) { 3672 bind(SHORT_UNROLLED_LOOP_EXIT); 3673 jccb(Assembler::greater, END); 3674 movl(tmp2, result); 3675 shll(result, 5); 3676 subl(result, tmp2); 3677 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3678 addl(result, tmp3); 3679 // } 3680 bind(END); 3681 3682 BLOCK_COMMENT("} // arrays_hashcode"); 3683 3684 } // arrays_hashcode 3685 3686 // helper function for string_compare 3687 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3688 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3689 Address::ScaleFactor scale2, Register index, int ae) { 3690 if (ae == StrIntrinsicNode::LL) { 3691 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3692 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3693 } else if (ae == StrIntrinsicNode::UU) { 3694 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3695 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3696 } else { 3697 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3698 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3699 } 3700 } 3701 3702 // Compare strings, used for char[] and byte[]. 3703 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3704 Register cnt1, Register cnt2, Register result, 3705 XMMRegister vec1, int ae, KRegister mask) { 3706 ShortBranchVerifier sbv(this); 3707 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3708 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3709 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3710 int stride2x2 = 0x40; 3711 Address::ScaleFactor scale = Address::no_scale; 3712 Address::ScaleFactor scale1 = Address::no_scale; 3713 Address::ScaleFactor scale2 = Address::no_scale; 3714 3715 if (ae != StrIntrinsicNode::LL) { 3716 stride2x2 = 0x20; 3717 } 3718 3719 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3720 shrl(cnt2, 1); 3721 } 3722 // Compute the minimum of the string lengths and the 3723 // difference of the string lengths (stack). 3724 // Do the conditional move stuff 3725 movl(result, cnt1); 3726 subl(cnt1, cnt2); 3727 push(cnt1); 3728 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3729 3730 // Is the minimum length zero? 
3731 testl(cnt2, cnt2); 3732 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3733 if (ae == StrIntrinsicNode::LL) { 3734 // Load first bytes 3735 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3736 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3737 } else if (ae == StrIntrinsicNode::UU) { 3738 // Load first characters 3739 load_unsigned_short(result, Address(str1, 0)); 3740 load_unsigned_short(cnt1, Address(str2, 0)); 3741 } else { 3742 load_unsigned_byte(result, Address(str1, 0)); 3743 load_unsigned_short(cnt1, Address(str2, 0)); 3744 } 3745 subl(result, cnt1); 3746 jcc(Assembler::notZero, POP_LABEL); 3747 3748 if (ae == StrIntrinsicNode::UU) { 3749 // Divide length by 2 to get number of chars 3750 shrl(cnt2, 1); 3751 } 3752 cmpl(cnt2, 1); 3753 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3754 3755 // Check if the strings start at the same location and setup scale and stride 3756 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3757 cmpptr(str1, str2); 3758 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3759 if (ae == StrIntrinsicNode::LL) { 3760 scale = Address::times_1; 3761 stride = 16; 3762 } else { 3763 scale = Address::times_2; 3764 stride = 8; 3765 } 3766 } else { 3767 scale1 = Address::times_1; 3768 scale2 = Address::times_2; 3769 // scale not used 3770 stride = 8; 3771 } 3772 3773 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3774 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3775 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3776 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3777 Label COMPARE_TAIL_LONG; 3778 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3779 3780 int pcmpmask = 0x19; 3781 if (ae == StrIntrinsicNode::LL) { 3782 pcmpmask &= ~0x01; 3783 } 3784 3785 // Setup to compare 16-chars (32-bytes) vectors, 3786 // start from first character again because it has aligned address. 3787 if (ae == StrIntrinsicNode::LL) { 3788 stride2 = 32; 3789 } else { 3790 stride2 = 16; 3791 } 3792 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3793 adr_stride = stride << scale; 3794 } else { 3795 adr_stride1 = 8; //stride << scale1; 3796 adr_stride2 = 16; //stride << scale2; 3797 } 3798 3799 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3800 // rax and rdx are used by pcmpestri as elements counters 3801 movl(result, cnt2); 3802 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3803 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3804 3805 // fast path : compare first 2 8-char vectors. 
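// Each pcmpestri probe below compares one stride of elements; CF==1 ('below') means a mismatching element was found and its index is left in cnt1 (rcx).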
3806 bind(COMPARE_16_CHARS); 3807 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3808 movdqu(vec1, Address(str1, 0)); 3809 } else { 3810 pmovzxbw(vec1, Address(str1, 0)); 3811 } 3812 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3813 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3814 3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3816 movdqu(vec1, Address(str1, adr_stride)); 3817 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3818 } else { 3819 pmovzxbw(vec1, Address(str1, adr_stride1)); 3820 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3821 } 3822 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3823 addl(cnt1, stride); 3824 3825 // Compare the characters at index in cnt1 3826 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3827 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3828 subl(result, cnt2); 3829 jmp(POP_LABEL); 3830 3831 // Setup the registers to start vector comparison loop 3832 bind(COMPARE_WIDE_VECTORS); 3833 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3834 lea(str1, Address(str1, result, scale)); 3835 lea(str2, Address(str2, result, scale)); 3836 } else { 3837 lea(str1, Address(str1, result, scale1)); 3838 lea(str2, Address(str2, result, scale2)); 3839 } 3840 subl(result, stride2); 3841 subl(cnt2, stride2); 3842 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3843 negptr(result); 3844 3845 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3846 bind(COMPARE_WIDE_VECTORS_LOOP); 3847 3848 #ifdef _LP64 3849 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3850 cmpl(cnt2, stride2x2); 3851 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3852 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3853 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3854 3855 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3856 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3857 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3858 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3859 } else { 3860 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3861 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3862 } 3863 kortestql(mask, mask); 3864 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3865 addptr(result, stride2x2); // update since we already compared at this addr 3866 subl(cnt2, stride2x2); // and sub the size too 3867 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3868 3869 vpxor(vec1, vec1); 3870 jmpb(COMPARE_WIDE_TAIL); 3871 }//if (VM_Version::supports_avx512vlbw()) 3872 #endif // _LP64 3873 3874 3875 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3876 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3877 vmovdqu(vec1, Address(str1, result, scale)); 3878 vpxor(vec1, Address(str2, result, scale)); 3879 } else { 3880 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3881 vpxor(vec1, Address(str2, result, scale2)); 3882 } 3883 vptest(vec1, vec1); 3884 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3885 addptr(result, stride2); 3886 subl(cnt2, stride2); 3887 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3888 // clean upper bits of YMM registers 
3889 vpxor(vec1, vec1); 3890 3891 // compare wide vectors tail 3892 bind(COMPARE_WIDE_TAIL); 3893 testptr(result, result); 3894 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3895 3896 movl(result, stride2); 3897 movl(cnt2, result); 3898 negptr(result); 3899 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3900 3901 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3902 bind(VECTOR_NOT_EQUAL); 3903 // clean upper bits of YMM registers 3904 vpxor(vec1, vec1); 3905 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3906 lea(str1, Address(str1, result, scale)); 3907 lea(str2, Address(str2, result, scale)); 3908 } else { 3909 lea(str1, Address(str1, result, scale1)); 3910 lea(str2, Address(str2, result, scale2)); 3911 } 3912 jmp(COMPARE_16_CHARS); 3913 3914 // Compare tail chars, length between 1 to 15 chars 3915 bind(COMPARE_TAIL_LONG); 3916 movl(cnt2, result); 3917 cmpl(cnt2, stride); 3918 jcc(Assembler::less, COMPARE_SMALL_STR); 3919 3920 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3921 movdqu(vec1, Address(str1, 0)); 3922 } else { 3923 pmovzxbw(vec1, Address(str1, 0)); 3924 } 3925 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3926 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3927 subptr(cnt2, stride); 3928 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3929 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3930 lea(str1, Address(str1, result, scale)); 3931 lea(str2, Address(str2, result, scale)); 3932 } else { 3933 lea(str1, Address(str1, result, scale1)); 3934 lea(str2, Address(str2, result, scale2)); 3935 } 3936 negptr(cnt2); 3937 jmpb(WHILE_HEAD_LABEL); 3938 3939 bind(COMPARE_SMALL_STR); 3940 } else if (UseSSE42Intrinsics) { 3941 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3942 int pcmpmask = 0x19; 3943 // Setup to compare 8-char (16-byte) vectors, 3944 // start from first character again because it has aligned address. 
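// result keeps the full element count for the tail handling below; cnt2 is rounded down to a multiple of stride and drives the vector loop.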
3945 movl(result, cnt2); 3946 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3947 if (ae == StrIntrinsicNode::LL) { 3948 pcmpmask &= ~0x01; 3949 } 3950 jcc(Assembler::zero, COMPARE_TAIL); 3951 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3952 lea(str1, Address(str1, result, scale)); 3953 lea(str2, Address(str2, result, scale)); 3954 } else { 3955 lea(str1, Address(str1, result, scale1)); 3956 lea(str2, Address(str2, result, scale2)); 3957 } 3958 negptr(result); 3959 3960 // pcmpestri 3961 // inputs: 3962 // vec1- substring 3963 // rax - negative string length (elements count) 3964 // mem - scanned string 3965 // rdx - string length (elements count) 3966 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3967 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3968 // outputs: 3969 // rcx - first mismatched element index 3970 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3971 3972 bind(COMPARE_WIDE_VECTORS); 3973 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3974 movdqu(vec1, Address(str1, result, scale)); 3975 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3976 } else { 3977 pmovzxbw(vec1, Address(str1, result, scale1)); 3978 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3979 } 3980 // After pcmpestri cnt1(rcx) contains mismatched element index 3981 3982 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3983 addptr(result, stride); 3984 subptr(cnt2, stride); 3985 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3986 3987 // compare wide vectors tail 3988 testptr(result, result); 3989 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3990 3991 movl(cnt2, stride); 3992 movl(result, stride); 3993 negptr(result); 3994 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3995 movdqu(vec1, Address(str1, result, scale)); 3996 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3997 } else { 3998 pmovzxbw(vec1, Address(str1, result, scale1)); 3999 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4000 } 4001 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4002 4003 // Mismatched characters in the vectors 4004 bind(VECTOR_NOT_EQUAL); 4005 addptr(cnt1, result); 4006 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4007 subl(result, cnt2); 4008 jmpb(POP_LABEL); 4009 4010 bind(COMPARE_TAIL); // limit is zero 4011 movl(cnt2, result); 4012 // Fallthru to tail compare 4013 } 4014 // Shift str2 and str1 to the end of the arrays, negate min 4015 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4016 lea(str1, Address(str1, cnt2, scale)); 4017 lea(str2, Address(str2, cnt2, scale)); 4018 } else { 4019 lea(str1, Address(str1, cnt2, scale1)); 4020 lea(str2, Address(str2, cnt2, scale2)); 4021 } 4022 decrementl(cnt2); // first character was compared already 4023 negptr(cnt2); 4024 4025 // Compare the rest of the elements 4026 bind(WHILE_HEAD_LABEL); 4027 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4028 subl(result, cnt1); 4029 jccb(Assembler::notZero, POP_LABEL); 4030 increment(cnt2); 4031 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4032 4033 // Strings are equal up to min length. Return the length difference. 
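// The value popped below is the cnt1 - cnt2 difference pushed at the start of the method; for UU it is still a byte difference, hence the extra shift to get a char count.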
4034 bind(LENGTH_DIFF_LABEL); 4035 pop(result); 4036 if (ae == StrIntrinsicNode::UU) { 4037 // Divide diff by 2 to get number of chars 4038 sarl(result, 1); 4039 } 4040 jmpb(DONE_LABEL); 4041 4042 #ifdef _LP64 4043 if (VM_Version::supports_avx512vlbw()) { 4044 4045 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4046 4047 kmovql(cnt1, mask); 4048 notq(cnt1); 4049 bsfq(cnt2, cnt1); 4050 if (ae != StrIntrinsicNode::LL) { 4051 // Divide diff by 2 to get number of chars 4052 sarl(cnt2, 1); 4053 } 4054 addq(result, cnt2); 4055 if (ae == StrIntrinsicNode::LL) { 4056 load_unsigned_byte(cnt1, Address(str2, result)); 4057 load_unsigned_byte(result, Address(str1, result)); 4058 } else if (ae == StrIntrinsicNode::UU) { 4059 load_unsigned_short(cnt1, Address(str2, result, scale)); 4060 load_unsigned_short(result, Address(str1, result, scale)); 4061 } else { 4062 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4063 load_unsigned_byte(result, Address(str1, result, scale1)); 4064 } 4065 subl(result, cnt1); 4066 jmpb(POP_LABEL); 4067 }//if (VM_Version::supports_avx512vlbw()) 4068 #endif // _LP64 4069 4070 // Discard the stored length difference 4071 bind(POP_LABEL); 4072 pop(cnt1); 4073 4074 // That's it 4075 bind(DONE_LABEL); 4076 if(ae == StrIntrinsicNode::UL) { 4077 negl(result); 4078 } 4079 4080 } 4081 4082 // Search for Non-ASCII character (Negative byte value) in a byte array, 4083 // return the index of the first such character, otherwise the length 4084 // of the array segment searched. 4085 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4086 // @IntrinsicCandidate 4087 // public static int countPositives(byte[] ba, int off, int len) { 4088 // for (int i = off; i < off + len; i++) { 4089 // if (ba[i] < 0) { 4090 // return i - off; 4091 // } 4092 // } 4093 // return len; 4094 // } 4095 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4096 Register result, Register tmp1, 4097 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4098 // rsi: byte array 4099 // rcx: len 4100 // rax: result 4101 ShortBranchVerifier sbv(this); 4102 assert_different_registers(ary1, len, result, tmp1); 4103 assert_different_registers(vec1, vec2); 4104 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4105 4106 movl(result, len); // copy 4107 // len == 0 4108 testl(len, len); 4109 jcc(Assembler::zero, DONE); 4110 4111 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4112 VM_Version::supports_avx512vlbw() && 4113 VM_Version::supports_bmi2()) { 4114 4115 Label test_64_loop, test_tail, BREAK_LOOP; 4116 movl(tmp1, len); 4117 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4118 4119 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4120 andl(len, 0xffffffc0); // vector count (in chars) 4121 jccb(Assembler::zero, test_tail); 4122 4123 lea(ary1, Address(ary1, len, Address::times_1)); 4124 negptr(len); 4125 4126 bind(test_64_loop); 4127 // Check whether our 64 elements of size byte contain negatives 4128 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4129 kortestql(mask1, mask1); 4130 jcc(Assembler::notZero, BREAK_LOOP); 4131 4132 addptr(len, 64); 4133 jccb(Assembler::notZero, test_64_loop); 4134 4135 bind(test_tail); 4136 // bail out when there is nothing to be done 4137 testl(tmp1, -1); 4138 jcc(Assembler::zero, DONE); 4139 4140 4141 // check the tail for absense of negatives 4142 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4143 #ifdef _LP64 4144 { 4145 
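// Build ~(~0 << tmp1), i.e. a mask with the low tmp1 bits set, and move it into mask2 so that the masked 64-byte compare below only looks at the tail bytes.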
Register tmp3_aliased = len; 4146 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4147 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4148 notq(tmp3_aliased); 4149 kmovql(mask2, tmp3_aliased); 4150 } 4151 #else 4152 Label k_init; 4153 jmp(k_init); 4154 4155 // We cannot read 64 bits from a general purpose register, thus we move 4156 // the data required to compose 64 1's to the instruction stream 4157 // We emit a 64-byte wide series of elements from 0..63 which later on will 4158 // be used as compare targets against the tail count contained in the tmp1 register. 4159 // The result is a k register having tmp1 consecutive 1's, 4160 // counting from the least significant bit. 4161 address tmp = pc(); 4162 emit_int64(0x0706050403020100); 4163 emit_int64(0x0F0E0D0C0B0A0908); 4164 emit_int64(0x1716151413121110); 4165 emit_int64(0x1F1E1D1C1B1A1918); 4166 emit_int64(0x2726252423222120); 4167 emit_int64(0x2F2E2D2C2B2A2928); 4168 emit_int64(0x3736353433323130); 4169 emit_int64(0x3F3E3D3C3B3A3938); 4170 4171 bind(k_init); 4172 lea(len, InternalAddress(tmp)); 4173 // create mask to test for negative byte inside a vector 4174 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4175 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4176 4177 #endif 4178 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4179 ktestq(mask1, mask2); 4180 jcc(Assembler::zero, DONE); 4181 4182 // do a full check for negative bytes in the tail 4183 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len; 4184 // ary1 already pointing to the right place 4185 jmpb(TAIL_START); 4186 4187 bind(BREAK_LOOP); 4188 // At least one byte in the last 64-byte block was negative. 4189 // Set up to look at the last 64 bytes as if they were a tail 4190 lea(ary1, Address(ary1, len, Address::times_1)); 4191 addptr(result, len); 4192 // Ignore the very last byte: if all others are positive, 4193 // it must be negative, so we can skip right to the 2+1 byte 4194 // end comparison at this point 4195 orl(result, 63); 4196 movl(len, 63); 4197 // Fallthru to tail compare 4198 } else { 4199 4200 if (UseAVX >= 2 && UseSSE >= 2) { 4201 // With AVX2, use 32-byte vector compare 4202 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4203 4204 // Compare 32-byte vectors 4205 testl(len, 0xffffffe0); // vector count (in bytes) 4206 jccb(Assembler::zero, TAIL_START); 4207 4208 andl(len, 0xffffffe0); 4209 lea(ary1, Address(ary1, len, Address::times_1)); 4210 negptr(len); 4211 4212 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector 4213 movdl(vec2, tmp1); 4214 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4215 4216 bind(COMPARE_WIDE_VECTORS); 4217 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4218 vptest(vec1, vec2); 4219 jccb(Assembler::notZero, BREAK_LOOP); 4220 addptr(len, 32); 4221 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4222 4223 testl(result, 0x0000001f); // any bytes remaining? 4224 jcc(Assembler::zero, DONE); 4225 4226 // Quick test using the already prepared vector mask 4227 movl(len, result); 4228 andl(len, 0x0000001f); 4229 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4230 vptest(vec1, vec2); 4231 jcc(Assembler::zero, DONE); 4232 // There are negative bytes, jump to the tail to determine exactly where 4233 jmpb(TAIL_START); 4234 4235 bind(BREAK_LOOP); 4236 // At least one byte in the last 32-byte vector is negative.
4237 // Set up to look at the last 32 bytes as if they were a tail 4238 lea(ary1, Address(ary1, len, Address::times_1)); 4239 addptr(result, len); 4240 // Ignore the very last byte: if all others are positive, 4241 // it must be negative, so we can skip right to the 2+1 byte 4242 // end comparison at this point 4243 orl(result, 31); 4244 movl(len, 31); 4245 // Fallthru to tail compare 4246 } else if (UseSSE42Intrinsics) { 4247 // With SSE4.2, use double quad vector compare 4248 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4249 4250 // Compare 16-byte vectors 4251 testl(len, 0xfffffff0); // vector count (in bytes) 4252 jcc(Assembler::zero, TAIL_START); 4253 4254 andl(len, 0xfffffff0); 4255 lea(ary1, Address(ary1, len, Address::times_1)); 4256 negptr(len); 4257 4258 movl(tmp1, 0x80808080); 4259 movdl(vec2, tmp1); 4260 pshufd(vec2, vec2, 0); 4261 4262 bind(COMPARE_WIDE_VECTORS); 4263 movdqu(vec1, Address(ary1, len, Address::times_1)); 4264 ptest(vec1, vec2); 4265 jccb(Assembler::notZero, BREAK_LOOP); 4266 addptr(len, 16); 4267 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4268 4269 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4270 jcc(Assembler::zero, DONE); 4271 4272 // Quick test using the already prepared vector mask 4273 movl(len, result); 4274 andl(len, 0x0000000f); // tail count (in bytes) 4275 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4276 ptest(vec1, vec2); 4277 jcc(Assembler::zero, DONE); 4278 jmpb(TAIL_START); 4279 4280 bind(BREAK_LOOP); 4281 // At least one byte in the last 16-byte vector is negative. 4282 // Set up and look at the last 16 bytes as if they were a tail 4283 lea(ary1, Address(ary1, len, Address::times_1)); 4284 addptr(result, len); 4285 // Ignore the very last byte: if all others are positive, 4286 // it must be negative, so we can skip right to the 2+1 byte 4287 // end comparison at this point 4288 orl(result, 15); 4289 movl(len, 15); 4290 // Fallthru to tail compare 4291 } 4292 } 4293 4294 bind(TAIL_START); 4295 // Compare 4-byte vectors 4296 andl(len, 0xfffffffc); // vector count (in bytes) 4297 jccb(Assembler::zero, COMPARE_CHAR); 4298 4299 lea(ary1, Address(ary1, len, Address::times_1)); 4300 negptr(len); 4301 4302 bind(COMPARE_VECTORS); 4303 movl(tmp1, Address(ary1, len, Address::times_1)); 4304 andl(tmp1, 0x80808080); 4305 jccb(Assembler::notZero, TAIL_ADJUST); 4306 addptr(len, 4); 4307 jccb(Assembler::notZero, COMPARE_VECTORS); 4308 4309 // Compare trailing char (final 2-3 bytes), if any 4310 bind(COMPARE_CHAR); 4311 4312 testl(result, 0x2); // tail char 4313 jccb(Assembler::zero, COMPARE_BYTE); 4314 load_unsigned_short(tmp1, Address(ary1, 0)); 4315 andl(tmp1, 0x00008080); 4316 jccb(Assembler::notZero, CHAR_ADJUST); 4317 lea(ary1, Address(ary1, 2)); 4318 4319 bind(COMPARE_BYTE); 4320 testl(result, 0x1); // tail byte 4321 jccb(Assembler::zero, DONE); 4322 load_unsigned_byte(tmp1, Address(ary1, 0)); 4323 testl(tmp1, 0x00000080); 4324 jccb(Assembler::zero, DONE); 4325 subptr(result, 1); 4326 jmpb(DONE); 4327 4328 bind(TAIL_ADJUST); 4329 // there are negative bits in the last 4 byte block. 4330 // Adjust result and check the next three bytes 4331 addptr(result, len); 4332 orl(result, 3); 4333 lea(ary1, Address(ary1, len, Address::times_1)); 4334 jmpb(COMPARE_CHAR); 4335 4336 bind(CHAR_ADJUST); 4337 // We are looking at a char + optional byte tail, and found that one 4338 // of the bytes in the char is negative. Adjust the result, check the 4339 // first byte and readjust if needed. 
4340 andl(result, 0xfffffffc); 4341 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4342 jccb(Assembler::notZero, DONE); 4343 addptr(result, 1); 4344 4345 // That's it 4346 bind(DONE); 4347 if (UseAVX >= 2 && UseSSE >= 2) { 4348 // clean upper bits of YMM registers 4349 vpxor(vec1, vec1); 4350 vpxor(vec2, vec2); 4351 } 4352 } 4353 4354 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4355 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4356 Register limit, Register result, Register chr, 4357 XMMRegister vec1, XMMRegister vec2, bool is_char, 4358 KRegister mask, bool expand_ary2) { 4359 // for expand_ary2, limit is the (smaller) size of the second array. 4360 ShortBranchVerifier sbv(this); 4361 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4362 4363 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4364 "Expansion only implemented for AVX2"); 4365 4366 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4367 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4368 4369 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4370 int scaleIncr = expand_ary2 ? 8 : 16; 4371 4372 if (is_array_equ) { 4373 // Check the input args 4374 cmpoop(ary1, ary2); 4375 jcc(Assembler::equal, TRUE_LABEL); 4376 4377 // Need additional checks for arrays_equals. 4378 testptr(ary1, ary1); 4379 jcc(Assembler::zero, FALSE_LABEL); 4380 testptr(ary2, ary2); 4381 jcc(Assembler::zero, FALSE_LABEL); 4382 4383 // Check the lengths 4384 movl(limit, Address(ary1, length_offset)); 4385 cmpl(limit, Address(ary2, length_offset)); 4386 jcc(Assembler::notEqual, FALSE_LABEL); 4387 } 4388 4389 // count == 0 4390 testl(limit, limit); 4391 jcc(Assembler::zero, TRUE_LABEL); 4392 4393 if (is_array_equ) { 4394 // Load array address 4395 lea(ary1, Address(ary1, base_offset)); 4396 lea(ary2, Address(ary2, base_offset)); 4397 } 4398 4399 if (is_array_equ && is_char) { 4400 // arrays_equals when used for char[]. 
4401 shll(limit, 1); // byte count != 0 4402 } 4403 movl(result, limit); // copy 4404 4405 if (UseAVX >= 2) { 4406 // With AVX2, use 32-byte vector compare 4407 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4408 4409 // Compare 32-byte vectors 4410 if (expand_ary2) { 4411 andl(result, 0x0000000f); // tail count (in bytes) 4412 andl(limit, 0xfffffff0); // vector count (in bytes) 4413 jcc(Assembler::zero, COMPARE_TAIL); 4414 } else { 4415 andl(result, 0x0000001f); // tail count (in bytes) 4416 andl(limit, 0xffffffe0); // vector count (in bytes) 4417 jcc(Assembler::zero, COMPARE_TAIL_16); 4418 } 4419 4420 lea(ary1, Address(ary1, limit, scaleFactor)); 4421 lea(ary2, Address(ary2, limit, Address::times_1)); 4422 negptr(limit); 4423 4424 #ifdef _LP64 4425 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4426 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4427 4428 cmpl(limit, -64); 4429 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4430 4431 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4432 4433 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4434 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4435 kortestql(mask, mask); 4436 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4437 addptr(limit, 64); // update since we already compared at this addr 4438 cmpl(limit, -64); 4439 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4440 4441 // At this point we may still need to compare -limit+result bytes. 4442 // We could execute the next two instruction and just continue via non-wide path: 4443 // cmpl(limit, 0); 4444 // jcc(Assembler::equal, COMPARE_TAIL); // true 4445 // But since we stopped at the points ary{1,2}+limit which are 4446 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4447 // (|limit| <= 32 and result < 32), 4448 // we may just compare the last 64 bytes. 
4449 // 4450 addptr(result, -64); // it is safe, bc we just came from this area 4451 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4452 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4453 kortestql(mask, mask); 4454 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4455 4456 jmp(TRUE_LABEL); 4457 4458 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4459 4460 }//if (VM_Version::supports_avx512vlbw()) 4461 #endif //_LP64 4462 bind(COMPARE_WIDE_VECTORS); 4463 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4464 if (expand_ary2) { 4465 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4466 } else { 4467 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4468 } 4469 vpxor(vec1, vec2); 4470 4471 vptest(vec1, vec1); 4472 jcc(Assembler::notZero, FALSE_LABEL); 4473 addptr(limit, scaleIncr * 2); 4474 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4475 4476 testl(result, result); 4477 jcc(Assembler::zero, TRUE_LABEL); 4478 4479 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4480 if (expand_ary2) { 4481 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4482 } else { 4483 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4484 } 4485 vpxor(vec1, vec2); 4486 4487 vptest(vec1, vec1); 4488 jcc(Assembler::notZero, FALSE_LABEL); 4489 jmp(TRUE_LABEL); 4490 4491 bind(COMPARE_TAIL_16); // limit is zero 4492 movl(limit, result); 4493 4494 // Compare 16-byte chunks 4495 andl(result, 0x0000000f); // tail count (in bytes) 4496 andl(limit, 0xfffffff0); // vector count (in bytes) 4497 jcc(Assembler::zero, COMPARE_TAIL); 4498 4499 lea(ary1, Address(ary1, limit, scaleFactor)); 4500 lea(ary2, Address(ary2, limit, Address::times_1)); 4501 negptr(limit); 4502 4503 bind(COMPARE_WIDE_VECTORS_16); 4504 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4505 if (expand_ary2) { 4506 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4507 } else { 4508 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4509 } 4510 pxor(vec1, vec2); 4511 4512 ptest(vec1, vec1); 4513 jcc(Assembler::notZero, FALSE_LABEL); 4514 addptr(limit, scaleIncr); 4515 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4516 4517 bind(COMPARE_TAIL); // limit is zero 4518 movl(limit, result); 4519 // Fallthru to tail compare 4520 } else if (UseSSE42Intrinsics) { 4521 // With SSE4.2, use double quad vector compare 4522 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4523 4524 // Compare 16-byte vectors 4525 andl(result, 0x0000000f); // tail count (in bytes) 4526 andl(limit, 0xfffffff0); // vector count (in bytes) 4527 jcc(Assembler::zero, COMPARE_TAIL); 4528 4529 lea(ary1, Address(ary1, limit, Address::times_1)); 4530 lea(ary2, Address(ary2, limit, Address::times_1)); 4531 negptr(limit); 4532 4533 bind(COMPARE_WIDE_VECTORS); 4534 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4535 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4536 pxor(vec1, vec2); 4537 4538 ptest(vec1, vec1); 4539 jcc(Assembler::notZero, FALSE_LABEL); 4540 addptr(limit, 16); 4541 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4542 4543 testl(result, result); 4544 jcc(Assembler::zero, TRUE_LABEL); 4545 4546 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4547 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4548 pxor(vec1, vec2); 4549 4550 ptest(vec1, vec1); 4551 jccb(Assembler::notZero, FALSE_LABEL); 4552 jmpb(TRUE_LABEL); 4553 4554 bind(COMPARE_TAIL); // limit is zero 4555 
movl(limit, result); 4556 // Fallthru to tail compare 4557 } 4558 4559 // Compare 4-byte vectors 4560 if (expand_ary2) { 4561 testl(result, result); 4562 jccb(Assembler::zero, TRUE_LABEL); 4563 } else { 4564 andl(limit, 0xfffffffc); // vector count (in bytes) 4565 jccb(Assembler::zero, COMPARE_CHAR); 4566 } 4567 4568 lea(ary1, Address(ary1, limit, scaleFactor)); 4569 lea(ary2, Address(ary2, limit, Address::times_1)); 4570 negptr(limit); 4571 4572 bind(COMPARE_VECTORS); 4573 if (expand_ary2) { 4574 // There are no "vector" operations for bytes to shorts 4575 movzbl(chr, Address(ary2, limit, Address::times_1)); 4576 cmpw(Address(ary1, limit, Address::times_2), chr); 4577 jccb(Assembler::notEqual, FALSE_LABEL); 4578 addptr(limit, 1); 4579 jcc(Assembler::notZero, COMPARE_VECTORS); 4580 jmp(TRUE_LABEL); 4581 } else { 4582 movl(chr, Address(ary1, limit, Address::times_1)); 4583 cmpl(chr, Address(ary2, limit, Address::times_1)); 4584 jccb(Assembler::notEqual, FALSE_LABEL); 4585 addptr(limit, 4); 4586 jcc(Assembler::notZero, COMPARE_VECTORS); 4587 } 4588 4589 // Compare trailing char (final 2 bytes), if any 4590 bind(COMPARE_CHAR); 4591 testl(result, 0x2); // tail char 4592 jccb(Assembler::zero, COMPARE_BYTE); 4593 load_unsigned_short(chr, Address(ary1, 0)); 4594 load_unsigned_short(limit, Address(ary2, 0)); 4595 cmpl(chr, limit); 4596 jccb(Assembler::notEqual, FALSE_LABEL); 4597 4598 if (is_array_equ && is_char) { 4599 bind(COMPARE_BYTE); 4600 } else { 4601 lea(ary1, Address(ary1, 2)); 4602 lea(ary2, Address(ary2, 2)); 4603 4604 bind(COMPARE_BYTE); 4605 testl(result, 0x1); // tail byte 4606 jccb(Assembler::zero, TRUE_LABEL); 4607 load_unsigned_byte(chr, Address(ary1, 0)); 4608 load_unsigned_byte(limit, Address(ary2, 0)); 4609 cmpl(chr, limit); 4610 jccb(Assembler::notEqual, FALSE_LABEL); 4611 } 4612 bind(TRUE_LABEL); 4613 movl(result, 1); // return true 4614 jmpb(DONE); 4615 4616 bind(FALSE_LABEL); 4617 xorl(result, result); // return false 4618 4619 // That's it 4620 bind(DONE); 4621 if (UseAVX >= 2) { 4622 // clean upper bits of YMM registers 4623 vpxor(vec1, vec1); 4624 vpxor(vec2, vec2); 4625 } 4626 } 4627 4628 #ifdef _LP64 4629 4630 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4631 #define __ masm. 
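// Entered when the fast-path cvttss2si/cvttsd2si in convertF2I produced the 'integer indefinite' pattern (0x80000000, or the value at StubRoutines::x86::double_sign_flip() for longs), which can mean NaN, an out-of-range input, or a genuine MIN_VALUE. The source value is spilled to a fresh stack slot, the matching fixup routine is called, and the slot is popped back into dst before jumping to the continuation.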
4632 Register dst = stub.data<0>(); 4633 XMMRegister src = stub.data<1>(); 4634 address target = stub.data<2>(); 4635 __ bind(stub.entry()); 4636 __ subptr(rsp, 8); 4637 __ movdbl(Address(rsp), src); 4638 __ call(RuntimeAddress(target)); 4639 __ pop(dst); 4640 __ jmp(stub.continuation()); 4641 #undef __ 4642 } 4643 4644 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4645 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4646 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4647 4648 address slowpath_target; 4649 if (dst_bt == T_INT) { 4650 if (src_bt == T_FLOAT) { 4651 cvttss2sil(dst, src); 4652 cmpl(dst, 0x80000000); 4653 slowpath_target = StubRoutines::x86::f2i_fixup(); 4654 } else { 4655 cvttsd2sil(dst, src); 4656 cmpl(dst, 0x80000000); 4657 slowpath_target = StubRoutines::x86::d2i_fixup(); 4658 } 4659 } else { 4660 if (src_bt == T_FLOAT) { 4661 cvttss2siq(dst, src); 4662 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4663 slowpath_target = StubRoutines::x86::f2l_fixup(); 4664 } else { 4665 cvttsd2siq(dst, src); 4666 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4667 slowpath_target = StubRoutines::x86::d2l_fixup(); 4668 } 4669 } 4670 4671 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4672 jcc(Assembler::equal, stub->entry()); 4673 bind(stub->continuation()); 4674 } 4675 4676 #endif // _LP64 4677 4678 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4679 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4680 switch(ideal_opc) { 4681 case Op_LShiftVS: 4682 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4683 case Op_LShiftVI: 4684 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4685 case Op_LShiftVL: 4686 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4687 case Op_RShiftVS: 4688 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4689 case Op_RShiftVI: 4690 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4691 case Op_RShiftVL: 4692 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4693 case Op_URShiftVS: 4694 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4695 case Op_URShiftVI: 4696 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4697 case Op_URShiftVL: 4698 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4699 case Op_RotateRightV: 4700 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4701 case Op_RotateLeftV: 4702 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4703 default: 4704 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4705 break; 4706 } 4707 } 4708 4709 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4710 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4711 if (is_unsigned) { 4712 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4713 } else { 4714 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4715 } 4716 } 4717 4718 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4719 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4720 switch (elem_bt) { 4721 case T_BYTE: 4722 if (ideal_opc 
== Op_SaturatingAddV) { 4723 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4724 } else { 4725 assert(ideal_opc == Op_SaturatingSubV, ""); 4726 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4727 } 4728 break; 4729 case T_SHORT: 4730 if (ideal_opc == Op_SaturatingAddV) { 4731 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4732 } else { 4733 assert(ideal_opc == Op_SaturatingSubV, ""); 4734 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4735 } 4736 break; 4737 default: 4738 fatal("Unsupported type %s", type2name(elem_bt)); 4739 break; 4740 } 4741 } 4742 4743 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4744 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4745 switch (elem_bt) { 4746 case T_BYTE: 4747 if (ideal_opc == Op_SaturatingAddV) { 4748 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4749 } else { 4750 assert(ideal_opc == Op_SaturatingSubV, ""); 4751 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4752 } 4753 break; 4754 case T_SHORT: 4755 if (ideal_opc == Op_SaturatingAddV) { 4756 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4757 } else { 4758 assert(ideal_opc == Op_SaturatingSubV, ""); 4759 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4760 } 4761 break; 4762 default: 4763 fatal("Unsupported type %s", type2name(elem_bt)); 4764 break; 4765 } 4766 } 4767 4768 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4769 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4770 if (is_unsigned) { 4771 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4772 } else { 4773 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4774 } 4775 } 4776 4777 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4778 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4779 switch (elem_bt) { 4780 case T_BYTE: 4781 if (ideal_opc == Op_SaturatingAddV) { 4782 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4783 } else { 4784 assert(ideal_opc == Op_SaturatingSubV, ""); 4785 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4786 } 4787 break; 4788 case T_SHORT: 4789 if (ideal_opc == Op_SaturatingAddV) { 4790 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4791 } else { 4792 assert(ideal_opc == Op_SaturatingSubV, ""); 4793 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4794 } 4795 break; 4796 default: 4797 fatal("Unsupported type %s", type2name(elem_bt)); 4798 break; 4799 } 4800 } 4801 4802 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4803 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4804 switch (elem_bt) { 4805 case T_BYTE: 4806 if (ideal_opc == Op_SaturatingAddV) { 4807 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4808 } else { 4809 assert(ideal_opc == Op_SaturatingSubV, ""); 4810 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4811 } 4812 break; 4813 case T_SHORT: 4814 if (ideal_opc == Op_SaturatingAddV) { 4815 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4816 } else { 4817 assert(ideal_opc == Op_SaturatingSubV, ""); 4818 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4819 } 4820 break; 4821 default: 4822 fatal("Unsupported type %s", type2name(elem_bt)); 4823 break; 4824 } 4825 } 4826 4827 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4828 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4829 bool is_varshift) { 4830 switch (ideal_opc) { 4831 case Op_AddVB: 4832 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4833 case Op_AddVS: 4834 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4835 case Op_AddVI: 4836 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4837 case Op_AddVL: 4838 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4839 case Op_AddVF: 4840 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4841 case Op_AddVD: 4842 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4843 case Op_SubVB: 4844 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4845 case Op_SubVS: 4846 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4847 case Op_SubVI: 4848 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4849 case Op_SubVL: 4850 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4851 case Op_SubVF: 4852 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4853 case Op_SubVD: 4854 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4855 case Op_MulVS: 4856 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4857 case Op_MulVI: 4858 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4859 case Op_MulVL: 4860 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4861 case Op_MulVF: 4862 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4863 case Op_MulVD: 4864 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4865 case Op_DivVF: 4866 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4867 case Op_DivVD: 4868 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4869 case Op_SqrtVF: 4870 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4871 case Op_SqrtVD: 4872 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4873 case Op_AbsVB: 4874 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4875 case Op_AbsVS: 4876 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4877 case Op_AbsVI: 4878 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4879 case Op_AbsVL: 4880 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4881 case Op_FmaVF: 4882 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_FmaVD: 4884 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4885 case Op_VectorRearrange: 4886 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4887 case Op_LShiftVS: 4888 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4889 case Op_LShiftVI: 4890 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4891 case Op_LShiftVL: 4892 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4893 case Op_RShiftVS: 4894 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4895 case Op_RShiftVI: 4896 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4897 case Op_RShiftVL: 4898 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4899 case Op_URShiftVS: 4900 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4901 case Op_URShiftVI: 4902 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4903 case Op_URShiftVL: 4904 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4905 case Op_RotateLeftV: 4906 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4907 case Op_RotateRightV: 4908 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4909 case Op_MaxV: 4910 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4911 case Op_MinV: 4912 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4913 case Op_UMinV: 4914 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4915 case Op_UMaxV: 4916 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4917 case Op_XorV: 4918 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4919 case Op_OrV: 4920 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4921 case Op_AndV: 4922 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4923 default: 4924 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4925 break; 4926 } 4927 } 4928 4929 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4930 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4931 switch (ideal_opc) { 4932 case Op_AddVB: 4933 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4934 case Op_AddVS: 4935 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4936 case Op_AddVI: 4937 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4938 case Op_AddVL: 4939 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4940 case Op_AddVF: 4941 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4942 case Op_AddVD: 4943 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4944 case Op_SubVB: 4945 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4946 case Op_SubVS: 4947 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4948 case Op_SubVI: 4949 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4950 case Op_SubVL: 4951 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4952 case Op_SubVF: 4953 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4954 case Op_SubVD: 4955 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4956 case Op_MulVS: 4957 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4958 case Op_MulVI: 4959 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4960 case Op_MulVL: 4961 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4962 case Op_MulVF: 4963 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4964 case Op_MulVD: 4965 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4966 case Op_DivVF: 4967 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4968 case Op_DivVD: 4969 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4970 case Op_FmaVF: 4971 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4972 case Op_FmaVD: 4973 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4974 case Op_MaxV: 4975 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4976 case Op_MinV: 4977 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4978 case Op_UMaxV: 4979 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4980 case Op_UMinV: 4981 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4982 case Op_XorV: 4983 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4984 case Op_OrV: 4985 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4986 case Op_AndV: 4987 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4988 default: 4989 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4990 break; 4991 } 4992 } 4993 4994 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4995 KRegister src1, KRegister src2) { 4996 BasicType etype = T_ILLEGAL; 4997 switch(mask_len) { 4998 case 2: 4999 case 4: 5000 case 8: etype = T_BYTE; 
break; 5001 case 16: etype = T_SHORT; break; 5002 case 32: etype = T_INT; break; 5003 case 64: etype = T_LONG; break; 5004 default: fatal("Unsupported type"); break; 5005 } 5006 assert(etype != T_ILLEGAL, ""); 5007 switch(ideal_opc) { 5008 case Op_AndVMask: 5009 kand(etype, dst, src1, src2); break; 5010 case Op_OrVMask: 5011 kor(etype, dst, src1, src2); break; 5012 case Op_XorVMask: 5013 kxor(etype, dst, src1, src2); break; 5014 default: 5015 fatal("Unsupported masked operation"); break; 5016 } 5017 } 5018 5019 /* 5020 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5021 * If src is NaN, the result is 0. 5022 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5023 * the result is equal to the value of Integer.MIN_VALUE. 5024 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5025 * the result is equal to the value of Integer.MAX_VALUE. 5026 */ 5027 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5028 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5029 Register rscratch, AddressLiteral float_sign_flip, 5030 int vec_enc) { 5031 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5032 Label done; 5033 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5034 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5035 vptest(xtmp2, xtmp2, vec_enc); 5036 jccb(Assembler::equal, done); 5037 5038 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5039 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5040 5041 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5042 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5043 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5044 5045 // Recompute the mask for remaining special value. 5046 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5047 // Extract SRC values corresponding to TRUE mask lanes. 5048 vpand(xtmp4, xtmp2, src, vec_enc); 5049 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5050 // values are set. 
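  // Note: the vblendvps below picks xtmp1 (MAX_INT) only in lanes where the
  // MSB of the mask lane (xtmp3) is set, i.e. lanes whose source is a positive
  // special value. Illustrative scalar equivalent of the whole fixup (a sketch
  // only; the emitted vector code works on raw lane bits):
  //
  //   int fixup_f2i(float f, int raw) {              // raw = cvttss2si(f)
  //     if (raw != 0x80000000) return raw;           // no special case
  //     if (f != f)            return 0;             // NaN  -> 0
  //     return (f > 0.0f) ? 0x7FFFFFFF : 0x80000000; // +/- saturation
  //   }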
5051 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5052 5053 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5054 bind(done); 5055 } 5056 5057 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5058 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5059 Register rscratch, AddressLiteral float_sign_flip, 5060 int vec_enc) { 5061 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5062 Label done; 5063 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5064 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5065 kortestwl(ktmp1, ktmp1); 5066 jccb(Assembler::equal, done); 5067 5068 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5069 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5070 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5071 5072 kxorwl(ktmp1, ktmp1, ktmp2); 5073 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5074 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5075 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5076 bind(done); 5077 } 5078 5079 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5080 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5081 Register rscratch, AddressLiteral double_sign_flip, 5082 int vec_enc) { 5083 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5084 5085 Label done; 5086 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5087 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5088 kortestwl(ktmp1, ktmp1); 5089 jccb(Assembler::equal, done); 5090 5091 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5092 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5093 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5094 5095 kxorwl(ktmp1, ktmp1, ktmp2); 5096 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5097 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5098 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5099 bind(done); 5100 } 5101 5102 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5103 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5104 Register rscratch, AddressLiteral float_sign_flip, 5105 int vec_enc) { 5106 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5107 Label done; 5108 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5109 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5110 kortestwl(ktmp1, ktmp1); 5111 jccb(Assembler::equal, done); 5112 5113 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5114 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5115 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5116 5117 kxorwl(ktmp1, ktmp1, ktmp2); 5118 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5119 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5120 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5121 bind(done); 5122 } 5123 5124 /* 5125 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5126 * If src is NaN, the result is 0. 5127 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5128 * the result is equal to the value of Long.MIN_VALUE. 5129 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5130 * the result is equal to the value of Long.MAX_VALUE. 
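 *
 * For example, matching Java's (long) cast semantics which this routine mirrors:
 *   (long) Double.NaN               == 0L
 *   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
 *   (long) 1.0e30                   == Long.MAX_VALUE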
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
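  // The blend below (and the earlier one) runs at AVX_128bit on purpose: a
  // 256-bit double source yields at most four int results, so dst and the
  // packed masks fit in a single 128-bit register. Sketch of the 0x88 shuffle
  // used by vector_crosslane_doubleword_pack_avx (dword indices per 128-bit
  // half, little-endian):
  //
  //   dst[0] = a[0];  dst[1] = a[2];   // low dword of each quadword of a
  //   dst[2] = b[0];  dst[3] = b[2];   // low dword of each quadword of b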
5202 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5203 bind(done); 5204 } 5205 5206 5207 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5208 XMMRegister xtmp, Register rscratch, int vec_enc) { 5209 switch(to_elem_bt) { 5210 case T_SHORT: 5211 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5212 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5213 vpackusdw(dst, dst, zero, vec_enc); 5214 if (vec_enc == Assembler::AVX_256bit) { 5215 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5216 } 5217 break; 5218 case T_BYTE: 5219 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5220 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5221 vpackusdw(dst, dst, zero, vec_enc); 5222 if (vec_enc == Assembler::AVX_256bit) { 5223 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5224 } 5225 vpackuswb(dst, dst, zero, vec_enc); 5226 break; 5227 default: assert(false, "%s", type2name(to_elem_bt)); 5228 } 5229 } 5230 5231 /* 5232 * Algorithm for vector D2L and F2I conversions:- 5233 * a) Perform vector D2L/F2I cast. 5234 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5235 * It signifies that source value could be any of the special floating point 5236 * values(NaN,-Inf,Inf,Max,-Min). 5237 * c) Set destination to zero if source is NaN value. 5238 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5239 */ 5240 5241 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5242 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5243 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5244 int to_elem_sz = type2aelembytes(to_elem_bt); 5245 assert(to_elem_sz <= 4, ""); 5246 vcvttps2dq(dst, src, vec_enc); 5247 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5248 if (to_elem_sz < 4) { 5249 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5250 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5251 } 5252 } 5253 5254 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5255 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5256 Register rscratch, int vec_enc) { 5257 int to_elem_sz = type2aelembytes(to_elem_bt); 5258 assert(to_elem_sz <= 4, ""); 5259 vcvttps2dq(dst, src, vec_enc); 5260 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5261 switch(to_elem_bt) { 5262 case T_INT: 5263 break; 5264 case T_SHORT: 5265 evpmovdw(dst, dst, vec_enc); 5266 break; 5267 case T_BYTE: 5268 evpmovdb(dst, dst, vec_enc); 5269 break; 5270 default: assert(false, "%s", type2name(to_elem_bt)); 5271 } 5272 } 5273 5274 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5275 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5276 Register rscratch, int vec_enc) { 5277 evcvttps2qq(dst, src, vec_enc); 5278 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5279 } 5280 5281 // Handling for downcasting from double to integer or sub-word types on AVX2. 5282 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5283 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5284 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5285 int to_elem_sz = type2aelembytes(to_elem_bt); 5286 assert(to_elem_sz < 8, ""); 5287 vcvttpd2dq(dst, src, vec_enc); 5288 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5289 float_sign_flip, vec_enc); 5290 if (to_elem_sz < 4) { 5291 // xtmp4 holds all zero lanes. 5292 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5293 } 5294 } 5295 5296 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5297 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5298 KRegister ktmp2, AddressLiteral sign_flip, 5299 Register rscratch, int vec_enc) { 5300 if (VM_Version::supports_avx512dq()) { 5301 evcvttpd2qq(dst, src, vec_enc); 5302 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5303 switch(to_elem_bt) { 5304 case T_LONG: 5305 break; 5306 case T_INT: 5307 evpmovsqd(dst, dst, vec_enc); 5308 break; 5309 case T_SHORT: 5310 evpmovsqd(dst, dst, vec_enc); 5311 evpmovdw(dst, dst, vec_enc); 5312 break; 5313 case T_BYTE: 5314 evpmovsqd(dst, dst, vec_enc); 5315 evpmovdb(dst, dst, vec_enc); 5316 break; 5317 default: assert(false, "%s", type2name(to_elem_bt)); 5318 } 5319 } else { 5320 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5321 vcvttpd2dq(dst, src, vec_enc); 5322 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5323 switch(to_elem_bt) { 5324 case T_INT: 5325 break; 5326 case T_SHORT: 5327 evpmovdw(dst, dst, vec_enc); 5328 break; 5329 case T_BYTE: 5330 evpmovdb(dst, dst, vec_enc); 5331 break; 5332 default: assert(false, "%s", type2name(to_elem_bt)); 5333 } 5334 } 5335 } 5336 5337 #ifdef _LP64 5338 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5339 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5340 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5341 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5342 // and re-instantiate original MXCSR.RC mode after that. 5343 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5344 5345 mov64(tmp, julong_cast(0.5L)); 5346 evpbroadcastq(xtmp1, tmp, vec_enc); 5347 vaddpd(xtmp1, src , xtmp1, vec_enc); 5348 evcvtpd2qq(dst, xtmp1, vec_enc); 5349 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5350 double_sign_flip, vec_enc);; 5351 5352 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5353 } 5354 5355 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5356 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5357 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5358 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5359 // and re-instantiate original MXCSR.RC mode after that. 
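  // With MXCSR.RC temporarily set to round-toward-negative-infinity, the
  // convert instruction itself performs the floor. Illustrative scalar sketch
  // (assumes <cfenv> rounding control; the emitted code uses ldmxcsr instead):
  //
  //   #include <cfenv>
  //   #include <cmath>
  //   long round_like(float x) {
  //     int old = std::fegetround();
  //     std::fesetround(FE_DOWNWARD);     // MXCSR.RC = round toward -inf
  //     long r = std::lrintf(x + 0.5f);   // conversion == floor in this mode
  //     std::fesetround(old);             // restore the original mode
  //     return r;
  //   }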
5360 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5361 5362 movl(tmp, jint_cast(0.5)); 5363 movq(xtmp1, tmp); 5364 vbroadcastss(xtmp1, xtmp1, vec_enc); 5365 vaddps(xtmp1, src , xtmp1, vec_enc); 5366 vcvtps2dq(dst, xtmp1, vec_enc); 5367 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5368 float_sign_flip, vec_enc); 5369 5370 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5371 } 5372 5373 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5374 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5375 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5376 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5377 // and re-instantiate original MXCSR.RC mode after that. 5378 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5379 5380 movl(tmp, jint_cast(0.5)); 5381 movq(xtmp1, tmp); 5382 vbroadcastss(xtmp1, xtmp1, vec_enc); 5383 vaddps(xtmp1, src , xtmp1, vec_enc); 5384 vcvtps2dq(dst, xtmp1, vec_enc); 5385 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5386 5387 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5388 } 5389 #endif // _LP64 5390 5391 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5392 BasicType from_elem_bt, BasicType to_elem_bt) { 5393 switch (from_elem_bt) { 5394 case T_BYTE: 5395 switch (to_elem_bt) { 5396 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5397 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5398 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5399 default: ShouldNotReachHere(); 5400 } 5401 break; 5402 case T_SHORT: 5403 switch (to_elem_bt) { 5404 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5405 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5406 default: ShouldNotReachHere(); 5407 } 5408 break; 5409 case T_INT: 5410 assert(to_elem_bt == T_LONG, ""); 5411 vpmovzxdq(dst, src, vlen_enc); 5412 break; 5413 default: 5414 ShouldNotReachHere(); 5415 } 5416 } 5417 5418 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5419 BasicType from_elem_bt, BasicType to_elem_bt) { 5420 switch (from_elem_bt) { 5421 case T_BYTE: 5422 switch (to_elem_bt) { 5423 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5424 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5425 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5426 default: ShouldNotReachHere(); 5427 } 5428 break; 5429 case T_SHORT: 5430 switch (to_elem_bt) { 5431 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5432 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5433 default: ShouldNotReachHere(); 5434 } 5435 break; 5436 case T_INT: 5437 assert(to_elem_bt == T_LONG, ""); 5438 vpmovsxdq(dst, src, vlen_enc); 5439 break; 5440 default: 5441 ShouldNotReachHere(); 5442 } 5443 } 5444 5445 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5446 BasicType dst_bt, BasicType src_bt, int vlen) { 5447 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5448 assert(vlen_enc != AVX_512bit, ""); 5449 5450 int dst_bt_size = type2aelembytes(dst_bt); 5451 int src_bt_size = type2aelembytes(src_bt); 5452 if (dst_bt_size > src_bt_size) { 5453 switch (dst_bt_size / src_bt_size) { 5454 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5455 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5456 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5457 default: ShouldNotReachHere(); 5458 } 5459 } else { 5460 assert(dst_bt_size < src_bt_size, ""); 5461 switch (src_bt_size / dst_bt_size) { 5462 case 2: { 5463 if (vlen_enc == AVX_128bit) { 5464 vpacksswb(dst, src, src, vlen_enc); 5465 } else { 5466 vpacksswb(dst, src, src, vlen_enc); 5467 vpermq(dst, dst, 0x08, vlen_enc); 5468 } 5469 break; 5470 } 5471 case 4: { 5472 if (vlen_enc == AVX_128bit) { 5473 vpackssdw(dst, src, src, vlen_enc); 5474 vpacksswb(dst, dst, dst, vlen_enc); 5475 } else { 5476 vpackssdw(dst, src, src, vlen_enc); 5477 vpermq(dst, dst, 0x08, vlen_enc); 5478 vpacksswb(dst, dst, dst, AVX_128bit); 5479 } 5480 break; 5481 } 5482 case 8: { 5483 if (vlen_enc == AVX_128bit) { 5484 vpshufd(dst, src, 0x08, vlen_enc); 5485 vpackssdw(dst, dst, dst, vlen_enc); 5486 vpacksswb(dst, dst, dst, vlen_enc); 5487 } else { 5488 vpshufd(dst, src, 0x08, vlen_enc); 5489 vpermq(dst, dst, 0x08, vlen_enc); 5490 vpackssdw(dst, dst, dst, AVX_128bit); 5491 vpacksswb(dst, dst, dst, AVX_128bit); 5492 } 5493 break; 5494 } 5495 default: ShouldNotReachHere(); 5496 } 5497 } 5498 } 5499 5500 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5501 bool merge, BasicType bt, int vlen_enc) { 5502 if (bt == T_INT) { 5503 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5504 } else { 5505 assert(bt == T_LONG, ""); 5506 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5507 } 5508 } 5509 5510 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5511 bool merge, BasicType bt, int vlen_enc) { 5512 if (bt == T_INT) { 5513 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5514 } else { 5515 assert(bt == T_LONG, ""); 5516 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5517 } 5518 } 5519 5520 #ifdef _LP64 5521 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5522 Register rtmp2, XMMRegister xtmp, int mask_len, 5523 int vec_enc) { 5524 int index = 0; 5525 int vindex = 0; 5526 mov64(rtmp1, 0x0101010101010101L); 5527 pdepq(rtmp1, src, rtmp1); 5528 if (mask_len > 8) { 5529 movq(rtmp2, src); 5530 vpxor(xtmp, xtmp, xtmp, vec_enc); 5531 movq(xtmp, rtmp1); 5532 } 5533 movq(dst, rtmp1); 5534 5535 mask_len -= 8; 5536 while (mask_len > 0) { 5537 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5538 index++; 5539 if ((index % 2) == 0) { 5540 pxor(xtmp, xtmp); 5541 } 5542 mov64(rtmp1, 0x0101010101010101L); 5543 shrq(rtmp2, 8); 5544 pdepq(rtmp1, rtmp2, rtmp1); 5545 pinsrq(xtmp, rtmp1, index % 2); 5546 vindex = index / 2; 5547 if (vindex) { 5548 // Write entire 16 byte vector when both 64 bit 5549 // lanes are update to save redundant instructions. 
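      // Each loop iteration above expands the next 8 mask bits with pdepq: the
      // scatter mask 0x0101010101010101 deposits one mask bit into the low bit
      // of each of 8 consecutive bytes. Illustrative scalar sketch, assuming a
      // BMI2 target; _pdep_u64 is the C intrinsic behind pdepq:
      //
      //   #include <immintrin.h>
      //   uint64_t bits_to_bytes(uint64_t mask8) {
      //     return _pdep_u64(mask8, 0x0101010101010101ULL);
      //   }
      //   // bits_to_bytes(0b10110001) == 0x0100010100000001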
5550 if (index % 2) { 5551 vinsertf128(dst, dst, xtmp, vindex); 5552 } 5553 } else { 5554 vmovdqu(dst, xtmp); 5555 } 5556 mask_len -= 8; 5557 } 5558 } 5559 5560 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5561 switch(opc) { 5562 case Op_VectorMaskTrueCount: 5563 popcntq(dst, tmp); 5564 break; 5565 case Op_VectorMaskLastTrue: 5566 if (VM_Version::supports_lzcnt()) { 5567 lzcntq(tmp, tmp); 5568 movl(dst, 63); 5569 subl(dst, tmp); 5570 } else { 5571 movl(dst, -1); 5572 bsrq(tmp, tmp); 5573 cmov32(Assembler::notZero, dst, tmp); 5574 } 5575 break; 5576 case Op_VectorMaskFirstTrue: 5577 if (VM_Version::supports_bmi1()) { 5578 if (masklen < 32) { 5579 orl(tmp, 1 << masklen); 5580 tzcntl(dst, tmp); 5581 } else if (masklen == 32) { 5582 tzcntl(dst, tmp); 5583 } else { 5584 assert(masklen == 64, ""); 5585 tzcntq(dst, tmp); 5586 } 5587 } else { 5588 if (masklen < 32) { 5589 orl(tmp, 1 << masklen); 5590 bsfl(dst, tmp); 5591 } else { 5592 assert(masklen == 32 || masklen == 64, ""); 5593 movl(dst, masklen); 5594 if (masklen == 32) { 5595 bsfl(tmp, tmp); 5596 } else { 5597 bsfq(tmp, tmp); 5598 } 5599 cmov32(Assembler::notZero, dst, tmp); 5600 } 5601 } 5602 break; 5603 case Op_VectorMaskToLong: 5604 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5605 break; 5606 default: assert(false, "Unhandled mask operation"); 5607 } 5608 } 5609 5610 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5611 int masklen, int masksize, int vec_enc) { 5612 assert(VM_Version::supports_popcnt(), ""); 5613 5614 if(VM_Version::supports_avx512bw()) { 5615 kmovql(tmp, mask); 5616 } else { 5617 assert(masklen <= 16, ""); 5618 kmovwl(tmp, mask); 5619 } 5620 5621 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5622 // operations needs to be clipped. 5623 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5624 andq(tmp, (1 << masklen) - 1); 5625 } 5626 5627 vector_mask_operation_helper(opc, dst, tmp, masklen); 5628 } 5629 5630 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5631 Register tmp, int masklen, BasicType bt, int vec_enc) { 5632 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5633 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5634 assert(VM_Version::supports_popcnt(), ""); 5635 5636 bool need_clip = false; 5637 switch(bt) { 5638 case T_BOOLEAN: 5639 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5640 vpxor(xtmp, xtmp, xtmp, vec_enc); 5641 vpsubb(xtmp, xtmp, mask, vec_enc); 5642 vpmovmskb(tmp, xtmp, vec_enc); 5643 need_clip = masklen < 16; 5644 break; 5645 case T_BYTE: 5646 vpmovmskb(tmp, mask, vec_enc); 5647 need_clip = masklen < 16; 5648 break; 5649 case T_SHORT: 5650 vpacksswb(xtmp, mask, mask, vec_enc); 5651 if (masklen >= 16) { 5652 vpermpd(xtmp, xtmp, 8, vec_enc); 5653 } 5654 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5655 need_clip = masklen < 16; 5656 break; 5657 case T_INT: 5658 case T_FLOAT: 5659 vmovmskps(tmp, mask, vec_enc); 5660 need_clip = masklen < 4; 5661 break; 5662 case T_LONG: 5663 case T_DOUBLE: 5664 vmovmskpd(tmp, mask, vec_enc); 5665 need_clip = masklen < 2; 5666 break; 5667 default: assert(false, "Unhandled type, %s", type2name(bt)); 5668 } 5669 5670 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5671 // operations needs to be clipped. 
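  // Clipping keeps only the low 'masklen' bits of the scalar mask, e.g. for
  // masklen == 4 the and-mask is (1 << 4) - 1 == 0xF, so stray upper bits from
  // a full-width kmov cannot contribute to the count. Scalar equivalent of the
  // andq below:
  //
  //   tmp &= (1 << masklen) - 1;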
5672 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5673 // need_clip implies masklen < 32 5674 andq(tmp, (1 << masklen) - 1); 5675 } 5676 5677 vector_mask_operation_helper(opc, dst, tmp, masklen); 5678 } 5679 5680 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5681 Register rtmp2, int mask_len) { 5682 kmov(rtmp1, src); 5683 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5684 mov64(rtmp2, -1L); 5685 pextq(rtmp2, rtmp2, rtmp1); 5686 kmov(dst, rtmp2); 5687 } 5688 5689 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5690 XMMRegister mask, Register rtmp, Register rscratch, 5691 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5692 int vec_enc) { 5693 assert(type2aelembytes(bt) >= 4, ""); 5694 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5695 address compress_perm_table = nullptr; 5696 address expand_perm_table = nullptr; 5697 if (type2aelembytes(bt) == 8) { 5698 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5699 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5700 vmovmskpd(rtmp, mask, vec_enc); 5701 } else { 5702 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5703 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5704 vmovmskps(rtmp, mask, vec_enc); 5705 } 5706 shlq(rtmp, 5); // for 32 byte permute row. 5707 if (opcode == Op_CompressV) { 5708 lea(rscratch, ExternalAddress(compress_perm_table)); 5709 } else { 5710 lea(rscratch, ExternalAddress(expand_perm_table)); 5711 } 5712 addptr(rtmp, rscratch); 5713 vmovdqu(permv, Address(rtmp)); 5714 vpermps(dst, permv, src, Assembler::AVX_256bit); 5715 vpxor(xtmp, xtmp, xtmp, vec_enc); 5716 // Blend the result with zero vector using permute mask, each column entry 5717 // in a permute table row contains either a valid permute index or a -1 (default) 5718 // value, this can potentially be used as a blending mask after 5719 // compressing/expanding the source vector lanes. 
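  // Example of a compress permute-table row (an illustrative layout only; the
  // real tables come from StubRoutines::x86::compress_perm_table32()): for an
  // 8-lane mask 0b00100110 the row would be
  //
  //   { 1, 2, 5, -1, -1, -1, -1, -1 }
  //
  // vpermps gathers lanes 1, 2 and 5 to the front, and because the -1 entries
  // have their sign bit set they double as a blend mask that zeroes the tail
  // lanes in the vblendvps below.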
5720 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5721 } 5722 5723 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5724 bool merge, BasicType bt, int vec_enc) { 5725 if (opcode == Op_CompressV) { 5726 switch(bt) { 5727 case T_BYTE: 5728 evpcompressb(dst, mask, src, merge, vec_enc); 5729 break; 5730 case T_CHAR: 5731 case T_SHORT: 5732 evpcompressw(dst, mask, src, merge, vec_enc); 5733 break; 5734 case T_INT: 5735 evpcompressd(dst, mask, src, merge, vec_enc); 5736 break; 5737 case T_FLOAT: 5738 evcompressps(dst, mask, src, merge, vec_enc); 5739 break; 5740 case T_LONG: 5741 evpcompressq(dst, mask, src, merge, vec_enc); 5742 break; 5743 case T_DOUBLE: 5744 evcompresspd(dst, mask, src, merge, vec_enc); 5745 break; 5746 default: 5747 fatal("Unsupported type %s", type2name(bt)); 5748 break; 5749 } 5750 } else { 5751 assert(opcode == Op_ExpandV, ""); 5752 switch(bt) { 5753 case T_BYTE: 5754 evpexpandb(dst, mask, src, merge, vec_enc); 5755 break; 5756 case T_CHAR: 5757 case T_SHORT: 5758 evpexpandw(dst, mask, src, merge, vec_enc); 5759 break; 5760 case T_INT: 5761 evpexpandd(dst, mask, src, merge, vec_enc); 5762 break; 5763 case T_FLOAT: 5764 evexpandps(dst, mask, src, merge, vec_enc); 5765 break; 5766 case T_LONG: 5767 evpexpandq(dst, mask, src, merge, vec_enc); 5768 break; 5769 case T_DOUBLE: 5770 evexpandpd(dst, mask, src, merge, vec_enc); 5771 break; 5772 default: 5773 fatal("Unsupported type %s", type2name(bt)); 5774 break; 5775 } 5776 } 5777 } 5778 #endif 5779 5780 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5781 KRegister ktmp1, int vec_enc) { 5782 if (opcode == Op_SignumVD) { 5783 vsubpd(dst, zero, one, vec_enc); 5784 // if src < 0 ? -1 : 1 5785 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5786 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5787 // if src == NaN, -0.0 or 0.0 return src. 5788 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5789 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5790 } else { 5791 assert(opcode == Op_SignumVF, ""); 5792 vsubps(dst, zero, one, vec_enc); 5793 // if src < 0 ? -1 : 1 5794 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5795 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5796 // if src == NaN, -0.0 or 0.0 return src. 5797 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5798 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5799 } 5800 } 5801 5802 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5803 XMMRegister xtmp1, int vec_enc) { 5804 if (opcode == Op_SignumVD) { 5805 vsubpd(dst, zero, one, vec_enc); 5806 // if src < 0 ? -1 : 1 5807 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5808 // if src == NaN, -0.0 or 0.0 return src. 5809 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5810 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5811 } else { 5812 assert(opcode == Op_SignumVF, ""); 5813 vsubps(dst, zero, one, vec_enc); 5814 // if src < 0 ? -1 : 1 5815 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5816 // if src == NaN, -0.0 or 0.0 return src. 
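    // Scalar reference for this blend sequence (a sketch of Math.signum
    // semantics, not the emitted code):
    //
    //   float signum_ref(float f) {
    //     if (f != f || f == 0.0f) return f;   // NaN, -0.0 and 0.0 pass through
    //     return (f < 0.0f) ? -1.0f : 1.0f;
    //   }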
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  }
}

//
// Following is a lookup table based popcount computation algorithm:
// Index   Bit set count
// [ 0000 -> 0,
//   0001 -> 1,
//   0010 -> 1,
//   0011 -> 2,
//   0100 -> 1,
//   0101 -> 2,
//   0110 -> 2,
//   0111 -> 3,
//   1000 -> 1,
//   1001 -> 2,
//   1010 -> 2,
//   1011 -> 3,
//   1100 -> 2,
//   1101 -> 3,
//   1110 -> 3,
//   1111 -> 4 ]
// a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of vector lane by 4 positions.
// c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
//    count of all the bytes of a quadword.
// f. Perform step e. for upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.
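//
// Scalar reference of steps a-d for a single byte (a sketch only; the vector
// code below applies the same lookup to every byte lane with vpshufb):
//
//   static const uint8_t LUT[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//   int popcount_byte(uint8_t b) {
//     return LUT[b & 0x0F] + LUT[b >> 4];
//   }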
5899 5900 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5901 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5902 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5903 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5904 vpsrlw(dst, src, 4, vec_enc); 5905 vpand(dst, dst, xtmp1, vec_enc); 5906 vpand(xtmp1, src, xtmp1, vec_enc); 5907 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5908 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5909 vpshufb(dst, xtmp2, dst, vec_enc); 5910 vpaddb(dst, dst, xtmp1, vec_enc); 5911 } 5912 5913 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5914 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5915 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5916 // Following code is as per steps e,f,g and h of above algorithm. 5917 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5918 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5919 vpsadbw(dst, dst, xtmp2, vec_enc); 5920 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5921 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5922 vpackuswb(dst, xtmp1, dst, vec_enc); 5923 } 5924 5925 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5926 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5927 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5928 // Add the popcount of upper and lower bytes of word. 5929 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5930 vpsrlw(dst, xtmp1, 8, vec_enc); 5931 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5932 vpaddw(dst, dst, xtmp1, vec_enc); 5933 } 5934 5935 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5936 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5937 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5938 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5939 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5940 } 5941 5942 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5943 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5944 switch(bt) { 5945 case T_LONG: 5946 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5947 break; 5948 case T_INT: 5949 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5950 break; 5951 case T_CHAR: 5952 case T_SHORT: 5953 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5954 break; 5955 case T_BYTE: 5956 case T_BOOLEAN: 5957 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5958 break; 5959 default: 5960 fatal("Unsupported type %s", type2name(bt)); 5961 break; 5962 } 5963 } 5964 5965 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5966 KRegister mask, bool merge, int vec_enc) { 5967 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5968 switch(bt) { 5969 case T_LONG: 5970 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5971 evpopcntq(dst, mask, src, merge, vec_enc); 5972 break; 5973 case T_INT: 5974 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5975 evpopcntd(dst, mask, src, merge, vec_enc); 5976 break; 5977 case T_CHAR: 5978 case T_SHORT: 5979 assert(VM_Version::supports_avx512_bitalg(), ""); 5980 evpopcntw(dst, mask, src, merge, vec_enc); 5981 break; 5982 case T_BYTE: 5983 case T_BOOLEAN: 5984 assert(VM_Version::supports_avx512_bitalg(), ""); 5985 evpopcntb(dst, mask, 
src, merge, vec_enc); 5986 break; 5987 default: 5988 fatal("Unsupported type %s", type2name(bt)); 5989 break; 5990 } 5991 } 5992 5993 #ifndef _LP64 5994 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5995 assert(VM_Version::supports_avx512bw(), ""); 5996 kmovdl(tmp, src); 5997 kunpckdql(dst, tmp, tmp); 5998 } 5999 #endif 6000 6001 // Bit reversal algorithm first reverses the bits of each byte followed by 6002 // a byte level reversal for multi-byte primitive types (short/int/long). 6003 // Algorithm performs a lookup table access to get reverse bit sequence 6004 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6005 // is obtained by swapping the reverse bit sequences of upper and lower 6006 // nibble of a byte. 6007 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6008 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6009 if (VM_Version::supports_avx512vlbw()) { 6010 6011 // Get the reverse bit sequence of lower nibble of each byte. 6012 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6013 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6014 evpandq(dst, xtmp2, src, vec_enc); 6015 vpshufb(dst, xtmp1, dst, vec_enc); 6016 vpsllq(dst, dst, 4, vec_enc); 6017 6018 // Get the reverse bit sequence of upper nibble of each byte. 6019 vpandn(xtmp2, xtmp2, src, vec_enc); 6020 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6021 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6022 6023 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6024 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6025 evporq(xtmp2, dst, xtmp2, vec_enc); 6026 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6027 6028 } else if(vec_enc == Assembler::AVX_512bit) { 6029 // Shift based bit reversal. 6030 assert(bt == T_LONG || bt == T_INT, ""); 6031 6032 // Swap lower and upper nibble of each byte. 6033 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6034 6035 // Swap two least and most significant bits of each nibble. 6036 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6037 6038 // Swap adjacent pair of bits. 6039 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6040 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6041 6042 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6043 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6044 } else { 6045 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6046 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6047 6048 // Get the reverse bit sequence of lower nibble of each byte. 6049 vpand(dst, xtmp2, src, vec_enc); 6050 vpshufb(dst, xtmp1, dst, vec_enc); 6051 vpsllq(dst, dst, 4, vec_enc); 6052 6053 // Get the reverse bit sequence of upper nibble of each byte. 6054 vpandn(xtmp2, xtmp2, src, vec_enc); 6055 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6056 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6057 6058 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6059 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
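    // Scalar reference for the nibble-LUT byte reversal (a sketch only):
    //
    //   static const uint8_t RLUT[16] =  // bit-reversed 4-bit values
    //     {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
    //   uint8_t reverse_byte(uint8_t b) {
    //     return (uint8_t)((RLUT[b & 0x0F] << 4) | RLUT[b >> 4]);
    //   }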
6060 vpor(xtmp2, dst, xtmp2, vec_enc); 6061 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6062 } 6063 } 6064 6065 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6066 XMMRegister xtmp, Register rscratch) { 6067 assert(VM_Version::supports_gfni(), ""); 6068 assert(rscratch != noreg || always_reachable(mask), "missing"); 6069 6070 // Galois field instruction based bit reversal based on following algorithm. 6071 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6072 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6073 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6074 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6075 } 6076 6077 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6078 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6079 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6080 evpandq(dst, xtmp1, src, vec_enc); 6081 vpsllq(dst, dst, nbits, vec_enc); 6082 vpandn(xtmp1, xtmp1, src, vec_enc); 6083 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6084 evporq(dst, dst, xtmp1, vec_enc); 6085 } 6086 6087 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6088 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6089 // Shift based bit reversal. 6090 assert(VM_Version::supports_evex(), ""); 6091 switch(bt) { 6092 case T_LONG: 6093 // Swap upper and lower double word of each quad word. 6094 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6095 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6096 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6097 break; 6098 case T_INT: 6099 // Swap upper and lower word of each double word. 6100 evprord(xtmp1, k0, src, 16, true, vec_enc); 6101 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6102 break; 6103 case T_CHAR: 6104 case T_SHORT: 6105 // Swap upper and lower byte of each word. 6106 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6107 break; 6108 case T_BYTE: 6109 evmovdquq(dst, k0, src, true, vec_enc); 6110 break; 6111 default: 6112 fatal("Unsupported type %s", type2name(bt)); 6113 break; 6114 } 6115 } 6116 6117 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6118 if (bt == T_BYTE) { 6119 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6120 evmovdquq(dst, k0, src, true, vec_enc); 6121 } else { 6122 vmovdqu(dst, src); 6123 } 6124 return; 6125 } 6126 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6127 // pre-computed shuffle indices. 
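  // For T_INT, each 4-byte group of the shuffle mask holds the byte indices
  // { 3, 2, 1, 0 } (an assumed illustration of the stub table's layout), so the
  // vpshufb swaps the bytes within every dword lane; a scalar equivalent is
  // __builtin_bswap32(x) applied per lane.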
6128 switch(bt) { 6129 case T_LONG: 6130 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6131 break; 6132 case T_INT: 6133 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6134 break; 6135 case T_CHAR: 6136 case T_SHORT: 6137 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6138 break; 6139 default: 6140 fatal("Unsupported type %s", type2name(bt)); 6141 break; 6142 } 6143 vpshufb(dst, src, dst, vec_enc); 6144 } 6145 6146 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6147 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6148 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6149 assert(is_integral_type(bt), ""); 6150 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6151 assert(VM_Version::supports_avx512cd(), ""); 6152 switch(bt) { 6153 case T_LONG: 6154 evplzcntq(dst, ktmp, src, merge, vec_enc); 6155 break; 6156 case T_INT: 6157 evplzcntd(dst, ktmp, src, merge, vec_enc); 6158 break; 6159 case T_SHORT: 6160 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6161 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6162 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6163 vpunpckhwd(dst, xtmp1, src, vec_enc); 6164 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6165 vpackusdw(dst, xtmp2, dst, vec_enc); 6166 break; 6167 case T_BYTE: 6168 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6169 // accessing the lookup table. 6170 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6171 // accessing the lookup table. 6172 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6173 assert(VM_Version::supports_avx512bw(), ""); 6174 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6175 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6176 vpand(xtmp2, dst, src, vec_enc); 6177 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6178 vpsrlw(xtmp3, src, 4, vec_enc); 6179 vpand(xtmp3, dst, xtmp3, vec_enc); 6180 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6181 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6182 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6183 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6184 break; 6185 default: 6186 fatal("Unsupported type %s", type2name(bt)); 6187 break; 6188 } 6189 } 6190 6191 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6192 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6193 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6194 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6195 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6196 // accessing the lookup table. 6197 vpand(dst, xtmp2, src, vec_enc); 6198 vpshufb(dst, xtmp1, dst, vec_enc); 6199 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6200 // accessing the lookup table. 6201 vpsrlw(xtmp3, src, 4, vec_enc); 6202 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6203 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6204 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
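  // Scalar reference for the per-byte leading zero count (a sketch only):
  //
  //   static const uint8_t CLZ4[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
  //   int clz_byte(uint8_t b) {
  //     return (b >> 4) ? CLZ4[b >> 4] : 4 + CLZ4[b & 0x0F];
  //   }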
6205 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6206 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6207 vpaddb(dst, dst, xtmp2, vec_enc); 6208 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6209 } 6210 6211 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6212 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6213 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6214 // Add zero counts of lower byte and upper byte of a word if 6215 // upper byte holds a zero value. 6216 vpsrlw(xtmp3, src, 8, vec_enc); 6217 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6218 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6219 vpsllw(xtmp2, dst, 8, vec_enc); 6220 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6221 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6222 vpsrlw(dst, dst, 8, vec_enc); 6223 } 6224 6225 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6226 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6227 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6228 // hence biased exponent can be used to compute leading zero count as per 6229 // following formula:- 6230 // LZCNT = 32 - (biased_exp - 127) 6231 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6232 6233 // Broadcast 0xFF 6234 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6235 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6236 6237 // Extract biased exponent. 6238 vcvtdq2ps(dst, src, vec_enc); 6239 vpsrld(dst, dst, 23, vec_enc); 6240 vpand(dst, dst, xtmp1, vec_enc); 6241 6242 // Broadcast 127. 6243 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6244 // Exponent = biased_exp - 127 6245 vpsubd(dst, dst, xtmp1, vec_enc); 6246 6247 // Exponent = Exponent + 1 6248 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6249 vpaddd(dst, dst, xtmp3, vec_enc); 6250 6251 // Replace -ve exponent with zero, exponent is -ve when src 6252 // lane contains a zero value. 6253 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6254 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6255 6256 // Rematerialize broadcast 32. 6257 vpslld(xtmp1, xtmp3, 5, vec_enc); 6258 // Exponent is 32 if corresponding source lane contains max_int value. 6259 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6260 // LZCNT = 32 - exponent 6261 vpsubd(dst, xtmp1, dst, vec_enc); 6262 6263 // Replace LZCNT with a value 1 if corresponding source lane 6264 // contains max_int value. 6265 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6266 6267 // Replace biased_exp with 0 if source lane value is less than zero. 6268 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6269 vblendvps(dst, dst, xtmp2, src, vec_enc); 6270 } 6271 6272 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6273 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6274 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6275 // Add zero counts of lower word and upper word of a double word if 6276 // upper word holds a zero value. 6277 vpsrld(xtmp3, src, 16, vec_enc); 6278 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6279 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6280 vpslld(xtmp2, dst, 16, vec_enc); 6281 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6282 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6283 vpsrld(dst, dst, 16, vec_enc); 6284 // Add zero counts of lower doubleword and upper doubleword of a 6285 // quadword if upper doubleword holds a zero value. 
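  // Widening step, scalar view (a sketch only; clz32 stands for the doubleword
  // counts already produced by the preceding 16-bit to 32-bit step):
  //
  //   int clz64(uint64_t x) {
  //     uint32_t hi = (uint32_t)(x >> 32);
  //     return hi ? clz32(hi) : 32 + clz32((uint32_t)x);
  //   }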
6286 vpsrlq(xtmp3, src, 32, vec_enc); 6287 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6288 vpsllq(xtmp2, dst, 32, vec_enc); 6289 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6290 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6291 vpsrlq(dst, dst, 32, vec_enc); 6292 } 6293 6294 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6295 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6296 Register rtmp, int vec_enc) { 6297 assert(is_integral_type(bt), "unexpected type"); 6298 assert(vec_enc < Assembler::AVX_512bit, ""); 6299 switch(bt) { 6300 case T_LONG: 6301 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6302 break; 6303 case T_INT: 6304 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6305 break; 6306 case T_SHORT: 6307 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6308 break; 6309 case T_BYTE: 6310 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6311 break; 6312 default: 6313 fatal("Unsupported type %s", type2name(bt)); 6314 break; 6315 } 6316 } 6317 6318 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6319 switch(bt) { 6320 case T_BYTE: 6321 vpsubb(dst, src1, src2, vec_enc); 6322 break; 6323 case T_SHORT: 6324 vpsubw(dst, src1, src2, vec_enc); 6325 break; 6326 case T_INT: 6327 vpsubd(dst, src1, src2, vec_enc); 6328 break; 6329 case T_LONG: 6330 vpsubq(dst, src1, src2, vec_enc); 6331 break; 6332 default: 6333 fatal("Unsupported type %s", type2name(bt)); 6334 break; 6335 } 6336 } 6337 6338 // Trailing zero count computation is based on leading zero count operation as per 6339 // following equation. All AVX3 targets support AVX512CD feature which offers 6340 // direct vector instruction to compute leading zero count. 
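// The identity relies on (x - 1) & ~x setting exactly the bits strictly below the
// lowest set bit of x (and producing all ones when x == 0), so its leading zero
// count is the primitive type width minus the trailing zero count of x. For example,
// with 8-bit lanes and x = 0b01011000: (x - 1) & ~x = 0b00000111, CLZ = 5, CTZ = 8 - 5 = 3.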
6341 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6342 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6343 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6344 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6345 assert(is_integral_type(bt), ""); 6346 // xtmp = -1 6347 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6348 // xtmp = xtmp + src 6349 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6350 // xtmp = xtmp & ~src 6351 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6352 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6353 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6354 vpsub(bt, dst, xtmp4, dst, vec_enc); 6355 } 6356 6357 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6358 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6359 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6360 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6361 assert(is_integral_type(bt), ""); 6362 // xtmp = 0 6363 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6364 // xtmp = 0 - src 6365 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6366 // xtmp = xtmp | src 6367 vpor(xtmp3, xtmp3, src, vec_enc); 6368 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6369 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6370 vpsub(bt, dst, xtmp1, dst, vec_enc); 6371 } 6372 6373 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6374 Label done; 6375 Label neg_divisor_fastpath; 6376 cmpl(divisor, 0); 6377 jccb(Assembler::less, neg_divisor_fastpath); 6378 xorl(rdx, rdx); 6379 divl(divisor); 6380 jmpb(done); 6381 bind(neg_divisor_fastpath); 6382 // Fastpath for divisor < 0: 6383 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6384 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6385 movl(rdx, rax); 6386 subl(rdx, divisor); 6387 if (VM_Version::supports_bmi1()) { 6388 andnl(rax, rdx, rax); 6389 } else { 6390 notl(rdx); 6391 andl(rax, rdx); 6392 } 6393 shrl(rax, 31); 6394 bind(done); 6395 } 6396 6397 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6398 Label done; 6399 Label neg_divisor_fastpath; 6400 cmpl(divisor, 0); 6401 jccb(Assembler::less, neg_divisor_fastpath); 6402 xorl(rdx, rdx); 6403 divl(divisor); 6404 jmpb(done); 6405 bind(neg_divisor_fastpath); 6406 // Fastpath when divisor < 0: 6407 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6408 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6409 movl(rdx, rax); 6410 subl(rax, divisor); 6411 if (VM_Version::supports_bmi1()) { 6412 andnl(rax, rax, rdx); 6413 } else { 6414 notl(rax); 6415 andl(rax, rdx); 6416 } 6417 sarl(rax, 31); 6418 andl(rax, divisor); 6419 subl(rdx, rax); 6420 bind(done); 6421 } 6422 6423 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6424 Label done; 6425 Label neg_divisor_fastpath; 6426 6427 cmpl(divisor, 0); 6428 jccb(Assembler::less, neg_divisor_fastpath); 6429 xorl(rdx, rdx); 6430 divl(divisor); 6431 jmpb(done); 6432 bind(neg_divisor_fastpath); 6433 // Fastpath for divisor < 0: 6434 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6435 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6436 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6437 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6438 movl(rdx, rax); 6439 subl(rax, divisor); 6440 if (VM_Version::supports_bmi1()) { 6441 andnl(rax, rax, rdx); 6442 } else { 6443 notl(rax); 6444 andl(rax, rdx); 6445 } 6446 movl(tmp, rax); 6447 shrl(rax, 31); // quotient 6448 sarl(tmp, 31); 6449 andl(tmp, divisor); 6450 subl(rdx, tmp); // remainder 6451 bind(done); 6452 } 6453 6454 #ifdef _LP64 6455 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6456 XMMRegister xtmp2, Register rtmp) { 6457 if(VM_Version::supports_gfni()) { 6458 // Galois field instruction based bit reversal based on following algorithm. 6459 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6460 mov64(rtmp, 0x8040201008040201L); 6461 movq(xtmp1, src); 6462 movq(xtmp2, rtmp); 6463 gf2p8affineqb(xtmp1, xtmp2, 0); 6464 movq(dst, xtmp1); 6465 } else { 6466 // Swap even and odd numbered bits. 6467 movl(rtmp, src); 6468 andl(rtmp, 0x55555555); 6469 shll(rtmp, 1); 6470 movl(dst, src); 6471 andl(dst, 0xAAAAAAAA); 6472 shrl(dst, 1); 6473 orl(dst, rtmp); 6474 6475 // Swap LSB and MSB 2 bits of each nibble. 6476 movl(rtmp, dst); 6477 andl(rtmp, 0x33333333); 6478 shll(rtmp, 2); 6479 andl(dst, 0xCCCCCCCC); 6480 shrl(dst, 2); 6481 orl(dst, rtmp); 6482 6483 // Swap LSB and MSB 4 bits of each byte. 6484 movl(rtmp, dst); 6485 andl(rtmp, 0x0F0F0F0F); 6486 shll(rtmp, 4); 6487 andl(dst, 0xF0F0F0F0); 6488 shrl(dst, 4); 6489 orl(dst, rtmp); 6490 } 6491 bswapl(dst); 6492 } 6493 6494 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6495 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6496 if(VM_Version::supports_gfni()) { 6497 // Galois field instruction based bit reversal based on following algorithm. 6498 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6499 mov64(rtmp1, 0x8040201008040201L); 6500 movq(xtmp1, src); 6501 movq(xtmp2, rtmp1); 6502 gf2p8affineqb(xtmp1, xtmp2, 0); 6503 movq(dst, xtmp1); 6504 } else { 6505 // Swap even and odd numbered bits. 6506 movq(rtmp1, src); 6507 mov64(rtmp2, 0x5555555555555555L); 6508 andq(rtmp1, rtmp2); 6509 shlq(rtmp1, 1); 6510 movq(dst, src); 6511 notq(rtmp2); 6512 andq(dst, rtmp2); 6513 shrq(dst, 1); 6514 orq(dst, rtmp1); 6515 6516 // Swap LSB and MSB 2 bits of each nibble. 6517 movq(rtmp1, dst); 6518 mov64(rtmp2, 0x3333333333333333L); 6519 andq(rtmp1, rtmp2); 6520 shlq(rtmp1, 2); 6521 notq(rtmp2); 6522 andq(dst, rtmp2); 6523 shrq(dst, 2); 6524 orq(dst, rtmp1); 6525 6526 // Swap LSB and MSB 4 bits of each byte. 
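    // Scalar form of this step and the final byte swap, for reference (illustrative only;
    // bswap64() stands for a byte-order reversal of the 64-bit value):
    //
    //   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
    //   x = bswap64(x);   // reversing the byte order completes the bit reversal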
6527 movq(rtmp1, dst); 6528 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6529 andq(rtmp1, rtmp2); 6530 shlq(rtmp1, 4); 6531 notq(rtmp2); 6532 andq(dst, rtmp2); 6533 shrq(dst, 4); 6534 orq(dst, rtmp1); 6535 } 6536 bswapq(dst); 6537 } 6538 6539 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6540 Label done; 6541 Label neg_divisor_fastpath; 6542 cmpq(divisor, 0); 6543 jccb(Assembler::less, neg_divisor_fastpath); 6544 xorl(rdx, rdx); 6545 divq(divisor); 6546 jmpb(done); 6547 bind(neg_divisor_fastpath); 6548 // Fastpath for divisor < 0: 6549 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6550 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6551 movq(rdx, rax); 6552 subq(rdx, divisor); 6553 if (VM_Version::supports_bmi1()) { 6554 andnq(rax, rdx, rax); 6555 } else { 6556 notq(rdx); 6557 andq(rax, rdx); 6558 } 6559 shrq(rax, 63); 6560 bind(done); 6561 } 6562 6563 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6564 Label done; 6565 Label neg_divisor_fastpath; 6566 cmpq(divisor, 0); 6567 jccb(Assembler::less, neg_divisor_fastpath); 6568 xorq(rdx, rdx); 6569 divq(divisor); 6570 jmp(done); 6571 bind(neg_divisor_fastpath); 6572 // Fastpath when divisor < 0: 6573 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6574 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6575 movq(rdx, rax); 6576 subq(rax, divisor); 6577 if (VM_Version::supports_bmi1()) { 6578 andnq(rax, rax, rdx); 6579 } else { 6580 notq(rax); 6581 andq(rax, rdx); 6582 } 6583 sarq(rax, 63); 6584 andq(rax, divisor); 6585 subq(rdx, rax); 6586 bind(done); 6587 } 6588 6589 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6590 Label done; 6591 Label neg_divisor_fastpath; 6592 cmpq(divisor, 0); 6593 jccb(Assembler::less, neg_divisor_fastpath); 6594 xorq(rdx, rdx); 6595 divq(divisor); 6596 jmp(done); 6597 bind(neg_divisor_fastpath); 6598 // Fastpath for divisor < 0: 6599 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6600 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6601 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6602 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6603 movq(rdx, rax); 6604 subq(rax, divisor); 6605 if (VM_Version::supports_bmi1()) { 6606 andnq(rax, rax, rdx); 6607 } else { 6608 notq(rax); 6609 andq(rax, rdx); 6610 } 6611 movq(tmp, rax); 6612 shrq(rax, 63); // quotient 6613 sarq(tmp, 63); 6614 andq(tmp, divisor); 6615 subq(rdx, tmp); // remainder 6616 bind(done); 6617 } 6618 #endif 6619 6620 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6621 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6622 int vlen_enc) { 6623 assert(VM_Version::supports_avx512bw(), ""); 6624 // Byte shuffles are inlane operations and indices are determined using 6625 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6626 // normalized to index range 0-15. This makes sure that all the multiples 6627 // of an index value are placed at same relative position in 128 bit 6628 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6629 // will be 16th element in their respective 128 bit lanes. 
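  // Scalar view of the selection performed below, for reference (illustrative only):
  //
  //   for (int i = 0; i < 64; i++) {             // 64 byte lanes at 512 bits
  //     int idx  = shuffle[i] & 0x3F;            // source index, 0..63
  //     int lane = idx >> 4;                     // 128-bit source lane, 0..3
  //     dst[i]   = src[lane * 16 + (idx & 0x0F)];
  //   }
  //
  // Each masked shuffle below handles the destination positions whose indices select
  // from one source lane: that lane is broadcast across the vector and the in-lane
  // byte shuffle picks byte (idx & 0x0F) for exactly those positions.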
6630 movl(rtmp, 16); 6631 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6632 6633 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6634 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6635 // original shuffle indices and move the shuffled lanes corresponding to true 6636 // mask to destination vector. 6637 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6638 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6639 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6640 6641 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6642 // and broadcasting second 128 bit lane. 6643 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6644 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6645 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6646 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6647 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6648 6649 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6650 // and broadcasting third 128 bit lane. 6651 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6652 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6653 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6654 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6655 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6656 6657 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6658 // and broadcasting third 128 bit lane. 6659 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6660 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6661 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6662 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6663 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6664 } 6665 6666 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6667 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6668 if (vlen_enc == AVX_128bit) { 6669 vpermilps(dst, src, shuffle, vlen_enc); 6670 } else if (bt == T_INT) { 6671 vpermd(dst, shuffle, src, vlen_enc); 6672 } else { 6673 assert(bt == T_FLOAT, ""); 6674 vpermps(dst, shuffle, src, vlen_enc); 6675 } 6676 } 6677 6678 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6679 switch(opcode) { 6680 case Op_AddHF: vaddsh(dst, src1, src2); break; 6681 case Op_SubHF: vsubsh(dst, src1, src2); break; 6682 case Op_MulHF: vmulsh(dst, src1, src2); break; 6683 case Op_DivHF: vdivsh(dst, src1, src2); break; 6684 case Op_MaxHF: vmaxsh(dst, src1, src2); break; 6685 case Op_MinHF: vminsh(dst, src1, src2); break; 6686 default: assert(false, "%s", NodeClassNames[opcode]); break; 6687 } 6688 } 6689 6690 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6691 switch(elem_bt) { 6692 case T_BYTE: 6693 if (ideal_opc == Op_SaturatingAddV) { 6694 vpaddsb(dst, src1, src2, vlen_enc); 6695 } else { 6696 assert(ideal_opc == Op_SaturatingSubV, ""); 6697 vpsubsb(dst, src1, src2, vlen_enc); 6698 } 6699 break; 6700 case T_SHORT: 6701 if (ideal_opc == Op_SaturatingAddV) { 6702 vpaddsw(dst, src1, src2, vlen_enc); 6703 } else { 6704 assert(ideal_opc == Op_SaturatingSubV, ""); 6705 vpsubsw(dst, src1, src2, vlen_enc); 6706 } 6707 break; 6708 default: 6709 fatal("Unsupported type %s", type2name(elem_bt)); 6710 break; 6711 } 6712 } 6713 6714 void 
C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6715   switch(elem_bt) {
6716     case T_BYTE:
6717       if (ideal_opc == Op_SaturatingAddV) {
6718         vpaddusb(dst, src1, src2, vlen_enc);
6719       } else {
6720         assert(ideal_opc == Op_SaturatingSubV, "");
6721         vpsubusb(dst, src1, src2, vlen_enc);
6722       }
6723       break;
6724     case T_SHORT:
6725       if (ideal_opc == Op_SaturatingAddV) {
6726         vpaddusw(dst, src1, src2, vlen_enc);
6727       } else {
6728         assert(ideal_opc == Op_SaturatingSubV, "");
6729         vpsubusw(dst, src1, src2, vlen_enc);
6730       }
6731       break;
6732     default:
6733       fatal("Unsupported type %s", type2name(elem_bt));
6734       break;
6735   }
6736 }
6737
6738 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6739                                                                XMMRegister src2, KRegister ktmp, int vlen_enc) {
6740   // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6741   // overflow_mask = Inp1 <u Inp2
6742   evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6743   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6744   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6745 }
6746
6747 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6748                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6749   // Emulate unsigned comparison using signed comparison
6750   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6751   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6752   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6753   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6754
6755   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6756
6757   // Res = INP1 - INP2 (non-commutative and non-associative)
6758   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6759   // Res = Mask ? Zero : Res
6760   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6761   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6762 }
6763
6764 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6765                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6766   // Unsigned value ranges comprise only +ve numbers, thus there exists only an upper bound saturation.
6767   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6768   // Res = Signed Add INP1, INP2
6769   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6770   // T1 = SRC1 | SRC2
6771   vpor(xtmp1, src1, src2, vlen_enc);
6772   // Max_Unsigned = -1
6773   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6774   // Unsigned compare: Mask = Res <u T1
6775   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6776   // res = Mask ? Max_Unsigned : Res
6777   evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6778 }
6779
6780 //
6781 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the
6782 // saturating unsigned addition operation.
6783 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6784 //
6785 // We empirically determined its semantic equivalence to the following reduced expression
6786 // overflow_mask = (a + b) <u (a | b)
6787 //
6788 // and also verified it through the Alive2 solver.
6789 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6790 // 6791 6792 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6793 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6794 // Res = Signed Add INP1, INP2 6795 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6796 // Compute T1 = INP1 | INP2 6797 vpor(xtmp3, src1, src2, vlen_enc); 6798 // T1 = Minimum signed value. 6799 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6800 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6801 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6802 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6803 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6804 // Compute overflow detection mask = Res<1> <s T1 6805 if (elem_bt == T_INT) { 6806 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6807 } else { 6808 assert(elem_bt == T_LONG, ""); 6809 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6810 } 6811 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6812 } 6813 6814 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6815 int vlen_enc, bool xtmp2_hold_M1) { 6816 if (VM_Version::supports_avx512dq()) { 6817 evpmovq2m(ktmp, src, vlen_enc); 6818 } else { 6819 assert(VM_Version::supports_evex(), ""); 6820 if (!xtmp2_hold_M1) { 6821 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6822 } 6823 evpsraq(xtmp1, src, 63, vlen_enc); 6824 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6825 } 6826 } 6827 6828 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6829 int vlen_enc, bool xtmp2_hold_M1) { 6830 if (VM_Version::supports_avx512dq()) { 6831 evpmovd2m(ktmp, src, vlen_enc); 6832 } else { 6833 assert(VM_Version::supports_evex(), ""); 6834 if (!xtmp2_hold_M1) { 6835 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6836 } 6837 vpsrad(xtmp1, src, 31, vlen_enc); 6838 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6839 } 6840 } 6841 6842 6843 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6844 if (elem_bt == T_LONG) { 6845 if (VM_Version::supports_evex()) { 6846 evpsraq(dst, src, 63, vlen_enc); 6847 } else { 6848 vpsrad(dst, src, 31, vlen_enc); 6849 vpshufd(dst, dst, 0xF5, vlen_enc); 6850 } 6851 } else { 6852 assert(elem_bt == T_INT, ""); 6853 vpsrad(dst, src, 31, vlen_enc); 6854 } 6855 } 6856 6857 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6858 if (compute_allones) { 6859 if (vlen_enc == Assembler::AVX_512bit) { 6860 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6861 } else { 6862 vpcmpeqq(allones, allones, allones, vlen_enc); 6863 } 6864 } 6865 if (elem_bt == T_LONG) { 6866 vpsrlq(dst, allones, 1, vlen_enc); 6867 } else { 6868 assert(elem_bt == T_INT, ""); 6869 vpsrld(dst, allones, 1, vlen_enc); 6870 } 6871 } 6872 6873 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6874 if (compute_allones) { 6875 if (vlen_enc == Assembler::AVX_512bit) { 6876 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6877 } else { 6878 vpcmpeqq(allones, allones, allones, vlen_enc); 6879 } 6880 } 6881 if (elem_bt == T_LONG) { 6882 vpsllq(dst, allones, 63, vlen_enc); 6883 } else { 6884 assert(elem_bt == T_INT, ""); 6885 vpslld(dst, allones, 31, vlen_enc); 6886 } 6887 } 6888 6889 void 
C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6890                            Assembler::ComparisonPredicate cond, int vlen_enc) {
6891   switch(elem_bt) {
6892     case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6893     case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6894     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6895     case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6896     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6897   }
6898 }
6899
6900 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6901   switch(elem_bt) {
6902     case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6903     case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6904     case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6905     case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6906     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6907   }
6908 }
6909
6910 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6911                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6912   if (elem_bt == T_LONG) {
6913     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6914   } else {
6915     assert(elem_bt == T_INT, "");
6916     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6917   }
6918 }
6919
6920 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6921                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6922                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6923   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6924   // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
6925   // Overflow detection based on Hacker's Delight, section 2-13.
6926   if (ideal_opc == Op_SaturatingAddV) {
6927     // res = src1 + src2
6928     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6929     // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
6930     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6931     vpxor(xtmp1, dst, src1, vlen_enc);
6932     vpxor(xtmp2, dst, src2, vlen_enc);
6933     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6934   } else {
6935     assert(ideal_opc == Op_SaturatingSubV, "");
6936     // res = src1 - src2
6937     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6938     // Overflow occurs when both inputs have opposite polarity and
6939     // result polarity does not comply with first input polarity.
6940     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6941     vpxor(xtmp1, src1, src2, vlen_enc);
6942     vpxor(xtmp2, dst, src1, vlen_enc);
6943     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6944   }
6945
6946   // Compute overflow detection mask.
6947   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6948   // Note: xtmp1 holds -1 in all its lanes after above call.
6949
6950   // Compute mask based on first input polarity.
6951   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6952
6953   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6954   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6955
6956   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6957   // set bits in first input polarity mask hold a min value.
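  // Scalar equivalent of the two blends below, for reference (illustrative only):
  //
  //   sat = (src1 < 0) ? MIN_VALUE : MAX_VALUE;   // saturation value from src1's sign
  //   res = overflow ? sat : res;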
6958   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6959   // Blend destination lanes with saturated values using overflow detection mask.
6960   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6961 }
6962
6963
6964 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6965                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6966                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6967   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6968   // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
6969   // Overflow detection based on Hacker's Delight, section 2-13.
6970   if (ideal_opc == Op_SaturatingAddV) {
6971     // res = src1 + src2
6972     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6973     // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
6974     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6975     vpxor(xtmp1, dst, src1, vlen_enc);
6976     vpxor(xtmp2, dst, src2, vlen_enc);
6977     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6978   } else {
6979     assert(ideal_opc == Op_SaturatingSubV, "");
6980     // res = src1 - src2
6981     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6982     // Overflow occurs when both inputs have opposite polarity and
6983     // result polarity does not comply with first input polarity.
6984     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6985     vpxor(xtmp1, src1, src2, vlen_enc);
6986     vpxor(xtmp2, dst, src1, vlen_enc);
6987     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6988   }
6989
6990   // Sign-extend to compute overflow detection mask.
6991   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6992
6993   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6994   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6995   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6996
6997   // Compose saturating min/max vector using first input polarity mask.
6998   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6999   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7000
7001   // Blend result with saturating vector using overflow detection mask.
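  // vpblendvb selects each result byte from the saturating vector when the corresponding
  // mask byte has its most significant bit set; the overflow mask was sign-extended across
  // the full 32/64-bit lane above so that every byte of an overflowed lane carries that bit.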
7002 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7003 } 7004 7005 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7006 switch(elem_bt) { 7007 case T_BYTE: 7008 if (ideal_opc == Op_SaturatingAddV) { 7009 vpaddsb(dst, src1, src2, vlen_enc); 7010 } else { 7011 assert(ideal_opc == Op_SaturatingSubV, ""); 7012 vpsubsb(dst, src1, src2, vlen_enc); 7013 } 7014 break; 7015 case T_SHORT: 7016 if (ideal_opc == Op_SaturatingAddV) { 7017 vpaddsw(dst, src1, src2, vlen_enc); 7018 } else { 7019 assert(ideal_opc == Op_SaturatingSubV, ""); 7020 vpsubsw(dst, src1, src2, vlen_enc); 7021 } 7022 break; 7023 default: 7024 fatal("Unsupported type %s", type2name(elem_bt)); 7025 break; 7026 } 7027 } 7028 7029 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7030 switch(elem_bt) { 7031 case T_BYTE: 7032 if (ideal_opc == Op_SaturatingAddV) { 7033 vpaddusb(dst, src1, src2, vlen_enc); 7034 } else { 7035 assert(ideal_opc == Op_SaturatingSubV, ""); 7036 vpsubusb(dst, src1, src2, vlen_enc); 7037 } 7038 break; 7039 case T_SHORT: 7040 if (ideal_opc == Op_SaturatingAddV) { 7041 vpaddusw(dst, src1, src2, vlen_enc); 7042 } else { 7043 assert(ideal_opc == Op_SaturatingSubV, ""); 7044 vpsubusw(dst, src1, src2, vlen_enc); 7045 } 7046 break; 7047 default: 7048 fatal("Unsupported type %s", type2name(elem_bt)); 7049 break; 7050 } 7051 } 7052 7053 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7054 XMMRegister src2, int vlen_enc) { 7055 switch(elem_bt) { 7056 case T_BYTE: 7057 evpermi2b(dst, src1, src2, vlen_enc); 7058 break; 7059 case T_SHORT: 7060 evpermi2w(dst, src1, src2, vlen_enc); 7061 break; 7062 case T_INT: 7063 evpermi2d(dst, src1, src2, vlen_enc); 7064 break; 7065 case T_LONG: 7066 evpermi2q(dst, src1, src2, vlen_enc); 7067 break; 7068 case T_FLOAT: 7069 evpermi2ps(dst, src1, src2, vlen_enc); 7070 break; 7071 case T_DOUBLE: 7072 evpermi2pd(dst, src1, src2, vlen_enc); 7073 break; 7074 default: 7075 fatal("Unsupported type %s", type2name(elem_bt)); 7076 break; 7077 } 7078 } 7079 7080 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7081 if (is_unsigned) { 7082 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7083 } else { 7084 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7085 } 7086 } 7087 7088 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7089 if (is_unsigned) { 7090 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7091 } else { 7092 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7093 } 7094 }
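// Scalar reference for the unsigned saturating byte forms selected above (illustrative
// only; the 16-bit forms are analogous):
//
//   static inline uint8_t sat_addu8(uint8_t a, uint8_t b) {
//     unsigned s = (unsigned)a + (unsigned)b;
//     return (uint8_t)(s > 0xFF ? 0xFF : s);   // clamp to the unsigned maximum
//   }
//
//   static inline uint8_t sat_subu8(uint8_t a, uint8_t b) {
//     return (uint8_t)(a < b ? 0 : a - b);     // clamp to zero
//   }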