1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. These methods would accept arguments as 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 188 // indications in the icc.ZFlag. 
fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
#else
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
#ifdef _LP64
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
#endif
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
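//
// Illustrative sketch only (not emitted code, and simplified relative to the real
// matcher rules): the .ad-file users of these helpers (see cmpFastLock and
// cmpFastUnlock) consume the ZF protocol roughly as follows, with the label and
// register names here chosen purely for illustration:
//
//   fast_lock(obj, box, rax, scr, ...);   // leaves ZF == 1 on fast-path success
//   jcc(Assembler::notZero, slow_path);   // ZF == 0 -> call the runtime slow path
//   // ... fast path falls through ...
//
// The authoritative register constraints and expansions live in the x86 .ad files.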
390 391 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 392 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 393 assert(boxReg == rax, ""); 394 assert_different_registers(objReg, boxReg, tmpReg); 395 396 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 397 398 if (LockingMode == LM_LEGACY) { 399 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 400 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 401 } 402 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 403 if (LockingMode != LM_MONITOR) { 404 testptr(tmpReg, markWord::monitor_value); // Inflated? 405 jcc(Assembler::zero, Stacked); 406 } 407 408 // It's inflated. 409 410 #ifndef _LP64 411 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 412 orl(boxReg, 1); // set ICC.ZF=0 to indicate failure 413 jmpb(DONE_LABEL); 414 #else 415 // Despite our balanced locking property we still check that m->_owner == Self 416 // as java routines or native JNI code called by this thread might 417 // have released the lock. 418 // Refer to the comments in synchronizer.cpp for how we might encode extra 419 // state in _succ so we can avoid fetching EntryList|cxq. 420 // 421 // If there's no contention try a 1-0 exit. That is, exit without 422 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 423 // we detect and recover from the race that the 1-0 exit admits. 424 // 425 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 426 // before it STs null into _owner, releasing the lock. Updates 427 // to data protected by the critical section must be visible before 428 // we drop the lock (and thus before any other thread could acquire 429 // the lock and observe the fields protected by the lock). 430 // IA32's memory-model is SPO, so STs are ordered with respect to 431 // each other and there's no need for an explicit barrier (fence). 432 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 433 Label LSuccess, LNotRecursive; 434 435 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 436 jccb(Assembler::equal, LNotRecursive); 437 438 // Recursive inflated unlock 439 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 440 jmpb(LSuccess); 441 442 bind(LNotRecursive); 443 444 // Set owner to null. 445 // Release to satisfy the JMM 446 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 447 // We need a full fence after clearing owner to avoid stranding. 448 // StoreLoad achieves this. 449 membar(StoreLoad); 450 451 // Check if the entry lists are empty (EntryList first - by convention). 452 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 453 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 454 jccb(Assembler::zero, LSuccess); // If so we are done. 455 456 // Check if there is a successor. 457 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 458 jccb(Assembler::notZero, LSuccess); // If so we are done. 459 460 // Save the monitor pointer in the current thread, so we can try to 461 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 
462 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 463 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 464 465 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 466 jmpb (DONE_LABEL); 467 468 bind (LSuccess); 469 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 470 jmpb (DONE_LABEL); 471 #endif // _LP64 472 473 if (LockingMode == LM_LEGACY) { 474 bind (Stacked); 475 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 476 lock(); 477 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 478 // Intentional fall-thru into DONE_LABEL 479 } 480 481 bind(DONE_LABEL); 482 483 // ZFlag == 1 count in fast path 484 // ZFlag == 0 count in slow path 485 jccb(Assembler::notZero, NO_COUNT); 486 487 bind(COUNT); 488 489 if (LockingMode == LM_LEGACY) { 490 // Count monitors in fast path 491 #ifdef _LP64 492 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 493 #endif 494 } 495 496 xorl(tmpReg, tmpReg); // Set ZF == 1 497 498 bind(NO_COUNT); 499 } 500 501 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 502 Register t, Register thread) { 503 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 504 assert(rax_reg == rax, "Used for CAS"); 505 assert_different_registers(obj, box, rax_reg, t, thread); 506 507 // Handle inflated monitor. 508 Label inflated; 509 // Finish fast lock successfully. ZF value is irrelevant. 510 Label locked; 511 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 512 Label slow_path; 513 514 if (UseObjectMonitorTable) { 515 // Clear cache in case fast locking succeeds. 516 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 517 } 518 519 if (DiagnoseSyncOnValueBasedClasses != 0) { 520 load_klass(rax_reg, obj, t); 521 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 522 jcc(Assembler::notZero, slow_path); 523 } 524 525 const Register mark = t; 526 527 { // Lightweight Lock 528 529 Label push; 530 531 const Register top = UseObjectMonitorTable ? rax_reg : box; 532 533 // Load the mark. 534 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 535 536 // Prefetch top. 537 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 538 539 // Check for monitor (0b10). 540 testptr(mark, markWord::monitor_value); 541 jcc(Assembler::notZero, inflated); 542 543 // Check if lock-stack is full. 544 cmpl(top, LockStack::end_offset() - 1); 545 jcc(Assembler::greater, slow_path); 546 547 // Check if recursive. 548 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 549 jccb(Assembler::equal, push); 550 551 // Try to lock. Transition lock bits 0b01 => 0b00 552 movptr(rax_reg, mark); 553 orptr(rax_reg, markWord::unlocked_value); 554 andptr(mark, ~(int32_t)markWord::unlocked_value); 555 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 556 jcc(Assembler::notEqual, slow_path); 557 558 if (UseObjectMonitorTable) { 559 // Need to reload top, clobbered by CAS. 560 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 561 } 562 bind(push); 563 // After successful lock, push object on lock-stack. 564 movptr(Address(thread, top), obj); 565 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 566 jmpb(locked); 567 } 568 569 { // Handle inflated monitor. 570 bind(inflated); 571 572 #ifndef _LP64 573 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 
574 orl(box, 1); // set ICC.ZF=0 to indicate failure 575 jmpb(slow_path); 576 #else 577 const Register monitor = t; 578 579 if (!UseObjectMonitorTable) { 580 assert(mark == monitor, "should be the same here"); 581 } else { 582 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 583 // Fetch ObjectMonitor* from the cache or take the slow-path. 584 Label monitor_found; 585 586 // Load cache address 587 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 588 589 const int num_unrolled = 2; 590 for (int i = 0; i < num_unrolled; i++) { 591 cmpptr(obj, Address(t)); 592 jccb(Assembler::equal, monitor_found); 593 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 594 } 595 596 Label loop; 597 598 // Search for obj in cache. 599 bind(loop); 600 601 // Check for match. 602 cmpptr(obj, Address(t)); 603 jccb(Assembler::equal, monitor_found); 604 605 // Search until null encountered, guaranteed _null_sentinel at end. 606 cmpptr(Address(t), 1); 607 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 608 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 609 jmpb(loop); 610 611 // Cache hit. 612 bind(monitor_found); 613 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 614 } 615 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 616 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 617 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 618 619 Label monitor_locked; 620 // Lock the monitor. 621 622 if (UseObjectMonitorTable) { 623 // Cache the monitor for unlock before trashing box. On failure to acquire 624 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 625 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 626 } 627 628 // Try to CAS owner (no owner => current thread's _lock_id). 629 xorptr(rax_reg, rax_reg); 630 movptr(box, Address(thread, JavaThread::lock_id_offset())); 631 lock(); cmpxchgptr(box, owner_address); 632 jccb(Assembler::equal, monitor_locked); 633 634 // Check if recursive. 635 cmpptr(box, rax_reg); 636 jccb(Assembler::notEqual, slow_path); 637 638 // Recursive. 639 increment(recursions_address); 640 641 bind(monitor_locked); 642 #endif // _LP64 643 } 644 645 bind(locked); 646 // Set ZF = 1 647 xorl(rax_reg, rax_reg); 648 649 #ifdef ASSERT 650 // Check that locked label is reached with ZF set. 651 Label zf_correct; 652 Label zf_bad_zero; 653 jcc(Assembler::zero, zf_correct); 654 jmp(zf_bad_zero); 655 #endif 656 657 bind(slow_path); 658 #ifdef ASSERT 659 // Check that slow_path label is reached with ZF not set. 660 jcc(Assembler::notZero, zf_correct); 661 stop("Fast Lock ZF != 0"); 662 bind(zf_bad_zero); 663 stop("Fast Lock ZF != 1"); 664 bind(zf_correct); 665 #endif 666 // C2 uses the value of ZF to determine the continuation. 667 } 668 669 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 670 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 671 assert(reg_rax == rax, "Used for CAS"); 672 assert_different_registers(obj, reg_rax, t); 673 674 // Handle inflated monitor. 675 Label inflated, inflated_check_lock_stack; 676 // Finish fast unlock successfully. MUST jump with ZF == 1 677 Label unlocked, slow_path; 678 679 const Register mark = t; 680 const Register monitor = t; 681 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 682 const Register box = reg_rax; 683 684 Label dummy; 685 C2FastUnlockLightweightStub* stub = nullptr; 686 687 if (!Compile::current()->output()->in_scratch_emit_size()) { 688 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 689 Compile::current()->output()->add_stub(stub); 690 } 691 692 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 693 694 { // Lightweight Unlock 695 696 // Load top. 697 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 698 699 if (!UseObjectMonitorTable) { 700 // Prefetch mark. 701 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 702 } 703 704 // Check if obj is top of lock-stack. 705 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 706 // Top of lock stack was not obj. Must be monitor. 707 jcc(Assembler::notEqual, inflated_check_lock_stack); 708 709 // Pop lock-stack. 710 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 711 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 712 713 // Check if recursive. 714 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 715 jcc(Assembler::equal, unlocked); 716 717 // We elide the monitor check, let the CAS fail instead. 718 719 if (UseObjectMonitorTable) { 720 // Load mark. 721 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 722 } 723 724 // Try to unlock. Transition lock bits 0b00 => 0b01 725 movptr(reg_rax, mark); 726 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 727 orptr(mark, markWord::unlocked_value); 728 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 729 jcc(Assembler::notEqual, push_and_slow_path); 730 jmp(unlocked); 731 } 732 733 734 { // Handle inflated monitor. 735 bind(inflated_check_lock_stack); 736 #ifdef ASSERT 737 Label check_done; 738 subl(top, oopSize); 739 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 740 jcc(Assembler::below, check_done); 741 cmpptr(obj, Address(thread, top)); 742 jccb(Assembler::notEqual, inflated_check_lock_stack); 743 stop("Fast Unlock lock on stack"); 744 bind(check_done); 745 if (UseObjectMonitorTable) { 746 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 747 } 748 testptr(mark, markWord::monitor_value); 749 jccb(Assembler::notZero, inflated); 750 stop("Fast Unlock not monitor"); 751 #endif 752 753 bind(inflated); 754 755 #ifndef _LP64 756 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 757 orl(t, 1); // set ICC.ZF=0 to indicate failure 758 jmpb(slow_path); 759 #else 760 if (!UseObjectMonitorTable) { 761 assert(mark == monitor, "should be the same here"); 762 } else { 763 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 764 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 765 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 766 cmpptr(monitor, alignof(ObjectMonitor*)); 767 jcc(Assembler::below, slow_path); 768 } 769 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 
0 : checked_cast<int>(markWord::monitor_value)); 770 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 771 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 772 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 773 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 774 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 775 776 Label recursive; 777 778 // Check if recursive. 779 cmpptr(recursions_address, 0); 780 jccb(Assembler::notZero, recursive); 781 782 // Set owner to null. 783 // Release to satisfy the JMM 784 movptr(owner_address, NULL_WORD); 785 // We need a full fence after clearing owner to avoid stranding. 786 // StoreLoad achieves this. 787 membar(StoreLoad); 788 789 // Check if the entry lists are empty (EntryList first - by convention). 790 movptr(reg_rax, EntryList_address); 791 orptr(reg_rax, cxq_address); 792 jccb(Assembler::zero, unlocked); // If so we are done. 793 794 // Check if there is a successor. 795 cmpptr(succ_address, NULL_WORD); 796 jccb(Assembler::notZero, unlocked); // If so we are done. 797 798 // Save the monitor pointer in the current thread, so we can try to 799 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 800 if (!UseObjectMonitorTable) { 801 andptr(monitor, ~(int32_t)markWord::monitor_value); 802 } 803 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 804 805 orl(t, 1); // Fast Unlock ZF = 0 806 jmpb(slow_path); 807 808 // Recursive unlock. 809 bind(recursive); 810 decrement(recursions_address); 811 #endif // _LP64 812 } 813 814 bind(unlocked); 815 xorl(t, t); // Fast Unlock ZF = 1 816 817 #ifdef ASSERT 818 // Check that unlocked label is reached with ZF set. 819 Label zf_correct; 820 Label zf_bad_zero; 821 jcc(Assembler::zero, zf_correct); 822 jmp(zf_bad_zero); 823 #endif 824 825 bind(slow_path); 826 if (stub != nullptr) { 827 bind(stub->slow_path_continuation()); 828 } 829 #ifdef ASSERT 830 // Check that stub->continuation() label is reached with ZF not set. 831 jcc(Assembler::notZero, zf_correct); 832 stop("Fast Unlock ZF != 0"); 833 bind(zf_bad_zero); 834 stop("Fast Unlock ZF != 1"); 835 bind(zf_correct); 836 #endif 837 // C2 uses the value of ZF to determine the continuation. 
838 } 839 840 //------------------------------------------------------------------------------------------- 841 // Generic instructions support for use in .ad files C2 code generation 842 843 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 844 if (dst != src) { 845 movdqu(dst, src); 846 } 847 if (opcode == Op_AbsVD) { 848 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 849 } else { 850 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 851 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 852 } 853 } 854 855 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 856 if (opcode == Op_AbsVD) { 857 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 858 } else { 859 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 860 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 861 } 862 } 863 864 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 865 if (dst != src) { 866 movdqu(dst, src); 867 } 868 if (opcode == Op_AbsVF) { 869 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 870 } else { 871 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 872 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 873 } 874 } 875 876 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 877 if (opcode == Op_AbsVF) { 878 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 879 } else { 880 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 881 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 882 } 883 } 884 885 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 886 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 887 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 888 889 if (opcode == Op_MinV) { 890 if (elem_bt == T_BYTE) { 891 pminsb(dst, src); 892 } else if (elem_bt == T_SHORT) { 893 pminsw(dst, src); 894 } else if (elem_bt == T_INT) { 895 pminsd(dst, src); 896 } else { 897 assert(elem_bt == T_LONG, "required"); 898 assert(tmp == xmm0, "required"); 899 assert_different_registers(dst, src, tmp); 900 movdqu(xmm0, dst); 901 pcmpgtq(xmm0, src); 902 blendvpd(dst, src); // xmm0 as mask 903 } 904 } else { // opcode == Op_MaxV 905 if (elem_bt == T_BYTE) { 906 pmaxsb(dst, src); 907 } else if (elem_bt == T_SHORT) { 908 pmaxsw(dst, src); 909 } else if (elem_bt == T_INT) { 910 pmaxsd(dst, src); 911 } else { 912 assert(elem_bt == T_LONG, "required"); 913 assert(tmp == xmm0, "required"); 914 assert_different_registers(dst, src, tmp); 915 movdqu(xmm0, src); 916 pcmpgtq(xmm0, dst); 917 blendvpd(dst, src); // xmm0 as mask 918 } 919 } 920 } 921 922 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 923 XMMRegister src1, Address src2, int vlen_enc) { 924 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 925 if (opcode == Op_UMinV) { 926 switch(elem_bt) { 927 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 928 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 929 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 930 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 931 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 932 } 933 } else { 934 assert(opcode == Op_UMaxV, "required"); 935 switch(elem_bt) { 936 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 937 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 938 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 939 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 940 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 941 } 942 } 943 } 944 945 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 946 // For optimality, leverage a full vector width of 512 bits 947 // for operations over smaller vector sizes on AVX512 targets. 948 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 949 if (opcode == Op_UMaxV) { 950 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 951 } else { 952 assert(opcode == Op_UMinV, "required"); 953 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 954 } 955 } else { 956 // T1 = -1 957 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 958 // T1 = -1 << 63 959 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 960 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 961 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 962 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 963 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 964 // Mask = T2 > T1 965 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 966 if (opcode == Op_UMaxV) { 967 // Res = Mask ? Src2 : Src1 968 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 969 } else { 970 // Res = Mask ? Src1 : Src2 971 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 972 } 973 } 974 } 975 976 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 977 XMMRegister src1, XMMRegister src2, int vlen_enc) { 978 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 979 if (opcode == Op_UMinV) { 980 switch(elem_bt) { 981 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 982 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 983 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 984 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 985 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 986 } 987 } else { 988 assert(opcode == Op_UMaxV, "required"); 989 switch(elem_bt) { 990 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 991 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 992 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 993 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 994 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 995 } 996 } 997 } 998 999 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1000 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1001 int vlen_enc) { 1002 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1003 1004 if (opcode == Op_MinV) { 1005 if (elem_bt == T_BYTE) { 1006 vpminsb(dst, src1, src2, vlen_enc); 1007 } else if (elem_bt == T_SHORT) { 1008 vpminsw(dst, src1, src2, vlen_enc); 1009 } else if (elem_bt == T_INT) { 1010 vpminsd(dst, src1, src2, vlen_enc); 1011 } else { 1012 assert(elem_bt == T_LONG, "required"); 1013 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1014 vpminsq(dst, src1, src2, vlen_enc); 1015 } else { 1016 assert_different_registers(dst, src1, src2); 1017 vpcmpgtq(dst, src1, src2, vlen_enc); 1018 vblendvpd(dst, src1, src2, dst, vlen_enc); 1019 } 1020 } 1021 } else 
{ // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *  btmp = (b < +0.0) ? a : b
   *  atmp = (b < +0.0) ? b : a
   *  Tmp  = Max_Float(atmp, btmp)
   *  Res  = (atmp == NaN) ?
atmp : Tmp 1074 */ 1075 1076 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1077 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1078 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1079 XMMRegister mask; 1080 1081 if (!is_double_word && is_min) { 1082 mask = a; 1083 vblend = &MacroAssembler::vblendvps; 1084 vmaxmin = &MacroAssembler::vminps; 1085 vcmp = &MacroAssembler::vcmpps; 1086 } else if (!is_double_word && !is_min) { 1087 mask = b; 1088 vblend = &MacroAssembler::vblendvps; 1089 vmaxmin = &MacroAssembler::vmaxps; 1090 vcmp = &MacroAssembler::vcmpps; 1091 } else if (is_double_word && is_min) { 1092 mask = a; 1093 vblend = &MacroAssembler::vblendvpd; 1094 vmaxmin = &MacroAssembler::vminpd; 1095 vcmp = &MacroAssembler::vcmppd; 1096 } else { 1097 assert(is_double_word && !is_min, "sanity"); 1098 mask = b; 1099 vblend = &MacroAssembler::vblendvpd; 1100 vmaxmin = &MacroAssembler::vmaxpd; 1101 vcmp = &MacroAssembler::vcmppd; 1102 } 1103 1104 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1105 XMMRegister maxmin, scratch; 1106 if (dst == btmp) { 1107 maxmin = btmp; 1108 scratch = tmp; 1109 } else { 1110 maxmin = tmp; 1111 scratch = btmp; 1112 } 1113 1114 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1115 if (precompute_mask && !is_double_word) { 1116 vpsrad(tmp, mask, 32, vlen_enc); 1117 mask = tmp; 1118 } else if (precompute_mask && is_double_word) { 1119 vpxor(tmp, tmp, tmp, vlen_enc); 1120 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1121 mask = tmp; 1122 } 1123 1124 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1125 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1126 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1127 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1128 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1129 } 1130 1131 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1132 XMMRegister dst, XMMRegister a, XMMRegister b, 1133 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1134 int vlen_enc) { 1135 assert(UseAVX > 2, "required"); 1136 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1137 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1138 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1139 assert_different_registers(dst, a, atmp, btmp); 1140 assert_different_registers(dst, b, atmp, btmp); 1141 1142 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1143 bool is_double_word = is_double_word_type(elem_bt); 1144 bool merge = true; 1145 1146 if (!is_double_word && is_min) { 1147 evpmovd2m(ktmp, a, vlen_enc); 1148 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1149 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1150 vminps(dst, atmp, btmp, vlen_enc); 1151 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1152 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1153 } else if (!is_double_word && !is_min) { 1154 evpmovd2m(ktmp, b, vlen_enc); 1155 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1156 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1157 vmaxps(dst, atmp, btmp, vlen_enc); 1158 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1159 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1160 } else if (is_double_word && is_min) { 1161 evpmovq2m(ktmp, a, vlen_enc); 1162 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1163 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1164 
vminpd(dst, atmp, btmp, vlen_enc); 1165 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1166 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1167 } else { 1168 assert(is_double_word && !is_min, "sanity"); 1169 evpmovq2m(ktmp, b, vlen_enc); 1170 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1171 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1172 vmaxpd(dst, atmp, btmp, vlen_enc); 1173 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1174 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1175 } 1176 } 1177 1178 // Float/Double signum 1179 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1180 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1181 1182 Label DONE_LABEL; 1183 1184 if (opcode == Op_SignumF) { 1185 assert(UseSSE > 0, "required"); 1186 ucomiss(dst, zero); 1187 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1188 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1189 movflt(dst, one); 1190 jcc(Assembler::above, DONE_LABEL); 1191 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1192 } else if (opcode == Op_SignumD) { 1193 assert(UseSSE > 1, "required"); 1194 ucomisd(dst, zero); 1195 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1196 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1197 movdbl(dst, one); 1198 jcc(Assembler::above, DONE_LABEL); 1199 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1200 } 1201 1202 bind(DONE_LABEL); 1203 } 1204 1205 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1206 if (sign) { 1207 pmovsxbw(dst, src); 1208 } else { 1209 pmovzxbw(dst, src); 1210 } 1211 } 1212 1213 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1214 if (sign) { 1215 vpmovsxbw(dst, src, vector_len); 1216 } else { 1217 vpmovzxbw(dst, src, vector_len); 1218 } 1219 } 1220 1221 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1222 if (sign) { 1223 vpmovsxbd(dst, src, vector_len); 1224 } else { 1225 vpmovzxbd(dst, src, vector_len); 1226 } 1227 } 1228 1229 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1230 if (sign) { 1231 vpmovsxwd(dst, src, vector_len); 1232 } else { 1233 vpmovzxwd(dst, src, vector_len); 1234 } 1235 } 1236 1237 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1238 int shift, int vector_len) { 1239 if (opcode == Op_RotateLeftV) { 1240 if (etype == T_INT) { 1241 evprold(dst, src, shift, vector_len); 1242 } else { 1243 assert(etype == T_LONG, "expected type T_LONG"); 1244 evprolq(dst, src, shift, vector_len); 1245 } 1246 } else { 1247 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1248 if (etype == T_INT) { 1249 evprord(dst, src, shift, vector_len); 1250 } else { 1251 assert(etype == T_LONG, "expected type T_LONG"); 1252 evprorq(dst, src, shift, vector_len); 1253 } 1254 } 1255 } 1256 1257 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1258 XMMRegister shift, int vector_len) { 1259 if (opcode == Op_RotateLeftV) { 1260 if (etype == T_INT) { 1261 evprolvd(dst, src, shift, vector_len); 1262 } else { 1263 assert(etype == 
T_LONG, "expected type T_LONG"); 1264 evprolvq(dst, src, shift, vector_len); 1265 } 1266 } else { 1267 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1268 if (etype == T_INT) { 1269 evprorvd(dst, src, shift, vector_len); 1270 } else { 1271 assert(etype == T_LONG, "expected type T_LONG"); 1272 evprorvq(dst, src, shift, vector_len); 1273 } 1274 } 1275 } 1276 1277 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1278 if (opcode == Op_RShiftVI) { 1279 psrad(dst, shift); 1280 } else if (opcode == Op_LShiftVI) { 1281 pslld(dst, shift); 1282 } else { 1283 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1284 psrld(dst, shift); 1285 } 1286 } 1287 1288 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1289 switch (opcode) { 1290 case Op_RShiftVI: psrad(dst, shift); break; 1291 case Op_LShiftVI: pslld(dst, shift); break; 1292 case Op_URShiftVI: psrld(dst, shift); break; 1293 1294 default: assert(false, "%s", NodeClassNames[opcode]); 1295 } 1296 } 1297 1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1299 if (opcode == Op_RShiftVI) { 1300 vpsrad(dst, nds, shift, vector_len); 1301 } else if (opcode == Op_LShiftVI) { 1302 vpslld(dst, nds, shift, vector_len); 1303 } else { 1304 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1305 vpsrld(dst, nds, shift, vector_len); 1306 } 1307 } 1308 1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1310 switch (opcode) { 1311 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1312 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1313 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1314 1315 default: assert(false, "%s", NodeClassNames[opcode]); 1316 } 1317 } 1318 1319 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1320 switch (opcode) { 1321 case Op_RShiftVB: // fall-through 1322 case Op_RShiftVS: psraw(dst, shift); break; 1323 1324 case Op_LShiftVB: // fall-through 1325 case Op_LShiftVS: psllw(dst, shift); break; 1326 1327 case Op_URShiftVS: // fall-through 1328 case Op_URShiftVB: psrlw(dst, shift); break; 1329 1330 default: assert(false, "%s", NodeClassNames[opcode]); 1331 } 1332 } 1333 1334 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1335 switch (opcode) { 1336 case Op_RShiftVB: // fall-through 1337 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1338 1339 case Op_LShiftVB: // fall-through 1340 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1341 1342 case Op_URShiftVS: // fall-through 1343 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1344 1345 default: assert(false, "%s", NodeClassNames[opcode]); 1346 } 1347 } 1348 1349 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1350 switch (opcode) { 1351 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1352 case Op_LShiftVL: psllq(dst, shift); break; 1353 case Op_URShiftVL: psrlq(dst, shift); break; 1354 1355 default: assert(false, "%s", NodeClassNames[opcode]); 1356 } 1357 } 1358 1359 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1360 if (opcode == Op_RShiftVL) { 1361 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1362 } else if (opcode == Op_LShiftVL) { 1363 
psllq(dst, shift); 1364 } else { 1365 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1366 psrlq(dst, shift); 1367 } 1368 } 1369 1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1371 switch (opcode) { 1372 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1373 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1374 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1375 1376 default: assert(false, "%s", NodeClassNames[opcode]); 1377 } 1378 } 1379 1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1381 if (opcode == Op_RShiftVL) { 1382 evpsraq(dst, nds, shift, vector_len); 1383 } else if (opcode == Op_LShiftVL) { 1384 vpsllq(dst, nds, shift, vector_len); 1385 } else { 1386 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1387 vpsrlq(dst, nds, shift, vector_len); 1388 } 1389 } 1390 1391 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1392 switch (opcode) { 1393 case Op_RShiftVB: // fall-through 1394 case Op_RShiftVS: // fall-through 1395 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1396 1397 case Op_LShiftVB: // fall-through 1398 case Op_LShiftVS: // fall-through 1399 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1400 1401 case Op_URShiftVB: // fall-through 1402 case Op_URShiftVS: // fall-through 1403 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1404 1405 default: assert(false, "%s", NodeClassNames[opcode]); 1406 } 1407 } 1408 1409 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1410 switch (opcode) { 1411 case Op_RShiftVB: // fall-through 1412 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1413 1414 case Op_LShiftVB: // fall-through 1415 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1416 1417 case Op_URShiftVB: // fall-through 1418 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1419 1420 default: assert(false, "%s", NodeClassNames[opcode]); 1421 } 1422 } 1423 1424 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1425 assert(UseAVX >= 2, "required"); 1426 switch (opcode) { 1427 case Op_RShiftVL: { 1428 if (UseAVX > 2) { 1429 assert(tmp == xnoreg, "not used"); 1430 if (!VM_Version::supports_avx512vl()) { 1431 vlen_enc = Assembler::AVX_512bit; 1432 } 1433 evpsravq(dst, src, shift, vlen_enc); 1434 } else { 1435 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1436 vpsrlvq(dst, src, shift, vlen_enc); 1437 vpsrlvq(tmp, tmp, shift, vlen_enc); 1438 vpxor(dst, dst, tmp, vlen_enc); 1439 vpsubq(dst, dst, tmp, vlen_enc); 1440 } 1441 break; 1442 } 1443 case Op_LShiftVL: { 1444 assert(tmp == xnoreg, "not used"); 1445 vpsllvq(dst, src, shift, vlen_enc); 1446 break; 1447 } 1448 case Op_URShiftVL: { 1449 assert(tmp == xnoreg, "not used"); 1450 vpsrlvq(dst, src, shift, vlen_enc); 1451 break; 1452 } 1453 default: assert(false, "%s", NodeClassNames[opcode]); 1454 } 1455 } 1456 1457 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1458 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1459 assert(opcode == Op_LShiftVB || 1460 opcode == Op_RShiftVB || 1461 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1462 bool sign = (opcode != Op_URShiftVB); 1463 assert(vector_len == 0, "required"); 1464 vextendbd(sign, dst, src, 1); 1465 vpmovzxbd(vtmp, shift, 1); 1466 varshiftd(opcode, dst, dst, vtmp, 1); 1467 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1468 vextracti128_high(vtmp, dst); 1469 vpackusdw(dst, dst, vtmp, 0); 1470 } 1471 1472 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1473 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1474 assert(opcode == Op_LShiftVB || 1475 opcode == Op_RShiftVB || 1476 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1477 bool sign = (opcode != Op_URShiftVB); 1478 int ext_vector_len = vector_len + 1; 1479 vextendbw(sign, dst, src, ext_vector_len); 1480 vpmovzxbw(vtmp, shift, ext_vector_len); 1481 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1482 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1483 if (vector_len == 0) { 1484 vextracti128_high(vtmp, dst); 1485 vpackuswb(dst, dst, vtmp, vector_len); 1486 } else { 1487 vextracti64x4_high(vtmp, dst); 1488 vpackuswb(dst, dst, vtmp, vector_len); 1489 vpermq(dst, dst, 0xD8, vector_len); 1490 } 1491 } 1492 1493 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1494 switch(typ) { 1495 case T_BYTE: 1496 pinsrb(dst, val, idx); 1497 break; 1498 case T_SHORT: 1499 pinsrw(dst, val, idx); 1500 break; 1501 case T_INT: 1502 pinsrd(dst, val, idx); 1503 break; 1504 case T_LONG: 1505 pinsrq(dst, val, idx); 1506 break; 1507 default: 1508 assert(false,"Should not reach here."); 1509 break; 1510 } 1511 } 1512 1513 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1514 switch(typ) { 1515 case T_BYTE: 1516 vpinsrb(dst, src, val, idx); 1517 break; 1518 case T_SHORT: 1519 vpinsrw(dst, src, val, idx); 1520 break; 1521 case T_INT: 1522 vpinsrd(dst, src, val, idx); 1523 break; 1524 case T_LONG: 1525 vpinsrq(dst, src, val, idx); 1526 break; 1527 default: 1528 assert(false,"Should not reach here."); 1529 break; 1530 } 1531 } 1532 1533 #ifdef _LP64 1534 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1535 XMMRegister dst, Register base, 1536 Register idx_base, 1537 Register offset, Register mask, 1538 Register mask_idx, Register rtmp, 1539 int vlen_enc) { 1540 vpxor(dst, dst, dst, vlen_enc); 1541 if (elem_bt == T_SHORT) { 1542 for (int i = 0; i < 4; i++) { 1543 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1544 Label skip_load; 1545 btq(mask, mask_idx); 1546 jccb(Assembler::carryClear, skip_load); 1547 movl(rtmp, Address(idx_base, i * 4)); 1548 if (offset != noreg) { 1549 addl(rtmp, offset); 1550 } 1551 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1552 bind(skip_load); 1553 incq(mask_idx); 1554 } 1555 } else { 1556 assert(elem_bt == T_BYTE, ""); 1557 for (int i = 0; i < 8; i++) { 1558 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1559 Label skip_load; 1560 btq(mask, mask_idx); 1561 jccb(Assembler::carryClear, skip_load); 1562 movl(rtmp, Address(idx_base, i * 4)); 1563 if (offset != noreg) { 1564 addl(rtmp, offset); 1565 } 1566 pinsrb(dst, Address(base, rtmp), i); 1567 bind(skip_load); 1568 incq(mask_idx); 1569 } 1570 } 1571 } 1572 #endif // _LP64 1573 1574 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1575 Register base, Register idx_base, 1576 Register offset, Register rtmp, 1577 int vlen_enc) { 1578 vpxor(dst, dst, dst, vlen_enc); 1579 if (elem_bt == T_SHORT) { 1580 for (int i = 0; i < 4; i++) { 1581 // dst[i] = src[offset + idx_base[i]] 1582 movl(rtmp, Address(idx_base, i * 4)); 1583 if (offset != noreg) { 1584 addl(rtmp, offset); 1585 } 1586 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1587 } 1588 } else { 1589 assert(elem_bt == T_BYTE, ""); 1590 for (int i = 0; i < 8; i++) { 1591 // dst[i] = src[offset + idx_base[i]] 1592 movl(rtmp, Address(idx_base, i * 4)); 1593 if (offset != noreg) { 1594 addl(rtmp, offset); 1595 } 1596 pinsrb(dst, Address(base, rtmp), i); 1597 } 1598 } 1599 } 1600 1601 /* 1602 * Gather using hybrid algorithm, first partially unroll scalar loop 1603 * to accumulate values from gather indices into a quad-word(64bit) slice. 1604 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1605 * permutation to place the slice into appropriate vector lane 1606 * locations in destination vector. Following pseudo code describes the 1607 * algorithm in detail: 1608 * 1609 * DST_VEC = ZERO_VEC 1610 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1611 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1612 * FOREACH_ITER: 1613 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1614 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1615 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1616 * PERM_INDEX = PERM_INDEX - TWO_VEC 1617 * 1618 * With each iteration, doubleword permute indices (0,1) corresponding 1619 * to gathered quadword gets right shifted by two lane positions. 1620 * 1621 */ 1622 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1623 Register base, Register idx_base, 1624 Register offset, Register mask, 1625 XMMRegister xtmp1, XMMRegister xtmp2, 1626 XMMRegister temp_dst, Register rtmp, 1627 Register mask_idx, Register length, 1628 int vector_len, int vlen_enc) { 1629 Label GATHER8_LOOP; 1630 assert(is_subword_type(elem_ty), ""); 1631 movl(length, vector_len); 1632 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1633 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1634 vallones(xtmp2, vlen_enc); 1635 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1636 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1637 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1638 1639 bind(GATHER8_LOOP); 1640 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1641 if (mask == noreg) { 1642 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1643 } else { 1644 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1645 } 1646 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1647 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1648 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1649 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1650 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1651 vpor(dst, dst, temp_dst, vlen_enc); 1652 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1653 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1654 jcc(Assembler::notEqual, GATHER8_LOOP); 1655 } 1656 1657 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1658 switch(typ) { 1659 case T_INT: 1660 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1661 break; 1662 case T_FLOAT: 1663 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1664 break; 1665 case T_LONG: 1666 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1667 break; 1668 case T_DOUBLE: 1669 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1670 break; 1671 default: 1672 assert(false,"Should not reach here."); 1673 break; 1674 } 1675 } 1676 1677 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1678 switch(typ) { 1679 case T_INT: 1680 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1681 break; 1682 case T_FLOAT: 1683 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1684 break; 1685 case T_LONG: 1686 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1687 break; 1688 case T_DOUBLE: 1689 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1690 break; 1691 default: 1692 assert(false,"Should not reach here."); 1693 break; 1694 } 1695 } 1696 1697 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1698 switch(typ) { 1699 case T_INT: 1700 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1701 break; 1702 case T_FLOAT: 1703 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1704 break; 1705 case T_LONG: 1706 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1707 break; 1708 case T_DOUBLE: 1709 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1710 break; 1711 default: 1712 assert(false,"Should not reach here."); 1713 break; 1714 } 1715 } 1716 1717 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1718 if (vlen_in_bytes <= 16) { 1719 pxor (dst, dst); 1720 psubb(dst, src); 1721 switch (elem_bt) { 1722 case T_BYTE: /* nothing to do */ break; 1723 case T_SHORT: pmovsxbw(dst, dst); break; 1724 case T_INT: pmovsxbd(dst, dst); break; 1725 case T_FLOAT: pmovsxbd(dst, dst); break; 1726 case T_LONG: pmovsxbq(dst, dst); break; 1727 case T_DOUBLE: pmovsxbq(dst, dst); break; 1728 1729 default: assert(false, "%s", type2name(elem_bt)); 1730 } 1731 } else { 1732 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1733 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1734 1735 vpxor (dst, dst, dst, vlen_enc); 1736 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1737 1738 switch (elem_bt) { 1739 case T_BYTE: /* nothing to do */ break; 1740 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1741 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1742 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1743 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1744 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1745 1746 default: assert(false, "%s", type2name(elem_bt)); 1747 } 1748 } 1749 } 1750 1751 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1752 if (novlbwdq) { 1753 vpmovsxbd(xtmp, src, vlen_enc); 1754 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1755 Assembler::eq, true, vlen_enc, noreg); 1756 } else { 1757 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1758 vpsubb(xtmp, xtmp, src, vlen_enc); 1759 evpmovb2m(dst, xtmp, vlen_enc); 1760 } 1761 } 1762 1763 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1764 switch (vlen_in_bytes) { 1765 case 4: movdl(dst, src); break; 1766 case 8: movq(dst, src); break; 1767 case 16: movdqu(dst, src); break; 1768 case 32: vmovdqu(dst, src); break; 1769 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1770 default: ShouldNotReachHere(); 1771 } 1772 } 1773 1774 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1775 assert(rscratch != noreg || always_reachable(src), "missing"); 1776 1777 if (reachable(src)) { 1778 load_vector(dst, as_Address(src), vlen_in_bytes); 1779 } else { 1780 lea(rscratch, src); 1781 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1782 } 1783 } 1784 1785 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1786 int vlen_enc = vector_length_encoding(vlen); 1787 if (VM_Version::supports_avx()) { 1788 if (bt == T_LONG) { 1789 if (VM_Version::supports_avx2()) { 1790 vpbroadcastq(dst, src, vlen_enc); 1791 } else { 1792 vmovddup(dst, src, vlen_enc); 1793 } 1794 } else if (bt == T_DOUBLE) { 1795 if (vlen_enc != Assembler::AVX_128bit) { 1796 vbroadcastsd(dst, src, vlen_enc, noreg); 1797 } else { 1798 vmovddup(dst, src, vlen_enc); 1799 } 1800 } else { 1801 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1802 vpbroadcastd(dst, src, vlen_enc); 1803 } else { 1804 vbroadcastss(dst, src, vlen_enc); 1805 } 1806 } 1807 } else if (VM_Version::supports_sse3()) { 1808 movddup(dst, src); 1809 } else { 1810 movq(dst, src); 1811 if (vlen == 16) { 1812 punpcklqdq(dst, dst); 1813 } 1814 } 1815 } 1816 1817 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1818 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1819 int offset = exact_log2(type2aelembytes(bt)) << 6; 1820 if (is_floating_point_type(bt)) { 1821 offset += 128; 1822 } 1823 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1824 load_vector(dst, addr, vlen_in_bytes); 1825 } 1826 1827 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
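//
// All of the reduce* helpers below follow a common folding scheme: the upper
// half of the vector is repeatedly extracted and combined with the lower half
// (via reduce_operation_128/256, or horizontal adds for AddReductionVI) until
// a single element remains in the low lane. The integer variants then fold in
// the scalar input src1 and move the result to a GPR; the FP variants
// accumulate into dst with scalar adds/muls.
//
// Illustrative sketch (not emitted verbatim) of an AddReductionVI over eight
// ints, i.e. reduce8I with Op_AddReductionVI:
//
//   vphaddd(vtmp1, src2, src2, AVX_256bit);   // pairwise adds within lanes
//   vextracti128_high(vtmp2, vtmp1);          // fold the upper 128 bits ...
//   vpaddd(vtmp1, vtmp1, vtmp2, AVX_128bit);  // ... into the lower 128 bits
//   // reduce2I finishes: one more horizontal add, then add the scalar src1.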
1828 1829 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1830 int vector_len = Assembler::AVX_128bit; 1831 1832 switch (opcode) { 1833 case Op_AndReductionV: pand(dst, src); break; 1834 case Op_OrReductionV: por (dst, src); break; 1835 case Op_XorReductionV: pxor(dst, src); break; 1836 case Op_MinReductionV: 1837 switch (typ) { 1838 case T_BYTE: pminsb(dst, src); break; 1839 case T_SHORT: pminsw(dst, src); break; 1840 case T_INT: pminsd(dst, src); break; 1841 case T_LONG: assert(UseAVX > 2, "required"); 1842 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1843 default: assert(false, "wrong type"); 1844 } 1845 break; 1846 case Op_MaxReductionV: 1847 switch (typ) { 1848 case T_BYTE: pmaxsb(dst, src); break; 1849 case T_SHORT: pmaxsw(dst, src); break; 1850 case T_INT: pmaxsd(dst, src); break; 1851 case T_LONG: assert(UseAVX > 2, "required"); 1852 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1853 default: assert(false, "wrong type"); 1854 } 1855 break; 1856 case Op_AddReductionVF: addss(dst, src); break; 1857 case Op_AddReductionVD: addsd(dst, src); break; 1858 case Op_AddReductionVI: 1859 switch (typ) { 1860 case T_BYTE: paddb(dst, src); break; 1861 case T_SHORT: paddw(dst, src); break; 1862 case T_INT: paddd(dst, src); break; 1863 default: assert(false, "wrong type"); 1864 } 1865 break; 1866 case Op_AddReductionVL: paddq(dst, src); break; 1867 case Op_MulReductionVF: mulss(dst, src); break; 1868 case Op_MulReductionVD: mulsd(dst, src); break; 1869 case Op_MulReductionVI: 1870 switch (typ) { 1871 case T_SHORT: pmullw(dst, src); break; 1872 case T_INT: pmulld(dst, src); break; 1873 default: assert(false, "wrong type"); 1874 } 1875 break; 1876 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1877 evpmullq(dst, dst, src, vector_len); break; 1878 default: assert(false, "wrong opcode"); 1879 } 1880 } 1881 1882 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1883 switch (opcode) { 1884 case Op_AddReductionVF: addps(dst, src); break; 1885 case Op_AddReductionVD: addpd(dst, src); break; 1886 case Op_MulReductionVF: mulps(dst, src); break; 1887 case Op_MulReductionVD: mulpd(dst, src); break; 1888 default: assert(false, "%s", NodeClassNames[opcode]); 1889 } 1890 } 1891 1892 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1893 int vector_len = Assembler::AVX_256bit; 1894 1895 switch (opcode) { 1896 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1897 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1898 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1899 case Op_MinReductionV: 1900 switch (typ) { 1901 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1902 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1903 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1904 case T_LONG: assert(UseAVX > 2, "required"); 1905 vpminsq(dst, src1, src2, vector_len); break; 1906 default: assert(false, "wrong type"); 1907 } 1908 break; 1909 case Op_MaxReductionV: 1910 switch (typ) { 1911 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1912 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1913 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1914 case T_LONG: assert(UseAVX > 2, "required"); 1915 vpmaxsq(dst, src1, src2, vector_len); break; 1916 default: assert(false, "wrong type"); 1917 } 
1918 break; 1919 case Op_AddReductionVI: 1920 switch (typ) { 1921 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1922 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1923 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1924 default: assert(false, "wrong type"); 1925 } 1926 break; 1927 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1928 case Op_MulReductionVI: 1929 switch (typ) { 1930 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1931 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1932 default: assert(false, "wrong type"); 1933 } 1934 break; 1935 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1936 default: assert(false, "wrong opcode"); 1937 } 1938 } 1939 1940 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1941 int vector_len = Assembler::AVX_256bit; 1942 1943 switch (opcode) { 1944 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1945 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1946 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1947 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1948 default: assert(false, "%s", NodeClassNames[opcode]); 1949 } 1950 } 1951 1952 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1953 XMMRegister dst, XMMRegister src, 1954 XMMRegister vtmp1, XMMRegister vtmp2) { 1955 switch (opcode) { 1956 case Op_AddReductionVF: 1957 case Op_MulReductionVF: 1958 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1959 break; 1960 1961 case Op_AddReductionVD: 1962 case Op_MulReductionVD: 1963 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1964 break; 1965 1966 default: assert(false, "wrong opcode"); 1967 } 1968 } 1969 1970 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1971 XMMRegister dst, XMMRegister src, 1972 XMMRegister vtmp1, XMMRegister vtmp2) { 1973 switch (opcode) { 1974 case Op_AddReductionVF: 1975 case Op_MulReductionVF: 1976 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1977 break; 1978 1979 case Op_AddReductionVD: 1980 case Op_MulReductionVD: 1981 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1982 break; 1983 1984 default: assert(false, "%s", NodeClassNames[opcode]); 1985 } 1986 } 1987 1988 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1989 Register dst, Register src1, XMMRegister src2, 1990 XMMRegister vtmp1, XMMRegister vtmp2) { 1991 switch (vlen) { 1992 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1993 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1994 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1995 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1996 1997 default: assert(false, "wrong vector length"); 1998 } 1999 } 2000 2001 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2002 Register dst, Register src1, XMMRegister src2, 2003 XMMRegister vtmp1, XMMRegister vtmp2) { 2004 switch (vlen) { 2005 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2006 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2007 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2008 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2009 2010 default: assert(false, "wrong vector length"); 2011 } 2012 } 2013 2014 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2015 Register dst, Register src1, XMMRegister src2, 
2016 XMMRegister vtmp1, XMMRegister vtmp2) { 2017 switch (vlen) { 2018 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2019 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2020 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2021 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2022 2023 default: assert(false, "wrong vector length"); 2024 } 2025 } 2026 2027 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2028 Register dst, Register src1, XMMRegister src2, 2029 XMMRegister vtmp1, XMMRegister vtmp2) { 2030 switch (vlen) { 2031 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2032 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2033 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2034 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2035 2036 default: assert(false, "wrong vector length"); 2037 } 2038 } 2039 2040 #ifdef _LP64 2041 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2042 Register dst, Register src1, XMMRegister src2, 2043 XMMRegister vtmp1, XMMRegister vtmp2) { 2044 switch (vlen) { 2045 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2046 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2047 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2048 2049 default: assert(false, "wrong vector length"); 2050 } 2051 } 2052 #endif // _LP64 2053 2054 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2055 switch (vlen) { 2056 case 2: 2057 assert(vtmp2 == xnoreg, ""); 2058 reduce2F(opcode, dst, src, vtmp1); 2059 break; 2060 case 4: 2061 assert(vtmp2 == xnoreg, ""); 2062 reduce4F(opcode, dst, src, vtmp1); 2063 break; 2064 case 8: 2065 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2066 break; 2067 case 16: 2068 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2069 break; 2070 default: assert(false, "wrong vector length"); 2071 } 2072 } 2073 2074 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2075 switch (vlen) { 2076 case 2: 2077 assert(vtmp2 == xnoreg, ""); 2078 reduce2D(opcode, dst, src, vtmp1); 2079 break; 2080 case 4: 2081 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2082 break; 2083 case 8: 2084 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2085 break; 2086 default: assert(false, "wrong vector length"); 2087 } 2088 } 2089 2090 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2091 switch (vlen) { 2092 case 2: 2093 assert(vtmp1 == xnoreg, ""); 2094 assert(vtmp2 == xnoreg, ""); 2095 unorderedReduce2F(opcode, dst, src); 2096 break; 2097 case 4: 2098 assert(vtmp2 == xnoreg, ""); 2099 unorderedReduce4F(opcode, dst, src, vtmp1); 2100 break; 2101 case 8: 2102 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2103 break; 2104 case 16: 2105 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2106 break; 2107 default: assert(false, "wrong vector length"); 2108 } 2109 } 2110 2111 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2112 switch (vlen) { 2113 case 2: 2114 assert(vtmp1 == xnoreg, ""); 2115 assert(vtmp2 == xnoreg, ""); 2116 unorderedReduce2D(opcode, dst, src); 2117 break; 2118 case 4: 2119 assert(vtmp2 == xnoreg, ""); 2120 unorderedReduce4D(opcode, dst, src, vtmp1); 2121 break; 2122 case 8: 
2123 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2124 break; 2125 default: assert(false, "wrong vector length"); 2126 } 2127 } 2128 2129 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2130 if (opcode == Op_AddReductionVI) { 2131 if (vtmp1 != src2) { 2132 movdqu(vtmp1, src2); 2133 } 2134 phaddd(vtmp1, vtmp1); 2135 } else { 2136 pshufd(vtmp1, src2, 0x1); 2137 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2138 } 2139 movdl(vtmp2, src1); 2140 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2141 movdl(dst, vtmp1); 2142 } 2143 2144 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2145 if (opcode == Op_AddReductionVI) { 2146 if (vtmp1 != src2) { 2147 movdqu(vtmp1, src2); 2148 } 2149 phaddd(vtmp1, src2); 2150 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2151 } else { 2152 pshufd(vtmp2, src2, 0xE); 2153 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2154 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2155 } 2156 } 2157 2158 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2159 if (opcode == Op_AddReductionVI) { 2160 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2161 vextracti128_high(vtmp2, vtmp1); 2162 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2163 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2164 } else { 2165 vextracti128_high(vtmp1, src2); 2166 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2167 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2168 } 2169 } 2170 2171 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2172 vextracti64x4_high(vtmp2, src2); 2173 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2174 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2175 } 2176 2177 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2178 pshufd(vtmp2, src2, 0x1); 2179 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2180 movdqu(vtmp1, vtmp2); 2181 psrldq(vtmp1, 2); 2182 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2183 movdqu(vtmp2, vtmp1); 2184 psrldq(vtmp2, 1); 2185 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2186 movdl(vtmp2, src1); 2187 pmovsxbd(vtmp1, vtmp1); 2188 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2189 pextrb(dst, vtmp1, 0x0); 2190 movsbl(dst, dst); 2191 } 2192 2193 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2194 pshufd(vtmp1, src2, 0xE); 2195 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2196 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2197 } 2198 2199 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2200 vextracti128_high(vtmp2, src2); 2201 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2202 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2203 } 2204 2205 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2206 vextracti64x4_high(vtmp1, src2); 2207 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2208 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2209 } 2210 2211 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2212 pmovsxbw(vtmp2, src2); 2213 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2214 } 2215 2216 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2217 if (UseAVX > 1) { 2218 int vector_len = Assembler::AVX_256bit; 2219 vpmovsxbw(vtmp1, src2, vector_len); 2220 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2221 } else { 2222 pmovsxbw(vtmp2, src2); 2223 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2224 pshufd(vtmp2, src2, 0x1); 2225 pmovsxbw(vtmp2, src2); 2226 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2227 } 2228 } 2229 2230 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2231 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2232 int vector_len = Assembler::AVX_512bit; 2233 vpmovsxbw(vtmp1, src2, vector_len); 2234 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2235 } else { 2236 assert(UseAVX >= 2,"Should not reach here."); 2237 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2238 vextracti128_high(vtmp2, src2); 2239 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2240 } 2241 } 2242 2243 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2245 vextracti64x4_high(vtmp2, src2); 2246 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2247 } 2248 2249 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2250 if (opcode == Op_AddReductionVI) { 2251 if (vtmp1 != src2) { 2252 movdqu(vtmp1, src2); 2253 } 2254 phaddw(vtmp1, vtmp1); 2255 phaddw(vtmp1, vtmp1); 2256 } else { 2257 pshufd(vtmp2, src2, 0x1); 2258 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2259 movdqu(vtmp1, vtmp2); 2260 psrldq(vtmp1, 2); 2261 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2262 } 2263 movdl(vtmp2, src1); 2264 pmovsxwd(vtmp1, vtmp1); 2265 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2266 pextrw(dst, vtmp1, 0x0); 2267 movswl(dst, dst); 2268 } 2269 2270 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2271 if (opcode == Op_AddReductionVI) { 2272 if (vtmp1 != src2) { 2273 movdqu(vtmp1, src2); 2274 } 2275 phaddw(vtmp1, src2); 2276 } else { 2277 pshufd(vtmp1, src2, 0xE); 2278 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2279 } 2280 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2281 } 2282 2283 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2284 if (opcode == Op_AddReductionVI) { 2285 int vector_len = Assembler::AVX_256bit; 2286 vphaddw(vtmp2, src2, src2, vector_len); 2287 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2288 } else { 2289 vextracti128_high(vtmp2, src2); 2290 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2291 } 2292 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2293 } 2294 2295 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2296 int vector_len = Assembler::AVX_256bit; 2297 vextracti64x4_high(vtmp1, src2); 2298 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2299 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2300 } 2301 2302 #ifdef _LP64 2303 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2304 pshufd(vtmp2, src2, 0xE); 2305 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2306 movdq(vtmp1, src1); 2307 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2308 movdq(dst, vtmp1); 2309 } 2310 2311 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2312 vextracti128_high(vtmp1, src2); 2313 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2314 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2315 } 2316 2317 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2318 vextracti64x4_high(vtmp2, src2); 2319 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2320 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2321 } 2322 2323 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2324 mov64(temp, -1L); 2325 bzhiq(temp, temp, len); 2326 kmovql(dst, temp); 2327 } 2328 #endif // _LP64 2329 2330 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2331 reduce_operation_128(T_FLOAT, opcode, dst, src); 2332 pshufd(vtmp, src, 0x1); 2333 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2334 } 2335 2336 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2337 reduce2F(opcode, dst, src, vtmp); 2338 pshufd(vtmp, src, 0x2); 2339 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2340 pshufd(vtmp, src, 0x3); 2341 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2342 } 2343 2344 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2345 reduce4F(opcode, dst, src, vtmp2); 2346 vextractf128_high(vtmp2, src); 2347 reduce4F(opcode, dst, vtmp2, vtmp1); 2348 } 2349 2350 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2351 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2352 vextracti64x4_high(vtmp1, src); 2353 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2354 } 2355 2356 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2357 pshufd(dst, src, 0x1); 2358 reduce_operation_128(T_FLOAT, opcode, dst, src); 2359 } 2360 2361 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2362 pshufd(vtmp, src, 0xE); 2363 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2364 unorderedReduce2F(opcode, dst, vtmp); 2365 } 2366 2367 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2368 vextractf128_high(vtmp1, src); 2369 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2370 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2371 } 2372 2373 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2374 vextractf64x4_high(vtmp2, src); 2375 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2376 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2377 } 2378 2379 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2380 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2381 pshufd(vtmp, src, 0xE); 2382 
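  // imm 0xE selects dwords {2,3} into the low qword of vtmp, i.e. it moves
  // src's upper double down so the reduction op below can fold it into dst.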
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2383 } 2384 2385 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2386 reduce2D(opcode, dst, src, vtmp2); 2387 vextractf128_high(vtmp2, src); 2388 reduce2D(opcode, dst, vtmp2, vtmp1); 2389 } 2390 2391 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2392 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2393 vextracti64x4_high(vtmp1, src); 2394 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2395 } 2396 2397 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2398 pshufd(dst, src, 0xE); 2399 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2400 } 2401 2402 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2403 vextractf128_high(vtmp, src); 2404 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2405 unorderedReduce2D(opcode, dst, vtmp); 2406 } 2407 2408 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2409 vextractf64x4_high(vtmp2, src); 2410 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2411 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2412 } 2413 2414 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2415 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2416 } 2417 2418 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2419 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2420 } 2421 2422 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2423 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2424 } 2425 2426 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2427 int vec_enc) { 2428 switch(elem_bt) { 2429 case T_INT: 2430 case T_FLOAT: 2431 vmaskmovps(dst, src, mask, vec_enc); 2432 break; 2433 case T_LONG: 2434 case T_DOUBLE: 2435 vmaskmovpd(dst, src, mask, vec_enc); 2436 break; 2437 default: 2438 fatal("Unsupported type %s", type2name(elem_bt)); 2439 break; 2440 } 2441 } 2442 2443 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2444 int vec_enc) { 2445 switch(elem_bt) { 2446 case T_INT: 2447 case T_FLOAT: 2448 vmaskmovps(dst, src, mask, vec_enc); 2449 break; 2450 case T_LONG: 2451 case T_DOUBLE: 2452 vmaskmovpd(dst, src, mask, vec_enc); 2453 break; 2454 default: 2455 fatal("Unsupported type %s", type2name(elem_bt)); 2456 break; 2457 } 2458 } 2459 2460 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2461 XMMRegister dst, XMMRegister src, 2462 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2463 XMMRegister xmm_0, XMMRegister xmm_1) { 2464 const int permconst[] = {1, 14}; 2465 XMMRegister wsrc = src; 2466 XMMRegister wdst = xmm_0; 2467 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2468 2469 int vlen_enc = Assembler::AVX_128bit; 2470 if (vlen == 16) { 2471 vlen_enc = Assembler::AVX_256bit; 2472 } 2473 2474 for (int i = log2(vlen) - 1; i >=0; i--) { 2475 if (i == 0 && !is_dst_valid) { 2476 wdst = dst; 2477 } 2478 if (i == 3) { 2479 vextracti64x4_high(wtmp, wsrc); 2480 } else if (i == 2) { 2481 vextracti128_high(wtmp, wsrc); 2482 } else { // i = [0,1] 2483 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2484 } 2485 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2486 wsrc = wdst; 2487 vlen_enc = Assembler::AVX_128bit; 2488 } 2489 if (is_dst_valid) { 2490 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2491 } 2492 } 2493 2494 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2495 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2496 XMMRegister xmm_0, XMMRegister xmm_1) { 2497 XMMRegister wsrc = src; 2498 XMMRegister wdst = xmm_0; 2499 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2500 int vlen_enc = Assembler::AVX_128bit; 2501 if (vlen == 8) { 2502 vlen_enc = Assembler::AVX_256bit; 2503 } 2504 for (int i = log2(vlen) - 1; i >=0; i--) { 2505 if (i == 0 && !is_dst_valid) { 2506 wdst = dst; 2507 } 2508 if (i == 1) { 2509 vextracti128_high(wtmp, wsrc); 2510 } else if (i == 2) { 2511 vextracti64x4_high(wtmp, wsrc); 2512 } else { 2513 assert(i == 0, "%d", i); 2514 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2515 } 2516 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2517 wsrc = wdst; 2518 vlen_enc = Assembler::AVX_128bit; 2519 } 2520 if (is_dst_valid) { 2521 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2522 } 2523 } 2524 2525 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2526 switch (bt) { 2527 case T_BYTE: pextrb(dst, src, idx); break; 2528 case T_SHORT: pextrw(dst, src, idx); break; 2529 case T_INT: pextrd(dst, src, idx); break; 2530 case T_LONG: pextrq(dst, src, idx); break; 2531 2532 default: 2533 assert(false,"Should not reach here."); 2534 break; 2535 } 2536 } 2537 2538 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2539 int esize = type2aelembytes(typ); 2540 int elem_per_lane = 16/esize; 2541 int lane = elemindex / elem_per_lane; 2542 int eindex = elemindex % elem_per_lane; 2543 2544 if (lane >= 2) { 2545 assert(UseAVX > 2, "required"); 2546 vextractf32x4(dst, src, lane & 3); 2547 return dst; 2548 } else if (lane > 0) { 2549 assert(UseAVX > 0, "required"); 2550 vextractf128(dst, src, lane); 2551 return dst; 2552 } else { 2553 return src; 2554 } 2555 } 2556 2557 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2558 if (typ == T_BYTE) { 2559 movsbl(dst, dst); 2560 } else if (typ == T_SHORT) { 2561 movswl(dst, dst); 2562 } 2563 } 2564 2565 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2566 int esize = type2aelembytes(typ); 2567 int elem_per_lane = 16/esize; 2568 int eindex = elemindex % elem_per_lane; 2569 assert(is_integral_type(typ),"required"); 2570 2571 if (eindex == 0) { 2572 if (typ == T_LONG) { 2573 movq(dst, src); 2574 } else { 2575 movdl(dst, src); 2576 movsxl(typ, dst); 2577 } 2578 } else { 2579 extract(typ, dst, src, eindex); 2580 movsxl(typ, dst); 2581 } 2582 } 2583 2584 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
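  // FP variant of get_elem: bring the FLOAT/DOUBLE element at 'elemindex'
  // (an index within a 128-bit lane) into the low part of dst and clear the
  // bits above it, so dst can be consumed as a scalar. vtmp is only used on
  // the pre-AVX (UseAVX == 0) float path, to hold the masking constant.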
2585 int esize = type2aelembytes(typ); 2586 int elem_per_lane = 16/esize; 2587 int eindex = elemindex % elem_per_lane; 2588 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2589 2590 if (eindex == 0) { 2591 movq(dst, src); 2592 } else { 2593 if (typ == T_FLOAT) { 2594 if (UseAVX == 0) { 2595 movdqu(dst, src); 2596 shufps(dst, dst, eindex); 2597 } else { 2598 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2599 } 2600 } else { 2601 if (UseAVX == 0) { 2602 movdqu(dst, src); 2603 psrldq(dst, eindex*esize); 2604 } else { 2605 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2606 } 2607 movq(dst, dst); 2608 } 2609 } 2610 // Zero upper bits 2611 if (typ == T_FLOAT) { 2612 if (UseAVX == 0) { 2613 assert(vtmp != xnoreg, "required."); 2614 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2615 pand(dst, vtmp); 2616 } else { 2617 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2618 } 2619 } 2620 } 2621 2622 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2623 switch(typ) { 2624 case T_BYTE: 2625 case T_BOOLEAN: 2626 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2627 break; 2628 case T_SHORT: 2629 case T_CHAR: 2630 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2631 break; 2632 case T_INT: 2633 case T_FLOAT: 2634 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2635 break; 2636 case T_LONG: 2637 case T_DOUBLE: 2638 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2639 break; 2640 default: 2641 assert(false,"Should not reach here."); 2642 break; 2643 } 2644 } 2645 2646 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2647 assert(rscratch != noreg || always_reachable(src2), "missing"); 2648 2649 switch(typ) { 2650 case T_BOOLEAN: 2651 case T_BYTE: 2652 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2653 break; 2654 case T_CHAR: 2655 case T_SHORT: 2656 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2657 break; 2658 case T_INT: 2659 case T_FLOAT: 2660 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2661 break; 2662 case T_LONG: 2663 case T_DOUBLE: 2664 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2665 break; 2666 default: 2667 assert(false,"Should not reach here."); 2668 break; 2669 } 2670 } 2671 2672 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2673 switch(typ) { 2674 case T_BYTE: 2675 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2676 break; 2677 case T_SHORT: 2678 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2679 break; 2680 case T_INT: 2681 case T_FLOAT: 2682 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2683 break; 2684 case T_LONG: 2685 case T_DOUBLE: 2686 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2687 break; 2688 default: 2689 assert(false,"Should not reach here."); 2690 break; 2691 } 2692 } 2693 2694 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2695 
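  // Sets ZF/CF for a VectorTest by combining src1 and src2 with
  // ptest/vptest/vtestps. For vectors shorter than 16 bytes the low part of
  // src1 is duplicated across the register first (see below) so that only the
  // valid elements determine the flags.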
assert(vlen_in_bytes <= 32, ""); 2696 int esize = type2aelembytes(bt); 2697 if (vlen_in_bytes == 32) { 2698 assert(vtmp == xnoreg, "required."); 2699 if (esize >= 4) { 2700 vtestps(src1, src2, AVX_256bit); 2701 } else { 2702 vptest(src1, src2, AVX_256bit); 2703 } 2704 return; 2705 } 2706 if (vlen_in_bytes < 16) { 2707 // Duplicate the lower part to fill the whole register, 2708 // Don't need to do so for src2 2709 assert(vtmp != xnoreg, "required"); 2710 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2711 pshufd(vtmp, src1, shuffle_imm); 2712 } else { 2713 assert(vtmp == xnoreg, "required"); 2714 vtmp = src1; 2715 } 2716 if (esize >= 4 && VM_Version::supports_avx()) { 2717 vtestps(vtmp, src2, AVX_128bit); 2718 } else { 2719 ptest(vtmp, src2); 2720 } 2721 } 2722 2723 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2724 #ifdef ASSERT 2725 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2726 bool is_bw_supported = VM_Version::supports_avx512bw(); 2727 if (is_bw && !is_bw_supported) { 2728 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2729 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2730 "XMM register should be 0-15"); 2731 } 2732 #endif // ASSERT 2733 switch (elem_bt) { 2734 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2735 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2736 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2737 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2738 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2739 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2740 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2741 } 2742 } 2743 2744 #ifdef _LP64 2745 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2746 assert(UseAVX >= 2, "required"); 2747 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2748 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2749 if ((UseAVX > 2) && 2750 (!is_bw || VM_Version::supports_avx512bw()) && 2751 (!is_vl || VM_Version::supports_avx512vl())) { 2752 switch (elem_bt) { 2753 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2754 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2755 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2756 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2757 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2758 } 2759 } else { 2760 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2761 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2762 switch (elem_bt) { 2763 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2764 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2765 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2766 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2767 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2768 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2769 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2770 } 2771 } 2772 } 2773 #endif 2774 2775 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2776 switch (to_elem_bt) { 2777 case T_SHORT: 2778 vpmovsxbw(dst, src, vlen_enc); 2779 break; 2780 case T_INT: 2781 
vpmovsxbd(dst, src, vlen_enc); 2782 break; 2783 case T_FLOAT: 2784 vpmovsxbd(dst, src, vlen_enc); 2785 vcvtdq2ps(dst, dst, vlen_enc); 2786 break; 2787 case T_LONG: 2788 vpmovsxbq(dst, src, vlen_enc); 2789 break; 2790 case T_DOUBLE: { 2791 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2792 vpmovsxbd(dst, src, mid_vlen_enc); 2793 vcvtdq2pd(dst, dst, vlen_enc); 2794 break; 2795 } 2796 default: 2797 fatal("Unsupported type %s", type2name(to_elem_bt)); 2798 break; 2799 } 2800 } 2801 2802 //------------------------------------------------------------------------------------------- 2803 2804 // IndexOf for constant substrings with size >= 8 chars 2805 // which don't need to be loaded through stack. 2806 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2807 Register cnt1, Register cnt2, 2808 int int_cnt2, Register result, 2809 XMMRegister vec, Register tmp, 2810 int ae) { 2811 ShortBranchVerifier sbv(this); 2812 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2813 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2814 2815 // This method uses the pcmpestri instruction with bound registers 2816 // inputs: 2817 // xmm - substring 2818 // rax - substring length (elements count) 2819 // mem - scanned string 2820 // rdx - string length (elements count) 2821 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2822 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2823 // outputs: 2824 // rcx - matched index in string 2825 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2826 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2827 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2828 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2829 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2830 2831 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2832 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2833 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2834 2835 // Note, inline_string_indexOf() generates checks: 2836 // if (substr.count > string.count) return -1; 2837 // if (substr.count == 0) return 0; 2838 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2839 2840 // Load substring. 2841 if (ae == StrIntrinsicNode::UL) { 2842 pmovzxbw(vec, Address(str2, 0)); 2843 } else { 2844 movdqu(vec, Address(str2, 0)); 2845 } 2846 movl(cnt2, int_cnt2); 2847 movptr(result, str1); // string addr 2848 2849 if (int_cnt2 > stride) { 2850 jmpb(SCAN_TO_SUBSTR); 2851 2852 // Reload substr for rescan, this code 2853 // is executed only for large substrings (> 8 chars) 2854 bind(RELOAD_SUBSTR); 2855 if (ae == StrIntrinsicNode::UL) { 2856 pmovzxbw(vec, Address(str2, 0)); 2857 } else { 2858 movdqu(vec, Address(str2, 0)); 2859 } 2860 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2861 2862 bind(RELOAD_STR); 2863 // We came here after the beginning of the substring was 2864 // matched but the rest of it was not so we need to search 2865 // again. Start from the next element after the previous match. 2866 2867 // cnt2 is number of substring reminding elements and 2868 // cnt1 is number of string reminding elements when cmp failed. 
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
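    // Convert cnt2 into a negative element offset from the end of the
    // substring (cnt2 = stride - cnt2): the SCAN_SUBSTR loop below steps
    // through the remaining tail one 16-byte vector (stride elements) at a
    // time by adding 'stride' and looping while the counter is still negative.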
2931 negptr(cnt2); 2932 addptr(cnt2, stride); 2933 2934 bind(SCAN_SUBSTR); 2935 subl(cnt1, stride); 2936 cmpl(cnt2, -stride); // Do not read beyond substring 2937 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2938 // Back-up strings to avoid reading beyond substring: 2939 // cnt1 = cnt1 - cnt2 + 8 2940 addl(cnt1, cnt2); // cnt2 is negative 2941 addl(cnt1, stride); 2942 movl(cnt2, stride); negptr(cnt2); 2943 bind(CONT_SCAN_SUBSTR); 2944 if (int_cnt2 < (int)G) { 2945 int tail_off1 = int_cnt2<<scale1; 2946 int tail_off2 = int_cnt2<<scale2; 2947 if (ae == StrIntrinsicNode::UL) { 2948 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2949 } else { 2950 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2951 } 2952 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2953 } else { 2954 // calculate index in register to avoid integer overflow (int_cnt2*2) 2955 movl(tmp, int_cnt2); 2956 addptr(tmp, cnt2); 2957 if (ae == StrIntrinsicNode::UL) { 2958 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2959 } else { 2960 movdqu(vec, Address(str2, tmp, scale2, 0)); 2961 } 2962 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2963 } 2964 // Need to reload strings pointers if not matched whole vector 2965 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2966 addptr(cnt2, stride); 2967 jcc(Assembler::negative, SCAN_SUBSTR); 2968 // Fall through if found full substring 2969 2970 } // (int_cnt2 > 8) 2971 2972 bind(RET_FOUND); 2973 // Found result if we matched full small substring. 2974 // Compute substr offset 2975 subptr(result, str1); 2976 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2977 shrl(result, 1); // index 2978 } 2979 bind(EXIT); 2980 2981 } // string_indexofC8 2982 2983 // Small strings are loaded through stack if they cross page boundary. 2984 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2985 Register cnt1, Register cnt2, 2986 int int_cnt2, Register result, 2987 XMMRegister vec, Register tmp, 2988 int ae) { 2989 ShortBranchVerifier sbv(this); 2990 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2991 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2992 2993 // 2994 // int_cnt2 is length of small (< 8 chars) constant substring 2995 // or (-1) for non constant substring in which case its length 2996 // is in cnt2 register. 2997 // 2998 // Note, inline_string_indexOf() generates checks: 2999 // if (substr.count > string.count) return -1; 3000 // if (substr.count == 0) return 0; 3001 // 3002 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3003 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3004 // This method uses the pcmpestri instruction with bound registers 3005 // inputs: 3006 // xmm - substring 3007 // rax - substring length (elements count) 3008 // mem - scanned string 3009 // rdx - string length (elements count) 3010 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3011 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3012 // outputs: 3013 // rcx - matched index in string 3014 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3015 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3016 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3017 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3018 3019 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3020 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3021 FOUND_CANDIDATE; 3022 3023 { //======================================================== 3024 // We don't know where these strings are located 3025 // and we can't read beyond them. Load them through stack. 3026 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3027 3028 movptr(tmp, rsp); // save old SP 3029 3030 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3031 if (int_cnt2 == (1>>scale2)) { // One byte 3032 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3033 load_unsigned_byte(result, Address(str2, 0)); 3034 movdl(vec, result); // move 32 bits 3035 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3036 // Not enough header space in 32-bit VM: 12+3 = 15. 3037 movl(result, Address(str2, -1)); 3038 shrl(result, 8); 3039 movdl(vec, result); // move 32 bits 3040 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3041 load_unsigned_short(result, Address(str2, 0)); 3042 movdl(vec, result); // move 32 bits 3043 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3044 movdl(vec, Address(str2, 0)); // move 32 bits 3045 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3046 movq(vec, Address(str2, 0)); // move 64 bits 3047 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3048 // Array header size is 12 bytes in 32-bit VM 3049 // + 6 bytes for 3 chars == 18 bytes, 3050 // enough space to load vec and shift. 3051 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3052 if (ae == StrIntrinsicNode::UL) { 3053 int tail_off = int_cnt2-8; 3054 pmovzxbw(vec, Address(str2, tail_off)); 3055 psrldq(vec, -2*tail_off); 3056 } 3057 else { 3058 int tail_off = int_cnt2*(1<<scale2); 3059 movdqu(vec, Address(str2, tail_off-16)); 3060 psrldq(vec, 16-tail_off); 3061 } 3062 } 3063 } else { // not constant substring 3064 cmpl(cnt2, stride); 3065 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3066 3067 // We can read beyond string if srt+16 does not cross page boundary 3068 // since heaps are aligned and mapped by pages. 3069 assert(os::vm_page_size() < (int)G, "default page should be small"); 3070 movl(result, str2); // We need only low 32 bits 3071 andl(result, ((int)os::vm_page_size()-1)); 3072 cmpl(result, ((int)os::vm_page_size()-16)); 3073 jccb(Assembler::belowEqual, CHECK_STR); 3074 3075 // Move small strings to stack to allow load 16 bytes into vec. 3076 subptr(rsp, 16); 3077 int stk_offset = wordSize-(1<<scale2); 3078 push(cnt2); 3079 3080 bind(COPY_SUBSTR); 3081 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3082 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3083 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3084 } else if (ae == StrIntrinsicNode::UU) { 3085 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3086 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3087 } 3088 decrement(cnt2); 3089 jccb(Assembler::notZero, COPY_SUBSTR); 3090 3091 pop(cnt2); 3092 movptr(str2, rsp); // New substring address 3093 } // non constant 3094 3095 bind(CHECK_STR); 3096 cmpl(cnt1, stride); 3097 jccb(Assembler::aboveEqual, BIG_STRINGS); 3098 3099 // Check cross page boundary. 
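    // A 16-byte pcmpestri load starting at str1 may read past the logical end
    // of a short string; that is only safe when the load cannot cross into the
    // next (possibly unmapped) page, i.e. when
    // (str1 mod page_size) <= page_size - 16.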
3100 movl(result, str1); // We need only low 32 bits 3101 andl(result, ((int)os::vm_page_size()-1)); 3102 cmpl(result, ((int)os::vm_page_size()-16)); 3103 jccb(Assembler::belowEqual, BIG_STRINGS); 3104 3105 subptr(rsp, 16); 3106 int stk_offset = -(1<<scale1); 3107 if (int_cnt2 < 0) { // not constant 3108 push(cnt2); 3109 stk_offset += wordSize; 3110 } 3111 movl(cnt2, cnt1); 3112 3113 bind(COPY_STR); 3114 if (ae == StrIntrinsicNode::LL) { 3115 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3116 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3117 } else { 3118 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3119 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3120 } 3121 decrement(cnt2); 3122 jccb(Assembler::notZero, COPY_STR); 3123 3124 if (int_cnt2 < 0) { // not constant 3125 pop(cnt2); 3126 } 3127 movptr(str1, rsp); // New string address 3128 3129 bind(BIG_STRINGS); 3130 // Load substring. 3131 if (int_cnt2 < 0) { // -1 3132 if (ae == StrIntrinsicNode::UL) { 3133 pmovzxbw(vec, Address(str2, 0)); 3134 } else { 3135 movdqu(vec, Address(str2, 0)); 3136 } 3137 push(cnt2); // substr count 3138 push(str2); // substr addr 3139 push(str1); // string addr 3140 } else { 3141 // Small (< 8 chars) constant substrings are loaded already. 3142 movl(cnt2, int_cnt2); 3143 } 3144 push(tmp); // original SP 3145 3146 } // Finished loading 3147 3148 //======================================================== 3149 // Start search 3150 // 3151 3152 movptr(result, str1); // string addr 3153 3154 if (int_cnt2 < 0) { // Only for non constant substring 3155 jmpb(SCAN_TO_SUBSTR); 3156 3157 // SP saved at sp+0 3158 // String saved at sp+1*wordSize 3159 // Substr saved at sp+2*wordSize 3160 // Substr count saved at sp+3*wordSize 3161 3162 // Reload substr for rescan, this code 3163 // is executed only for large substrings (> 8 chars) 3164 bind(RELOAD_SUBSTR); 3165 movptr(str2, Address(rsp, 2*wordSize)); 3166 movl(cnt2, Address(rsp, 3*wordSize)); 3167 if (ae == StrIntrinsicNode::UL) { 3168 pmovzxbw(vec, Address(str2, 0)); 3169 } else { 3170 movdqu(vec, Address(str2, 0)); 3171 } 3172 // We came here after the beginning of the substring was 3173 // matched but the rest of it was not so we need to search 3174 // again. Start from the next element after the previous match. 3175 subptr(str1, result); // Restore counter 3176 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3177 shrl(str1, 1); 3178 } 3179 addl(cnt1, str1); 3180 decrementl(cnt1); // Shift to next element 3181 cmpl(cnt1, cnt2); 3182 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3183 3184 addptr(result, (1<<scale1)); 3185 } // non constant 3186 3187 // Scan string for start of substr in 16-byte vectors 3188 bind(SCAN_TO_SUBSTR); 3189 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3190 pcmpestri(vec, Address(result, 0), mode); 3191 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3192 subl(cnt1, stride); 3193 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3194 cmpl(cnt1, cnt2); 3195 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3196 addptr(result, 16); 3197 3198 bind(ADJUST_STR); 3199 cmpl(cnt1, stride); // Do not read beyond string 3200 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3201 // Back-up string to avoid reading beyond string. 
lea(result, Address(result, cnt1, scale1, -16)); 3203 movl(cnt1, stride); 3204 jmpb(SCAN_TO_SUBSTR); 3205 3206 // Found a potential substr 3207 bind(FOUND_CANDIDATE); 3208 // After pcmpestri tmp(rcx) contains matched element index 3209 3210 // Make sure string is still long enough 3211 subl(cnt1, tmp); 3212 cmpl(cnt1, cnt2); 3213 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3214 // Left less than substring. 3215 3216 bind(RET_NOT_FOUND); 3217 movl(result, -1); 3218 jmp(CLEANUP); 3219 3220 bind(FOUND_SUBSTR); 3221 // Compute start addr of substr 3222 lea(result, Address(result, tmp, scale1)); 3223 if (int_cnt2 > 0) { // Constant substring 3224 // Repeat search for small substring (< 8 chars) 3225 // from new point without reloading substring. 3226 // Have to check that we don't read beyond string. 3227 cmpl(tmp, stride-int_cnt2); 3228 jccb(Assembler::greater, ADJUST_STR); 3229 // Fall through if matched whole substring. 3230 } else { // non constant 3231 assert(int_cnt2 == -1, "should be != 0"); 3232 3233 addl(tmp, cnt2); 3234 // Found result if we matched whole substring. 3235 cmpl(tmp, stride); 3236 jcc(Assembler::lessEqual, RET_FOUND); 3237 3238 // Repeat search for small substring (<= 8 chars) 3239 // from new point 'str1' without reloading substring. 3240 cmpl(cnt2, stride); 3241 // Have to check that we don't read beyond string. 3242 jccb(Assembler::lessEqual, ADJUST_STR); 3243 3244 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3245 // Compare the rest of substring (> 8 chars). 3246 movptr(str1, result); 3247 3248 cmpl(tmp, cnt2); 3249 // First 8 chars are already matched. 3250 jccb(Assembler::equal, CHECK_NEXT); 3251 3252 bind(SCAN_SUBSTR); 3253 pcmpestri(vec, Address(str1, 0), mode); 3254 // Need to reload string pointers if not matched whole vector 3255 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3256 3257 bind(CHECK_NEXT); 3258 subl(cnt2, stride); 3259 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3260 addptr(str1, 16); 3261 if (ae == StrIntrinsicNode::UL) { 3262 addptr(str2, 8); 3263 } else { 3264 addptr(str2, 16); 3265 } 3266 subl(cnt1, stride); 3267 cmpl(cnt2, stride); // Do not read beyond substring 3268 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3269 // Back-up strings to avoid reading beyond substring.
3270 3271 if (ae == StrIntrinsicNode::UL) { 3272 lea(str2, Address(str2, cnt2, scale2, -8)); 3273 lea(str1, Address(str1, cnt2, scale1, -16)); 3274 } else { 3275 lea(str2, Address(str2, cnt2, scale2, -16)); 3276 lea(str1, Address(str1, cnt2, scale1, -16)); 3277 } 3278 subl(cnt1, cnt2); 3279 movl(cnt2, stride); 3280 addl(cnt1, stride); 3281 bind(CONT_SCAN_SUBSTR); 3282 if (ae == StrIntrinsicNode::UL) { 3283 pmovzxbw(vec, Address(str2, 0)); 3284 } else { 3285 movdqu(vec, Address(str2, 0)); 3286 } 3287 jmp(SCAN_SUBSTR); 3288 3289 bind(RET_FOUND_LONG); 3290 movptr(str1, Address(rsp, wordSize)); 3291 } // non constant 3292 3293 bind(RET_FOUND); 3294 // Compute substr offset 3295 subptr(result, str1); 3296 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3297 shrl(result, 1); // index 3298 } 3299 bind(CLEANUP); 3300 pop(rsp); // restore SP 3301 3302 } // string_indexof 3303 3304 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3305 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3306 ShortBranchVerifier sbv(this); 3307 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3308 3309 int stride = 8; 3310 3311 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3312 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3313 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3314 FOUND_SEQ_CHAR, DONE_LABEL; 3315 3316 movptr(result, str1); 3317 if (UseAVX >= 2) { 3318 cmpl(cnt1, stride); 3319 jcc(Assembler::less, SCAN_TO_CHAR); 3320 cmpl(cnt1, 2*stride); 3321 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3322 movdl(vec1, ch); 3323 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3324 vpxor(vec2, vec2); 3325 movl(tmp, cnt1); 3326 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3327 andl(cnt1,0x0000000F); //tail count (in chars) 3328 3329 bind(SCAN_TO_16_CHAR_LOOP); 3330 vmovdqu(vec3, Address(result, 0)); 3331 vpcmpeqw(vec3, vec3, vec1, 1); 3332 vptest(vec2, vec3); 3333 jcc(Assembler::carryClear, FOUND_CHAR); 3334 addptr(result, 32); 3335 subl(tmp, 2*stride); 3336 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3337 jmp(SCAN_TO_8_CHAR); 3338 bind(SCAN_TO_8_CHAR_INIT); 3339 movdl(vec1, ch); 3340 pshuflw(vec1, vec1, 0x00); 3341 pshufd(vec1, vec1, 0); 3342 pxor(vec2, vec2); 3343 } 3344 bind(SCAN_TO_8_CHAR); 3345 cmpl(cnt1, stride); 3346 jcc(Assembler::less, SCAN_TO_CHAR); 3347 if (UseAVX < 2) { 3348 movdl(vec1, ch); 3349 pshuflw(vec1, vec1, 0x00); 3350 pshufd(vec1, vec1, 0); 3351 pxor(vec2, vec2); 3352 } 3353 movl(tmp, cnt1); 3354 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3355 andl(cnt1,0x00000007); //tail count (in chars) 3356 3357 bind(SCAN_TO_8_CHAR_LOOP); 3358 movdqu(vec3, Address(result, 0)); 3359 pcmpeqw(vec3, vec1); 3360 ptest(vec2, vec3); 3361 jcc(Assembler::carryClear, FOUND_CHAR); 3362 addptr(result, 16); 3363 subl(tmp, stride); 3364 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3365 bind(SCAN_TO_CHAR); 3366 testl(cnt1, cnt1); 3367 jcc(Assembler::zero, RET_NOT_FOUND); 3368 bind(SCAN_TO_CHAR_LOOP); 3369 load_unsigned_short(tmp, Address(result, 0)); 3370 cmpl(ch, tmp); 3371 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3372 addptr(result, 2); 3373 subl(cnt1, 1); 3374 jccb(Assembler::zero, RET_NOT_FOUND); 3375 jmp(SCAN_TO_CHAR_LOOP); 3376 3377 bind(RET_NOT_FOUND); 3378 movl(result, -1); 3379 jmpb(DONE_LABEL); 3380 3381 bind(FOUND_CHAR); 3382 if (UseAVX >= 2) { 3383 vpmovmskb(tmp, vec3); 3384 } else { 3385 pmovmskb(tmp, vec3); 3386 } 3387 bsfl(ch, tmp); 3388 addptr(result, ch); 3389 3390 bind(FOUND_SEQ_CHAR); 3391 
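// At FOUND_SEQ_CHAR, result is the address of the matching UTF-16 element
// (the vector paths above convert the compare mask into a byte offset with
// pmovmskb/bsf and add it to result first). The subtraction and shift that
// follow turn that address into a char index relative to str1.
// Illustrative scalar equivalent of the whole routine (not emitted code):
//
//   int indexof_char(const jchar* s, int cnt, jchar ch) {
//     for (int i = 0; i < cnt; i++) {
//       if (s[i] == ch) return i;
//     }
//     return -1;
//   }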
subptr(result, str1); 3392 shrl(result, 1); 3393 3394 bind(DONE_LABEL); 3395 } // string_indexof_char 3396 3397 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3398 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3399 ShortBranchVerifier sbv(this); 3400 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3401 3402 int stride = 16; 3403 3404 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3405 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3406 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3407 FOUND_SEQ_CHAR, DONE_LABEL; 3408 3409 movptr(result, str1); 3410 if (UseAVX >= 2) { 3411 cmpl(cnt1, stride); 3412 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3413 cmpl(cnt1, stride*2); 3414 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3415 movdl(vec1, ch); 3416 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3417 vpxor(vec2, vec2); 3418 movl(tmp, cnt1); 3419 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3420 andl(cnt1,0x0000001F); //tail count (in chars) 3421 3422 bind(SCAN_TO_32_CHAR_LOOP); 3423 vmovdqu(vec3, Address(result, 0)); 3424 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3425 vptest(vec2, vec3); 3426 jcc(Assembler::carryClear, FOUND_CHAR); 3427 addptr(result, 32); 3428 subl(tmp, stride*2); 3429 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3430 jmp(SCAN_TO_16_CHAR); 3431 3432 bind(SCAN_TO_16_CHAR_INIT); 3433 movdl(vec1, ch); 3434 pxor(vec2, vec2); 3435 pshufb(vec1, vec2); 3436 } 3437 3438 bind(SCAN_TO_16_CHAR); 3439 cmpl(cnt1, stride); 3440 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3441 if (UseAVX < 2) { 3442 movdl(vec1, ch); 3443 pxor(vec2, vec2); 3444 pshufb(vec1, vec2); 3445 } 3446 movl(tmp, cnt1); 3447 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3448 andl(cnt1,0x0000000F); //tail count (in bytes) 3449 3450 bind(SCAN_TO_16_CHAR_LOOP); 3451 movdqu(vec3, Address(result, 0)); 3452 pcmpeqb(vec3, vec1); 3453 ptest(vec2, vec3); 3454 jcc(Assembler::carryClear, FOUND_CHAR); 3455 addptr(result, 16); 3456 subl(tmp, stride); 3457 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
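// What remains is a tail of fewer than 16 bytes, scanned one byte at a time
// below. Illustrative scalar equivalent of this Latin-1 variant (not
// emitted code):
//
//   int indexof_byte(const jbyte* s, int cnt, int ch) {
//     for (int i = 0; i < cnt; i++) {
//       if ((s[i] & 0xff) == ch) return i;
//     }
//     return -1;
//   }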
3458 3459 bind(SCAN_TO_CHAR_INIT); 3460 testl(cnt1, cnt1); 3461 jcc(Assembler::zero, RET_NOT_FOUND); 3462 bind(SCAN_TO_CHAR_LOOP); 3463 load_unsigned_byte(tmp, Address(result, 0)); 3464 cmpl(ch, tmp); 3465 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3466 addptr(result, 1); 3467 subl(cnt1, 1); 3468 jccb(Assembler::zero, RET_NOT_FOUND); 3469 jmp(SCAN_TO_CHAR_LOOP); 3470 3471 bind(RET_NOT_FOUND); 3472 movl(result, -1); 3473 jmpb(DONE_LABEL); 3474 3475 bind(FOUND_CHAR); 3476 if (UseAVX >= 2) { 3477 vpmovmskb(tmp, vec3); 3478 } else { 3479 pmovmskb(tmp, vec3); 3480 } 3481 bsfl(ch, tmp); 3482 addptr(result, ch); 3483 3484 bind(FOUND_SEQ_CHAR); 3485 subptr(result, str1); 3486 3487 bind(DONE_LABEL); 3488 } // stringL_indexof_char 3489 3490 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3491 switch (eltype) { 3492 case T_BOOLEAN: return sizeof(jboolean); 3493 case T_BYTE: return sizeof(jbyte); 3494 case T_SHORT: return sizeof(jshort); 3495 case T_CHAR: return sizeof(jchar); 3496 case T_INT: return sizeof(jint); 3497 default: 3498 ShouldNotReachHere(); 3499 return -1; 3500 } 3501 } 3502 3503 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3504 switch (eltype) { 3505 // T_BOOLEAN used as surrogate for unsigned byte 3506 case T_BOOLEAN: movzbl(dst, src); break; 3507 case T_BYTE: movsbl(dst, src); break; 3508 case T_SHORT: movswl(dst, src); break; 3509 case T_CHAR: movzwl(dst, src); break; 3510 case T_INT: movl(dst, src); break; 3511 default: 3512 ShouldNotReachHere(); 3513 } 3514 } 3515 3516 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3517 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3518 } 3519 3520 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3521 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3522 } 3523 3524 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3525 const int vlen = Assembler::AVX_256bit; 3526 switch (eltype) { 3527 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3528 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3529 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3530 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3531 case T_INT: 3532 // do nothing 3533 break; 3534 default: 3535 ShouldNotReachHere(); 3536 } 3537 } 3538 3539 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3540 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3541 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3542 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3543 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3544 BasicType eltype) { 3545 ShortBranchVerifier sbv(this); 3546 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3547 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3548 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3549 3550 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3551 SHORT_UNROLLED_LOOP_EXIT, 3552 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3553 UNROLLED_VECTOR_LOOP_BEGIN, 3554 END; 3555 switch (eltype) { 3556 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3557 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3558 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3559 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3560 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3561 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3562 } 3563 3564 // For "renaming" for readibility of the code 3565 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3566 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3567 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3568 3569 const int elsize = arrays_hashcode_elsize(eltype); 3570 3571 /* 3572 if (cnt1 >= 2) { 3573 if (cnt1 >= 32) { 3574 UNROLLED VECTOR LOOP 3575 } 3576 UNROLLED SCALAR LOOP 3577 } 3578 SINGLE SCALAR 3579 */ 3580 3581 cmpl(cnt1, 32); 3582 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3583 3584 // cnt1 >= 32 && generate_vectorized_loop 3585 xorl(index, index); 3586 3587 // vresult = IntVector.zero(I256); 3588 for (int idx = 0; idx < 4; idx++) { 3589 vpxor(vresult[idx], vresult[idx]); 3590 } 3591 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3592 Register bound = tmp2; 3593 Register next = tmp3; 3594 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3595 movl(next, Address(tmp2, 0)); 3596 movdl(vnext, next); 3597 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3598 3599 // index = 0; 3600 // bound = cnt1 & ~(32 - 1); 3601 movl(bound, cnt1); 3602 andl(bound, ~(32 - 1)); 3603 // for (; index < bound; index += 32) { 3604 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3605 // result *= next; 3606 imull(result, next); 3607 // loop fission to upfront the cost of fetching from memory, OOO execution 3608 // can then hopefully do a better job of prefetching 3609 for (int idx = 0; idx < 4; idx++) { 3610 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3611 } 3612 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3613 for (int idx = 0; idx < 4; idx++) { 3614 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3615 arrays_hashcode_elvcast(vtmp[idx], eltype); 3616 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3617 } 3618 // index += 32; 3619 addl(index, 32); 3620 // index < bound; 3621 cmpl(index, bound); 3622 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3623 // } 3624 3625 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3626 subl(cnt1, bound); 3627 // release bound 3628 3629 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3630 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3631 for (int idx = 0; idx < 4; idx++) { 3632 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT); 3633 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3634 } 3635 // result += vresult.reduceLanes(ADD); 3636 for (int idx = 0; idx < 4; idx++) { 3637 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3638 } 3639 3640 // } else if (cnt1 < 32) { 3641 3642 bind(SHORT_UNROLLED_BEGIN); 3643 // int i = 1; 3644 movl(index, 1); 3645 cmpl(index, cnt1); 3646 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3647 3648 // for (; i < cnt1 ; i += 2) { 3649 bind(SHORT_UNROLLED_LOOP_BEGIN); 3650 movl(tmp3, 961); 3651 imull(result, tmp3); 
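// 961 == 31 * 31: each iteration of this loop consumes two elements, so the
// running hash is first advanced by 31^2, and the (x << 5) - x sequence
// below forms 31 * x without a multiply. Illustrative scalar form of one
// unrolled step (not emitted code; a stands for the ary1 elements):
//
//   result = result * 961 + a[index - 1] * 31 + a[index];
//   index += 2;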
3652 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3653 movl(tmp3, tmp2); 3654 shll(tmp3, 5); 3655 subl(tmp3, tmp2); 3656 addl(result, tmp3); 3657 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3658 addl(result, tmp3); 3659 addl(index, 2); 3660 cmpl(index, cnt1); 3661 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3662 3663 // } 3664 // if (i >= cnt1) { 3665 bind(SHORT_UNROLLED_LOOP_EXIT); 3666 jccb(Assembler::greater, END); 3667 movl(tmp2, result); 3668 shll(result, 5); 3669 subl(result, tmp2); 3670 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3671 addl(result, tmp3); 3672 // } 3673 bind(END); 3674 3675 BLOCK_COMMENT("} // arrays_hashcode"); 3676 3677 } // arrays_hashcode 3678 3679 // helper function for string_compare 3680 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3681 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3682 Address::ScaleFactor scale2, Register index, int ae) { 3683 if (ae == StrIntrinsicNode::LL) { 3684 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3685 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3686 } else if (ae == StrIntrinsicNode::UU) { 3687 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3688 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3689 } else { 3690 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3691 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3692 } 3693 } 3694 3695 // Compare strings, used for char[] and byte[]. 3696 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3697 Register cnt1, Register cnt2, Register result, 3698 XMMRegister vec1, int ae, KRegister mask) { 3699 ShortBranchVerifier sbv(this); 3700 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3701 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3702 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3703 int stride2x2 = 0x40; 3704 Address::ScaleFactor scale = Address::no_scale; 3705 Address::ScaleFactor scale1 = Address::no_scale; 3706 Address::ScaleFactor scale2 = Address::no_scale; 3707 3708 if (ae != StrIntrinsicNode::LL) { 3709 stride2x2 = 0x20; 3710 } 3711 3712 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3713 shrl(cnt2, 1); 3714 } 3715 // Compute the minimum of the string lengths and the 3716 // difference of the string lengths (stack). 3717 // Do the conditional move stuff 3718 movl(result, cnt1); 3719 subl(cnt1, cnt2); 3720 push(cnt1); 3721 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3722 3723 // Is the minimum length zero? 
3724 testl(cnt2, cnt2); 3725 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3726 if (ae == StrIntrinsicNode::LL) { 3727 // Load first bytes 3728 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3729 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3730 } else if (ae == StrIntrinsicNode::UU) { 3731 // Load first characters 3732 load_unsigned_short(result, Address(str1, 0)); 3733 load_unsigned_short(cnt1, Address(str2, 0)); 3734 } else { 3735 load_unsigned_byte(result, Address(str1, 0)); 3736 load_unsigned_short(cnt1, Address(str2, 0)); 3737 } 3738 subl(result, cnt1); 3739 jcc(Assembler::notZero, POP_LABEL); 3740 3741 if (ae == StrIntrinsicNode::UU) { 3742 // Divide length by 2 to get number of chars 3743 shrl(cnt2, 1); 3744 } 3745 cmpl(cnt2, 1); 3746 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3747 3748 // Check if the strings start at the same location and setup scale and stride 3749 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3750 cmpptr(str1, str2); 3751 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3752 if (ae == StrIntrinsicNode::LL) { 3753 scale = Address::times_1; 3754 stride = 16; 3755 } else { 3756 scale = Address::times_2; 3757 stride = 8; 3758 } 3759 } else { 3760 scale1 = Address::times_1; 3761 scale2 = Address::times_2; 3762 // scale not used 3763 stride = 8; 3764 } 3765 3766 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3767 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3768 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3769 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3770 Label COMPARE_TAIL_LONG; 3771 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3772 3773 int pcmpmask = 0x19; 3774 if (ae == StrIntrinsicNode::LL) { 3775 pcmpmask &= ~0x01; 3776 } 3777 3778 // Setup to compare 16-chars (32-bytes) vectors, 3779 // start from first character again because it has aligned address. 3780 if (ae == StrIntrinsicNode::LL) { 3781 stride2 = 32; 3782 } else { 3783 stride2 = 16; 3784 } 3785 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3786 adr_stride = stride << scale; 3787 } else { 3788 adr_stride1 = 8; //stride << scale1; 3789 adr_stride2 = 16; //stride << scale2; 3790 } 3791 3792 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3793 // rax and rdx are used by pcmpestri as elements counters 3794 movl(result, cnt2); 3795 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3796 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3797 3798 // fast path : compare first 2 8-char vectors. 
3799 bind(COMPARE_16_CHARS); 3800 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3801 movdqu(vec1, Address(str1, 0)); 3802 } else { 3803 pmovzxbw(vec1, Address(str1, 0)); 3804 } 3805 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3806 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3807 3808 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3809 movdqu(vec1, Address(str1, adr_stride)); 3810 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3811 } else { 3812 pmovzxbw(vec1, Address(str1, adr_stride1)); 3813 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3814 } 3815 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3816 addl(cnt1, stride); 3817 3818 // Compare the characters at index in cnt1 3819 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3820 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3821 subl(result, cnt2); 3822 jmp(POP_LABEL); 3823 3824 // Setup the registers to start vector comparison loop 3825 bind(COMPARE_WIDE_VECTORS); 3826 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3827 lea(str1, Address(str1, result, scale)); 3828 lea(str2, Address(str2, result, scale)); 3829 } else { 3830 lea(str1, Address(str1, result, scale1)); 3831 lea(str2, Address(str2, result, scale2)); 3832 } 3833 subl(result, stride2); 3834 subl(cnt2, stride2); 3835 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3836 negptr(result); 3837 3838 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3839 bind(COMPARE_WIDE_VECTORS_LOOP); 3840 3841 #ifdef _LP64 3842 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3843 cmpl(cnt2, stride2x2); 3844 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3845 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3846 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3847 3848 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3849 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3850 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3851 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3852 } else { 3853 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3854 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3855 } 3856 kortestql(mask, mask); 3857 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3858 addptr(result, stride2x2); // update since we already compared at this addr 3859 subl(cnt2, stride2x2); // and sub the size too 3860 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3861 3862 vpxor(vec1, vec1); 3863 jmpb(COMPARE_WIDE_TAIL); 3864 }//if (VM_Version::supports_avx512vlbw()) 3865 #endif // _LP64 3866 3867 3868 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3869 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3870 vmovdqu(vec1, Address(str1, result, scale)); 3871 vpxor(vec1, Address(str2, result, scale)); 3872 } else { 3873 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3874 vpxor(vec1, Address(str2, result, scale2)); 3875 } 3876 vptest(vec1, vec1); 3877 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3878 addptr(result, stride2); 3879 subl(cnt2, stride2); 3880 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3881 // clean upper bits of YMM registers 
vpxor(vec1, vec1); 3883 3884 // compare wide vectors tail 3885 bind(COMPARE_WIDE_TAIL); 3886 testptr(result, result); 3887 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3888 3889 movl(result, stride2); 3890 movl(cnt2, result); 3891 negptr(result); 3892 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3893 3894 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3895 bind(VECTOR_NOT_EQUAL); 3896 // clean upper bits of YMM registers 3897 vpxor(vec1, vec1); 3898 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3899 lea(str1, Address(str1, result, scale)); 3900 lea(str2, Address(str2, result, scale)); 3901 } else { 3902 lea(str1, Address(str1, result, scale1)); 3903 lea(str2, Address(str2, result, scale2)); 3904 } 3905 jmp(COMPARE_16_CHARS); 3906 3907 // Compare tail chars, length between 1 and 15 chars 3908 bind(COMPARE_TAIL_LONG); 3909 movl(cnt2, result); 3910 cmpl(cnt2, stride); 3911 jcc(Assembler::less, COMPARE_SMALL_STR); 3912 3913 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3914 movdqu(vec1, Address(str1, 0)); 3915 } else { 3916 pmovzxbw(vec1, Address(str1, 0)); 3917 } 3918 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3919 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3920 subptr(cnt2, stride); 3921 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3922 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3923 lea(str1, Address(str1, result, scale)); 3924 lea(str2, Address(str2, result, scale)); 3925 } else { 3926 lea(str1, Address(str1, result, scale1)); 3927 lea(str2, Address(str2, result, scale2)); 3928 } 3929 negptr(cnt2); 3930 jmpb(WHILE_HEAD_LABEL); 3931 3932 bind(COMPARE_SMALL_STR); 3933 } else if (UseSSE42Intrinsics) { 3934 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3935 int pcmpmask = 0x19; 3936 // Setup to compare 8-char (16-byte) vectors, 3937 // start from first character again because it has aligned address.
3938 movl(result, cnt2); 3939 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3940 if (ae == StrIntrinsicNode::LL) { 3941 pcmpmask &= ~0x01; 3942 } 3943 jcc(Assembler::zero, COMPARE_TAIL); 3944 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3945 lea(str1, Address(str1, result, scale)); 3946 lea(str2, Address(str2, result, scale)); 3947 } else { 3948 lea(str1, Address(str1, result, scale1)); 3949 lea(str2, Address(str2, result, scale2)); 3950 } 3951 negptr(result); 3952 3953 // pcmpestri 3954 // inputs: 3955 // vec1- substring 3956 // rax - negative string length (elements count) 3957 // mem - scanned string 3958 // rdx - string length (elements count) 3959 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3960 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3961 // outputs: 3962 // rcx - first mismatched element index 3963 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3964 3965 bind(COMPARE_WIDE_VECTORS); 3966 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3967 movdqu(vec1, Address(str1, result, scale)); 3968 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3969 } else { 3970 pmovzxbw(vec1, Address(str1, result, scale1)); 3971 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3972 } 3973 // After pcmpestri cnt1(rcx) contains mismatched element index 3974 3975 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3976 addptr(result, stride); 3977 subptr(cnt2, stride); 3978 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3979 3980 // compare wide vectors tail 3981 testptr(result, result); 3982 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3983 3984 movl(cnt2, stride); 3985 movl(result, stride); 3986 negptr(result); 3987 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3988 movdqu(vec1, Address(str1, result, scale)); 3989 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3990 } else { 3991 pmovzxbw(vec1, Address(str1, result, scale1)); 3992 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3993 } 3994 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3995 3996 // Mismatched characters in the vectors 3997 bind(VECTOR_NOT_EQUAL); 3998 addptr(cnt1, result); 3999 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4000 subl(result, cnt2); 4001 jmpb(POP_LABEL); 4002 4003 bind(COMPARE_TAIL); // limit is zero 4004 movl(cnt2, result); 4005 // Fallthru to tail compare 4006 } 4007 // Shift str2 and str1 to the end of the arrays, negate min 4008 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4009 lea(str1, Address(str1, cnt2, scale)); 4010 lea(str2, Address(str2, cnt2, scale)); 4011 } else { 4012 lea(str1, Address(str1, cnt2, scale1)); 4013 lea(str2, Address(str2, cnt2, scale2)); 4014 } 4015 decrementl(cnt2); // first character was compared already 4016 negptr(cnt2); 4017 4018 // Compare the rest of the elements 4019 bind(WHILE_HEAD_LABEL); 4020 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4021 subl(result, cnt1); 4022 jccb(Assembler::notZero, POP_LABEL); 4023 increment(cnt2); 4024 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4025 4026 // Strings are equal up to min length. Return the length difference. 
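// Illustrative scalar contract of string_compare (not emitted code;
// s1_at/s2_at stand for the encoding-dependent element loads):
//
//   int compare(int len1, int len2) {
//     int min = (len1 < len2) ? len1 : len2;
//     for (int i = 0; i < min; i++) {
//       int c1 = s1_at(i), c2 = s2_at(i);   // byte or char, per ae
//       if (c1 != c2) return c1 - c2;       // mismatch: POP_LABEL path
//     }
//     return len1 - len2;                   // prefix: LENGTH_DIFF_LABEL path
//   }
//
// For UL the negl at DONE_LABEL flips the sign of whichever value is
// returned.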
4027 bind(LENGTH_DIFF_LABEL); 4028 pop(result); 4029 if (ae == StrIntrinsicNode::UU) { 4030 // Divide diff by 2 to get number of chars 4031 sarl(result, 1); 4032 } 4033 jmpb(DONE_LABEL); 4034 4035 #ifdef _LP64 4036 if (VM_Version::supports_avx512vlbw()) { 4037 4038 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4039 4040 kmovql(cnt1, mask); 4041 notq(cnt1); 4042 bsfq(cnt2, cnt1); 4043 if (ae != StrIntrinsicNode::LL) { 4044 // Divide diff by 2 to get number of chars 4045 sarl(cnt2, 1); 4046 } 4047 addq(result, cnt2); 4048 if (ae == StrIntrinsicNode::LL) { 4049 load_unsigned_byte(cnt1, Address(str2, result)); 4050 load_unsigned_byte(result, Address(str1, result)); 4051 } else if (ae == StrIntrinsicNode::UU) { 4052 load_unsigned_short(cnt1, Address(str2, result, scale)); 4053 load_unsigned_short(result, Address(str1, result, scale)); 4054 } else { 4055 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4056 load_unsigned_byte(result, Address(str1, result, scale1)); 4057 } 4058 subl(result, cnt1); 4059 jmpb(POP_LABEL); 4060 }//if (VM_Version::supports_avx512vlbw()) 4061 #endif // _LP64 4062 4063 // Discard the stored length difference 4064 bind(POP_LABEL); 4065 pop(cnt1); 4066 4067 // That's it 4068 bind(DONE_LABEL); 4069 if(ae == StrIntrinsicNode::UL) { 4070 negl(result); 4071 } 4072 4073 } 4074 4075 // Search for Non-ASCII character (Negative byte value) in a byte array, 4076 // return the index of the first such character, otherwise the length 4077 // of the array segment searched. 4078 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4079 // @IntrinsicCandidate 4080 // public static int countPositives(byte[] ba, int off, int len) { 4081 // for (int i = off; i < off + len; i++) { 4082 // if (ba[i] < 0) { 4083 // return i - off; 4084 // } 4085 // } 4086 // return len; 4087 // } 4088 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4089 Register result, Register tmp1, 4090 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4091 // rsi: byte array 4092 // rcx: len 4093 // rax: result 4094 ShortBranchVerifier sbv(this); 4095 assert_different_registers(ary1, len, result, tmp1); 4096 assert_different_registers(vec1, vec2); 4097 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4098 4099 movl(result, len); // copy 4100 // len == 0 4101 testl(len, len); 4102 jcc(Assembler::zero, DONE); 4103 4104 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4105 VM_Version::supports_avx512vlbw() && 4106 VM_Version::supports_bmi2()) { 4107 4108 Label test_64_loop, test_tail, BREAK_LOOP; 4109 movl(tmp1, len); 4110 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4111 4112 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4113 andl(len, 0xffffffc0); // vector count (in chars) 4114 jccb(Assembler::zero, test_tail); 4115 4116 lea(ary1, Address(ary1, len, Address::times_1)); 4117 negptr(len); 4118 4119 bind(test_64_loop); 4120 // Check whether our 64 elements of size byte contain negatives 4121 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4122 kortestql(mask1, mask1); 4123 jcc(Assembler::notZero, BREAK_LOOP); 4124 4125 addptr(len, 64); 4126 jccb(Assembler::notZero, test_64_loop); 4127 4128 bind(test_tail); 4129 // bail out when there is nothing to be done 4130 testl(tmp1, -1); 4131 jcc(Assembler::zero, DONE); 4132 4133 4134 // check the tail for absense of negatives 4135 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4136 #ifdef _LP64 4137 { 4138 
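// The 64-bit path below materializes a mask with exactly tmp1 (the tail
// length, 1..63) low bits set, i.e. ~(~0 << tmp1), and moves it into mask2
// so the masked compare that follows only examines the live tail bytes.
// Illustrative scalar form (not emitted code):
//
//   uint64_t tail_mask(unsigned tail) {
//     return ~(~0ULL << tail);   // e.g. tail == 5 -> 0x1f
//   }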
Register tmp3_aliased = len; 4139 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4140 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4141 notq(tmp3_aliased); 4142 kmovql(mask2, tmp3_aliased); 4143 } 4144 #else 4145 Label k_init; 4146 jmp(k_init); 4147 4148 // We could not read 64-bits from a general purpose register thus we move 4149 // data required to compose 64 1's to the instruction stream 4150 // We emit 64 byte wide series of elements from 0..63 which later on would 4151 // be used as a compare targets with tail count contained in tmp1 register. 4152 // Result would be a k register having tmp1 consecutive number or 1 4153 // counting from least significant bit. 4154 address tmp = pc(); 4155 emit_int64(0x0706050403020100); 4156 emit_int64(0x0F0E0D0C0B0A0908); 4157 emit_int64(0x1716151413121110); 4158 emit_int64(0x1F1E1D1C1B1A1918); 4159 emit_int64(0x2726252423222120); 4160 emit_int64(0x2F2E2D2C2B2A2928); 4161 emit_int64(0x3736353433323130); 4162 emit_int64(0x3F3E3D3C3B3A3938); 4163 4164 bind(k_init); 4165 lea(len, InternalAddress(tmp)); 4166 // create mask to test for negative byte inside a vector 4167 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4168 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4169 4170 #endif 4171 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4172 ktestq(mask1, mask2); 4173 jcc(Assembler::zero, DONE); 4174 4175 // do a full check for negative registers in the tail 4176 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4177 // ary1 already pointing to the right place 4178 jmpb(TAIL_START); 4179 4180 bind(BREAK_LOOP); 4181 // At least one byte in the last 64 byte block was negative. 4182 // Set up to look at the last 64 bytes as if they were a tail 4183 lea(ary1, Address(ary1, len, Address::times_1)); 4184 addptr(result, len); 4185 // Ignore the very last byte: if all others are positive, 4186 // it must be negative, so we can skip right to the 2+1 byte 4187 // end comparison at this point 4188 orl(result, 63); 4189 movl(len, 63); 4190 // Fallthru to tail compare 4191 } else { 4192 4193 if (UseAVX >= 2 && UseSSE >= 2) { 4194 // With AVX2, use 32-byte vector compare 4195 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4196 4197 // Compare 32-byte vectors 4198 testl(len, 0xffffffe0); // vector count (in bytes) 4199 jccb(Assembler::zero, TAIL_START); 4200 4201 andl(len, 0xffffffe0); 4202 lea(ary1, Address(ary1, len, Address::times_1)); 4203 negptr(len); 4204 4205 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4206 movdl(vec2, tmp1); 4207 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4208 4209 bind(COMPARE_WIDE_VECTORS); 4210 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4211 vptest(vec1, vec2); 4212 jccb(Assembler::notZero, BREAK_LOOP); 4213 addptr(len, 32); 4214 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4215 4216 testl(result, 0x0000001f); // any bytes remaining? 4217 jcc(Assembler::zero, DONE); 4218 4219 // Quick test using the already prepared vector mask 4220 movl(len, result); 4221 andl(len, 0x0000001f); 4222 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4223 vptest(vec1, vec2); 4224 jcc(Assembler::zero, DONE); 4225 // There are zeros, jump to the tail to determine exactly where 4226 jmpb(TAIL_START); 4227 4228 bind(BREAK_LOOP); 4229 // At least one byte in the last 32-byte vector is negative. 
4230 // Set up to look at the last 32 bytes as if they were a tail 4231 lea(ary1, Address(ary1, len, Address::times_1)); 4232 addptr(result, len); 4233 // Ignore the very last byte: if all others are positive, 4234 // it must be negative, so we can skip right to the 2+1 byte 4235 // end comparison at this point 4236 orl(result, 31); 4237 movl(len, 31); 4238 // Fallthru to tail compare 4239 } else if (UseSSE42Intrinsics) { 4240 // With SSE4.2, use double quad vector compare 4241 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4242 4243 // Compare 16-byte vectors 4244 testl(len, 0xfffffff0); // vector count (in bytes) 4245 jcc(Assembler::zero, TAIL_START); 4246 4247 andl(len, 0xfffffff0); 4248 lea(ary1, Address(ary1, len, Address::times_1)); 4249 negptr(len); 4250 4251 movl(tmp1, 0x80808080); 4252 movdl(vec2, tmp1); 4253 pshufd(vec2, vec2, 0); 4254 4255 bind(COMPARE_WIDE_VECTORS); 4256 movdqu(vec1, Address(ary1, len, Address::times_1)); 4257 ptest(vec1, vec2); 4258 jccb(Assembler::notZero, BREAK_LOOP); 4259 addptr(len, 16); 4260 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4261 4262 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4263 jcc(Assembler::zero, DONE); 4264 4265 // Quick test using the already prepared vector mask 4266 movl(len, result); 4267 andl(len, 0x0000000f); // tail count (in bytes) 4268 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4269 ptest(vec1, vec2); 4270 jcc(Assembler::zero, DONE); 4271 jmpb(TAIL_START); 4272 4273 bind(BREAK_LOOP); 4274 // At least one byte in the last 16-byte vector is negative. 4275 // Set up and look at the last 16 bytes as if they were a tail 4276 lea(ary1, Address(ary1, len, Address::times_1)); 4277 addptr(result, len); 4278 // Ignore the very last byte: if all others are positive, 4279 // it must be negative, so we can skip right to the 2+1 byte 4280 // end comparison at this point 4281 orl(result, 15); 4282 movl(len, 15); 4283 // Fallthru to tail compare 4284 } 4285 } 4286 4287 bind(TAIL_START); 4288 // Compare 4-byte vectors 4289 andl(len, 0xfffffffc); // vector count (in bytes) 4290 jccb(Assembler::zero, COMPARE_CHAR); 4291 4292 lea(ary1, Address(ary1, len, Address::times_1)); 4293 negptr(len); 4294 4295 bind(COMPARE_VECTORS); 4296 movl(tmp1, Address(ary1, len, Address::times_1)); 4297 andl(tmp1, 0x80808080); 4298 jccb(Assembler::notZero, TAIL_ADJUST); 4299 addptr(len, 4); 4300 jccb(Assembler::notZero, COMPARE_VECTORS); 4301 4302 // Compare trailing char (final 2-3 bytes), if any 4303 bind(COMPARE_CHAR); 4304 4305 testl(result, 0x2); // tail char 4306 jccb(Assembler::zero, COMPARE_BYTE); 4307 load_unsigned_short(tmp1, Address(ary1, 0)); 4308 andl(tmp1, 0x00008080); 4309 jccb(Assembler::notZero, CHAR_ADJUST); 4310 lea(ary1, Address(ary1, 2)); 4311 4312 bind(COMPARE_BYTE); 4313 testl(result, 0x1); // tail byte 4314 jccb(Assembler::zero, DONE); 4315 load_unsigned_byte(tmp1, Address(ary1, 0)); 4316 testl(tmp1, 0x00000080); 4317 jccb(Assembler::zero, DONE); 4318 subptr(result, 1); 4319 jmpb(DONE); 4320 4321 bind(TAIL_ADJUST); 4322 // there are negative bits in the last 4 byte block. 4323 // Adjust result and check the next three bytes 4324 addptr(result, len); 4325 orl(result, 3); 4326 lea(ary1, Address(ary1, len, Address::times_1)); 4327 jmpb(COMPARE_CHAR); 4328 4329 bind(CHAR_ADJUST); 4330 // We are looking at a char + optional byte tail, and found that one 4331 // of the bytes in the char is negative. Adjust the result, check the 4332 // first byte and readjust if needed. 
4333 andl(result, 0xfffffffc); 4334 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4335 jccb(Assembler::notZero, DONE); 4336 addptr(result, 1); 4337 4338 // That's it 4339 bind(DONE); 4340 if (UseAVX >= 2 && UseSSE >= 2) { 4341 // clean upper bits of YMM registers 4342 vpxor(vec1, vec1); 4343 vpxor(vec2, vec2); 4344 } 4345 } 4346 4347 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4348 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4349 Register limit, Register result, Register chr, 4350 XMMRegister vec1, XMMRegister vec2, bool is_char, 4351 KRegister mask, bool expand_ary2) { 4352 // for expand_ary2, limit is the (smaller) size of the second array. 4353 ShortBranchVerifier sbv(this); 4354 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4355 4356 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4357 "Expansion only implemented for AVX2"); 4358 4359 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4360 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4361 4362 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4363 int scaleIncr = expand_ary2 ? 8 : 16; 4364 4365 if (is_array_equ) { 4366 // Check the input args 4367 cmpoop(ary1, ary2); 4368 jcc(Assembler::equal, TRUE_LABEL); 4369 4370 // Need additional checks for arrays_equals. 4371 testptr(ary1, ary1); 4372 jcc(Assembler::zero, FALSE_LABEL); 4373 testptr(ary2, ary2); 4374 jcc(Assembler::zero, FALSE_LABEL); 4375 4376 // Check the lengths 4377 movl(limit, Address(ary1, length_offset)); 4378 cmpl(limit, Address(ary2, length_offset)); 4379 jcc(Assembler::notEqual, FALSE_LABEL); 4380 } 4381 4382 // count == 0 4383 testl(limit, limit); 4384 jcc(Assembler::zero, TRUE_LABEL); 4385 4386 if (is_array_equ) { 4387 // Load array address 4388 lea(ary1, Address(ary1, base_offset)); 4389 lea(ary2, Address(ary2, base_offset)); 4390 } 4391 4392 if (is_array_equ && is_char) { 4393 // arrays_equals when used for char[]. 
shll(limit, 1); // byte count != 0 4395 } 4396 movl(result, limit); // copy 4397 4398 if (UseAVX >= 2) { 4399 // With AVX2, use 32-byte vector compare 4400 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4401 4402 // Compare 32-byte vectors 4403 if (expand_ary2) { 4404 andl(result, 0x0000000f); // tail count (in bytes) 4405 andl(limit, 0xfffffff0); // vector count (in bytes) 4406 jcc(Assembler::zero, COMPARE_TAIL); 4407 } else { 4408 andl(result, 0x0000001f); // tail count (in bytes) 4409 andl(limit, 0xffffffe0); // vector count (in bytes) 4410 jcc(Assembler::zero, COMPARE_TAIL_16); 4411 } 4412 4413 lea(ary1, Address(ary1, limit, scaleFactor)); 4414 lea(ary2, Address(ary2, limit, Address::times_1)); 4415 negptr(limit); 4416 4417 #ifdef _LP64 4418 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4419 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4420 4421 cmpl(limit, -64); 4422 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4423 4424 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4425 4426 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4427 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4428 kortestql(mask, mask); 4429 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4430 addptr(limit, 64); // update since we already compared at this addr 4431 cmpl(limit, -64); 4432 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4433 4434 // At this point we may still need to compare -limit+result bytes. 4435 // We could execute the next two instructions and just continue via non-wide path: 4436 // cmpl(limit, 0); 4437 // jcc(Assembler::equal, COMPARE_TAIL); // true 4438 // But since we stopped at the points ary{1,2}+limit which are 4439 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4440 // (|limit| <= 32 and result < 32), 4441 // we may just compare the last 64 bytes.
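// For example, with 100-byte arrays in the plain (non-expanded) mode:
// result == 4 (tail), limit starts at -96, and the 64-byte loop above
// covers bytes [0, 64) before exiting with limit == -32. Setting result to
// 4 - 64 == -60 below makes the final 64-byte compare cover bytes
// [36, 100), i.e. exactly the last 64 bytes, re-checking some
// already-compared bytes, which is harmless.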
4442 // 4443 addptr(result, -64); // it is safe, bc we just came from this area 4444 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4445 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4446 kortestql(mask, mask); 4447 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4448 4449 jmp(TRUE_LABEL); 4450 4451 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4452 4453 }//if (VM_Version::supports_avx512vlbw()) 4454 #endif //_LP64 4455 bind(COMPARE_WIDE_VECTORS); 4456 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4457 if (expand_ary2) { 4458 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4459 } else { 4460 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4461 } 4462 vpxor(vec1, vec2); 4463 4464 vptest(vec1, vec1); 4465 jcc(Assembler::notZero, FALSE_LABEL); 4466 addptr(limit, scaleIncr * 2); 4467 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4468 4469 testl(result, result); 4470 jcc(Assembler::zero, TRUE_LABEL); 4471 4472 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4473 if (expand_ary2) { 4474 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4475 } else { 4476 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4477 } 4478 vpxor(vec1, vec2); 4479 4480 vptest(vec1, vec1); 4481 jcc(Assembler::notZero, FALSE_LABEL); 4482 jmp(TRUE_LABEL); 4483 4484 bind(COMPARE_TAIL_16); // limit is zero 4485 movl(limit, result); 4486 4487 // Compare 16-byte chunks 4488 andl(result, 0x0000000f); // tail count (in bytes) 4489 andl(limit, 0xfffffff0); // vector count (in bytes) 4490 jcc(Assembler::zero, COMPARE_TAIL); 4491 4492 lea(ary1, Address(ary1, limit, scaleFactor)); 4493 lea(ary2, Address(ary2, limit, Address::times_1)); 4494 negptr(limit); 4495 4496 bind(COMPARE_WIDE_VECTORS_16); 4497 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4498 if (expand_ary2) { 4499 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4500 } else { 4501 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4502 } 4503 pxor(vec1, vec2); 4504 4505 ptest(vec1, vec1); 4506 jcc(Assembler::notZero, FALSE_LABEL); 4507 addptr(limit, scaleIncr); 4508 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4509 4510 bind(COMPARE_TAIL); // limit is zero 4511 movl(limit, result); 4512 // Fallthru to tail compare 4513 } else if (UseSSE42Intrinsics) { 4514 // With SSE4.2, use double quad vector compare 4515 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4516 4517 // Compare 16-byte vectors 4518 andl(result, 0x0000000f); // tail count (in bytes) 4519 andl(limit, 0xfffffff0); // vector count (in bytes) 4520 jcc(Assembler::zero, COMPARE_TAIL); 4521 4522 lea(ary1, Address(ary1, limit, Address::times_1)); 4523 lea(ary2, Address(ary2, limit, Address::times_1)); 4524 negptr(limit); 4525 4526 bind(COMPARE_WIDE_VECTORS); 4527 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4528 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4529 pxor(vec1, vec2); 4530 4531 ptest(vec1, vec1); 4532 jcc(Assembler::notZero, FALSE_LABEL); 4533 addptr(limit, 16); 4534 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4535 4536 testl(result, result); 4537 jcc(Assembler::zero, TRUE_LABEL); 4538 4539 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4540 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4541 pxor(vec1, vec2); 4542 4543 ptest(vec1, vec1); 4544 jccb(Assembler::notZero, FALSE_LABEL); 4545 jmpb(TRUE_LABEL); 4546 4547 bind(COMPARE_TAIL); // limit is zero 4548 
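// COMPARE_TAIL reloads limit with the tail byte count (result). The scalar
// tail below compares 4 bytes at a time, except in the expand_ary2 mode,
// where ary1 holds 16-bit chars and ary2 holds bytes, so elements are
// compared one at a time after zero-extending the byte. Illustrative
// scalar form of that mode (not emitted code; a1/a2/len name the logical
// inputs, len being the byte-array length):
//
//   bool equals_expanded(const jchar* a1, const jbyte* a2, int len) {
//     for (int i = 0; i < len; i++) {
//       if (a1[i] != (jchar)(a2[i] & 0xff)) return false;
//     }
//     return true;
//   }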
movl(limit, result); 4549 // Fallthru to tail compare 4550 } 4551 4552 // Compare 4-byte vectors 4553 if (expand_ary2) { 4554 testl(result, result); 4555 jccb(Assembler::zero, TRUE_LABEL); 4556 } else { 4557 andl(limit, 0xfffffffc); // vector count (in bytes) 4558 jccb(Assembler::zero, COMPARE_CHAR); 4559 } 4560 4561 lea(ary1, Address(ary1, limit, scaleFactor)); 4562 lea(ary2, Address(ary2, limit, Address::times_1)); 4563 negptr(limit); 4564 4565 bind(COMPARE_VECTORS); 4566 if (expand_ary2) { 4567 // There are no "vector" operations for bytes to shorts 4568 movzbl(chr, Address(ary2, limit, Address::times_1)); 4569 cmpw(Address(ary1, limit, Address::times_2), chr); 4570 jccb(Assembler::notEqual, FALSE_LABEL); 4571 addptr(limit, 1); 4572 jcc(Assembler::notZero, COMPARE_VECTORS); 4573 jmp(TRUE_LABEL); 4574 } else { 4575 movl(chr, Address(ary1, limit, Address::times_1)); 4576 cmpl(chr, Address(ary2, limit, Address::times_1)); 4577 jccb(Assembler::notEqual, FALSE_LABEL); 4578 addptr(limit, 4); 4579 jcc(Assembler::notZero, COMPARE_VECTORS); 4580 } 4581 4582 // Compare trailing char (final 2 bytes), if any 4583 bind(COMPARE_CHAR); 4584 testl(result, 0x2); // tail char 4585 jccb(Assembler::zero, COMPARE_BYTE); 4586 load_unsigned_short(chr, Address(ary1, 0)); 4587 load_unsigned_short(limit, Address(ary2, 0)); 4588 cmpl(chr, limit); 4589 jccb(Assembler::notEqual, FALSE_LABEL); 4590 4591 if (is_array_equ && is_char) { 4592 bind(COMPARE_BYTE); 4593 } else { 4594 lea(ary1, Address(ary1, 2)); 4595 lea(ary2, Address(ary2, 2)); 4596 4597 bind(COMPARE_BYTE); 4598 testl(result, 0x1); // tail byte 4599 jccb(Assembler::zero, TRUE_LABEL); 4600 load_unsigned_byte(chr, Address(ary1, 0)); 4601 load_unsigned_byte(limit, Address(ary2, 0)); 4602 cmpl(chr, limit); 4603 jccb(Assembler::notEqual, FALSE_LABEL); 4604 } 4605 bind(TRUE_LABEL); 4606 movl(result, 1); // return true 4607 jmpb(DONE); 4608 4609 bind(FALSE_LABEL); 4610 xorl(result, result); // return false 4611 4612 // That's it 4613 bind(DONE); 4614 if (UseAVX >= 2) { 4615 // clean upper bits of YMM registers 4616 vpxor(vec1, vec1); 4617 vpxor(vec2, vec2); 4618 } 4619 } 4620 4621 #ifdef _LP64 4622 4623 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4624 #define __ masm. 
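// cvttss2si/cvttsd2si produce the "integer indefinite" value (0x80000000,
// or 0x8000000000000000 for 64-bit results) when the input is NaN or out
// of range, so convertF2I below only branches to this stub after seeing
// that sentinel; the stub then calls the matching StubRoutines fixup to get
// Java semantics. Illustrative shape of the combined fast/slow path (not
// emitted code; f2i_fixup stands for the StubRoutines::x86::f2i_fixup()
// routine):
//
//   int f2i(float f) {
//     int r = (int)f;           // cvttss2sil; NaN/overflow -> 0x80000000
//     if (r == 0x80000000) {    // rare: take the out-of-line slow path
//       r = f2i_fixup(f);       // applies Java's saturating/NaN rules
//     }
//     return r;
//   }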
4625 Register dst = stub.data<0>(); 4626 XMMRegister src = stub.data<1>(); 4627 address target = stub.data<2>(); 4628 __ bind(stub.entry()); 4629 __ subptr(rsp, 8); 4630 __ movdbl(Address(rsp), src); 4631 __ call(RuntimeAddress(target)); 4632 __ pop(dst); 4633 __ jmp(stub.continuation()); 4634 #undef __ 4635 } 4636 4637 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4638 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4639 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4640 4641 address slowpath_target; 4642 if (dst_bt == T_INT) { 4643 if (src_bt == T_FLOAT) { 4644 cvttss2sil(dst, src); 4645 cmpl(dst, 0x80000000); 4646 slowpath_target = StubRoutines::x86::f2i_fixup(); 4647 } else { 4648 cvttsd2sil(dst, src); 4649 cmpl(dst, 0x80000000); 4650 slowpath_target = StubRoutines::x86::d2i_fixup(); 4651 } 4652 } else { 4653 if (src_bt == T_FLOAT) { 4654 cvttss2siq(dst, src); 4655 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4656 slowpath_target = StubRoutines::x86::f2l_fixup(); 4657 } else { 4658 cvttsd2siq(dst, src); 4659 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4660 slowpath_target = StubRoutines::x86::d2l_fixup(); 4661 } 4662 } 4663 4664 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4665 jcc(Assembler::equal, stub->entry()); 4666 bind(stub->continuation()); 4667 } 4668 4669 #endif // _LP64 4670 4671 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4672 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4673 switch(ideal_opc) { 4674 case Op_LShiftVS: 4675 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4676 case Op_LShiftVI: 4677 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4678 case Op_LShiftVL: 4679 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4680 case Op_RShiftVS: 4681 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4682 case Op_RShiftVI: 4683 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4684 case Op_RShiftVL: 4685 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4686 case Op_URShiftVS: 4687 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4688 case Op_URShiftVI: 4689 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4690 case Op_URShiftVL: 4691 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4692 case Op_RotateRightV: 4693 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4694 case Op_RotateLeftV: 4695 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4696 default: 4697 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4698 break; 4699 } 4700 } 4701 4702 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4703 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4704 if (is_unsigned) { 4705 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4706 } else { 4707 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4708 } 4709 } 4710 4711 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4712 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4713 switch (elem_bt) { 4714 case T_BYTE: 4715 if (ideal_opc 
== Op_SaturatingAddV) { 4716 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4717 } else { 4718 assert(ideal_opc == Op_SaturatingSubV, ""); 4719 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4720 } 4721 break; 4722 case T_SHORT: 4723 if (ideal_opc == Op_SaturatingAddV) { 4724 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4725 } else { 4726 assert(ideal_opc == Op_SaturatingSubV, ""); 4727 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4728 } 4729 break; 4730 default: 4731 fatal("Unsupported type %s", type2name(elem_bt)); 4732 break; 4733 } 4734 } 4735 4736 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4737 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4738 switch (elem_bt) { 4739 case T_BYTE: 4740 if (ideal_opc == Op_SaturatingAddV) { 4741 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4742 } else { 4743 assert(ideal_opc == Op_SaturatingSubV, ""); 4744 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4745 } 4746 break; 4747 case T_SHORT: 4748 if (ideal_opc == Op_SaturatingAddV) { 4749 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4750 } else { 4751 assert(ideal_opc == Op_SaturatingSubV, ""); 4752 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4753 } 4754 break; 4755 default: 4756 fatal("Unsupported type %s", type2name(elem_bt)); 4757 break; 4758 } 4759 } 4760 4761 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4762 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4763 if (is_unsigned) { 4764 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4765 } else { 4766 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4767 } 4768 } 4769 4770 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4771 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4772 switch (elem_bt) { 4773 case T_BYTE: 4774 if (ideal_opc == Op_SaturatingAddV) { 4775 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4776 } else { 4777 assert(ideal_opc == Op_SaturatingSubV, ""); 4778 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4779 } 4780 break; 4781 case T_SHORT: 4782 if (ideal_opc == Op_SaturatingAddV) { 4783 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4784 } else { 4785 assert(ideal_opc == Op_SaturatingSubV, ""); 4786 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4787 } 4788 break; 4789 default: 4790 fatal("Unsupported type %s", type2name(elem_bt)); 4791 break; 4792 } 4793 } 4794 4795 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4796 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4797 switch (elem_bt) { 4798 case T_BYTE: 4799 if (ideal_opc == Op_SaturatingAddV) { 4800 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4801 } else { 4802 assert(ideal_opc == Op_SaturatingSubV, ""); 4803 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4804 } 4805 break; 4806 case T_SHORT: 4807 if (ideal_opc == Op_SaturatingAddV) { 4808 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4809 } else { 4810 assert(ideal_opc == Op_SaturatingSubV, ""); 4811 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4812 } 4813 break; 4814 default: 4815 fatal("Unsupported type %s", type2name(elem_bt)); 4816 break; 4817 } 4818 } 4819 4820 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4821 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4822 bool is_varshift) { 4823 switch (ideal_opc) { 4824 case Op_AddVB: 4825 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_AddVS: 4827 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_AddVI: 4829 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4830 case Op_AddVL: 4831 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4832 case Op_AddVF: 4833 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_AddVD: 4835 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4836 case Op_SubVB: 4837 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_SubVS: 4839 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_SubVI: 4841 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_SubVL: 4843 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_SubVF: 4845 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_SubVD: 4847 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_MulVS: 4849 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_MulVI: 4851 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_MulVL: 4853 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_MulVF: 4855 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_MulVD: 4857 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_DivVF: 4859 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_DivVD: 4861 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_SqrtVF: 4863 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_SqrtVD: 4865 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_AbsVB: 4867 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4868 case Op_AbsVS: 4869 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4870 case Op_AbsVI: 4871 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4872 case Op_AbsVL: 4873 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4874 case Op_FmaVF: 4875 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_FmaVD: 4877 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_VectorRearrange: 4879 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4880 case Op_LShiftVS: 4881 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4882 case Op_LShiftVI: 4883 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4884 case Op_LShiftVL: 4885 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4886 case Op_RShiftVS: 4887 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4888 case Op_RShiftVI: 4889 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4890 case Op_RShiftVL: 4891 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4892 case Op_URShiftVS: 4893 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4894 case Op_URShiftVI: 4895 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4896 case Op_URShiftVL: 4897 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4898 case Op_RotateLeftV: 4899 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4900 case Op_RotateRightV: 4901 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4902 case Op_MaxV: 4903 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4904 case Op_MinV: 4905 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4906 case Op_UMinV: 4907 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4908 case Op_UMaxV: 4909 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4910 case Op_XorV: 4911 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_OrV: 4913 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_AndV: 4915 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4916 default: 4917 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4918 break; 4919 } 4920 } 4921 4922 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4923 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4924 switch (ideal_opc) { 4925 case Op_AddVB: 4926 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4927 case Op_AddVS: 4928 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4929 case Op_AddVI: 4930 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4931 case Op_AddVL: 4932 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4933 case Op_AddVF: 4934 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4935 case Op_AddVD: 4936 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4937 case Op_SubVB: 4938 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4939 case Op_SubVS: 4940 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4941 case Op_SubVI: 4942 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4943 case Op_SubVL: 4944 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4945 case Op_SubVF: 4946 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4947 case Op_SubVD: 4948 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4949 case Op_MulVS: 4950 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4951 case Op_MulVI: 4952 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_MulVL: 4954 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_MulVF: 4956 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4957 case Op_MulVD: 4958 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_DivVF: 4960 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_DivVD: 4962 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_FmaVF: 4964 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_FmaVD: 4966 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_MaxV: 4968 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4969 case Op_MinV: 4970 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4971 case Op_UMaxV: 4972 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4973 case Op_UMinV: 4974 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4975 case Op_XorV: 4976 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4977 case Op_OrV: 4978 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4979 case Op_AndV: 4980 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4981 default: 4982 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4983 break; 4984 } 4985 } 4986 4987 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4988 KRegister src1, KRegister src2) { 4989 BasicType etype = T_ILLEGAL; 4990 switch(mask_len) { 4991 case 2: 4992 case 4: 4993 case 8: etype = T_BYTE; 
break; 4994 case 16: etype = T_SHORT; break; 4995 case 32: etype = T_INT; break; 4996 case 64: etype = T_LONG; break; 4997 default: fatal("Unsupported type"); break; 4998 } 4999 assert(etype != T_ILLEGAL, ""); 5000 switch(ideal_opc) { 5001 case Op_AndVMask: 5002 kand(etype, dst, src1, src2); break; 5003 case Op_OrVMask: 5004 kor(etype, dst, src1, src2); break; 5005 case Op_XorVMask: 5006 kxor(etype, dst, src1, src2); break; 5007 default: 5008 fatal("Unsupported masked operation"); break; 5009 } 5010 } 5011 5012 /* 5013 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5014 * If src is NaN, the result is 0. 5015 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5016 * the result is equal to the value of Integer.MIN_VALUE. 5017 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5018 * the result is equal to the value of Integer.MAX_VALUE. 5019 */ 5020 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5021 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5022 Register rscratch, AddressLiteral float_sign_flip, 5023 int vec_enc) { 5024 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5025 Label done; 5026 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5027 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5028 vptest(xtmp2, xtmp2, vec_enc); 5029 jccb(Assembler::equal, done); 5030 5031 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5032 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5033 5034 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5035 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5036 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5037 5038 // Recompute the mask for remaining special value. 5039 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5040 // Extract SRC values corresponding to TRUE mask lanes. 5041 vpand(xtmp4, xtmp2, src, vec_enc); 5042 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5043 // values are set. 
5044 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5045 5046 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5047 bind(done); 5048 } 5049 5050 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5051 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5052 Register rscratch, AddressLiteral float_sign_flip, 5053 int vec_enc) { 5054 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5055 Label done; 5056 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5057 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5058 kortestwl(ktmp1, ktmp1); 5059 jccb(Assembler::equal, done); 5060 5061 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5062 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5063 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5064 5065 kxorwl(ktmp1, ktmp1, ktmp2); 5066 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5067 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5068 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5069 bind(done); 5070 } 5071 5072 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5073 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5074 Register rscratch, AddressLiteral double_sign_flip, 5075 int vec_enc) { 5076 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5077 5078 Label done; 5079 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5080 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5081 kortestwl(ktmp1, ktmp1); 5082 jccb(Assembler::equal, done); 5083 5084 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5085 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5086 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5087 5088 kxorwl(ktmp1, ktmp1, ktmp2); 5089 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5090 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5091 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5092 bind(done); 5093 } 5094 5095 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5096 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5097 Register rscratch, AddressLiteral float_sign_flip, 5098 int vec_enc) { 5099 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5100 Label done; 5101 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5102 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5103 kortestwl(ktmp1, ktmp1); 5104 jccb(Assembler::equal, done); 5105 5106 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5107 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5108 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5109 5110 kxorwl(ktmp1, ktmp1, ktmp2); 5111 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5112 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5113 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5114 bind(done); 5115 } 5116 5117 /* 5118 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5119 * If src is NaN, the result is 0. 5120 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5121 * the result is equal to the value of Long.MIN_VALUE. 5122 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5123 * the result is equal to the value of Long.MAX_VALUE. 
5124 */ 5125 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5126 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5127 Register rscratch, AddressLiteral double_sign_flip, 5128 int vec_enc) { 5129 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5130 5131 Label done; 5132 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5133 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5134 kortestwl(ktmp1, ktmp1); 5135 jccb(Assembler::equal, done); 5136 5137 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5138 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5139 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5140 5141 kxorwl(ktmp1, ktmp1, ktmp2); 5142 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5143 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5144 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5145 bind(done); 5146 } 5147 5148 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5149 XMMRegister xtmp, int index, int vec_enc) { 5150 assert(vec_enc < Assembler::AVX_512bit, ""); 5151 if (vec_enc == Assembler::AVX_256bit) { 5152 vextractf128_high(xtmp, src); 5153 vshufps(dst, src, xtmp, index, vec_enc); 5154 } else { 5155 vshufps(dst, src, zero, index, vec_enc); 5156 } 5157 } 5158 5159 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5160 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5161 AddressLiteral float_sign_flip, int src_vec_enc) { 5162 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5163 5164 Label done; 5165 // Compare the destination lanes with float_sign_flip 5166 // value to get mask for all special values. 5167 movdqu(xtmp1, float_sign_flip, rscratch); 5168 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5169 ptest(xtmp2, xtmp2); 5170 jccb(Assembler::equal, done); 5171 5172 // Flip float_sign_flip to get max integer value. 5173 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5174 pxor(xtmp1, xtmp4); 5175 5176 // Set destination lanes corresponding to unordered source lanes as zero. 5177 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5178 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5179 5180 // Shuffle mask vector and pack lower double words from each quadword lane. 5181 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5182 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5183 5184 // Recompute the mask for remaining special value. 5185 pxor(xtmp2, xtmp3); 5186 // Extract mask corresponding to non-negative source lanes. 5187 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5188 5189 // Shuffle mask vector and pack lower double words from each quadword lane. 5190 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5191 pand(xtmp3, xtmp2); 5192 5193 // Replace destination lanes holding special value (0x80000000) with max int 5194 // if corresponding source lane holds a +ve value.
5195 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5196 bind(done); 5197 } 5198 5199 5200 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5201 XMMRegister xtmp, Register rscratch, int vec_enc) { 5202 switch(to_elem_bt) { 5203 case T_SHORT: 5204 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5205 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5206 vpackusdw(dst, dst, zero, vec_enc); 5207 if (vec_enc == Assembler::AVX_256bit) { 5208 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5209 } 5210 break; 5211 case T_BYTE: 5212 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5213 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5214 vpackusdw(dst, dst, zero, vec_enc); 5215 if (vec_enc == Assembler::AVX_256bit) { 5216 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5217 } 5218 vpackuswb(dst, dst, zero, vec_enc); 5219 break; 5220 default: assert(false, "%s", type2name(to_elem_bt)); 5221 } 5222 } 5223 5224 /* 5225 * Algorithm for vector D2L and F2I conversions:- 5226 * a) Perform vector D2L/F2I cast. 5227 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5228 * It signifies that source value could be any of the special floating point 5229 * values(NaN,-Inf,Inf,Max,-Min). 5230 * c) Set destination to zero if source is NaN value. 5231 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5232 */ 5233 5234 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5235 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5236 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5237 int to_elem_sz = type2aelembytes(to_elem_bt); 5238 assert(to_elem_sz <= 4, ""); 5239 vcvttps2dq(dst, src, vec_enc); 5240 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5241 if (to_elem_sz < 4) { 5242 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5243 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5244 } 5245 } 5246 5247 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5248 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5249 Register rscratch, int vec_enc) { 5250 int to_elem_sz = type2aelembytes(to_elem_bt); 5251 assert(to_elem_sz <= 4, ""); 5252 vcvttps2dq(dst, src, vec_enc); 5253 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5254 switch(to_elem_bt) { 5255 case T_INT: 5256 break; 5257 case T_SHORT: 5258 evpmovdw(dst, dst, vec_enc); 5259 break; 5260 case T_BYTE: 5261 evpmovdb(dst, dst, vec_enc); 5262 break; 5263 default: assert(false, "%s", type2name(to_elem_bt)); 5264 } 5265 } 5266 5267 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5268 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5269 Register rscratch, int vec_enc) { 5270 evcvttps2qq(dst, src, vec_enc); 5271 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5272 } 5273 5274 // Handling for downcasting from double to integer or sub-word types on AVX2. 5275 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5276 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5277 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5278 int to_elem_sz = type2aelembytes(to_elem_bt); 5279 assert(to_elem_sz < 8, ""); 5280 vcvttpd2dq(dst, src, vec_enc); 5281 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5282 float_sign_flip, vec_enc); 5283 if (to_elem_sz < 4) { 5284 // xtmp4 holds all zero lanes. 5285 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5286 } 5287 } 5288 5289 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5290 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5291 KRegister ktmp2, AddressLiteral sign_flip, 5292 Register rscratch, int vec_enc) { 5293 if (VM_Version::supports_avx512dq()) { 5294 evcvttpd2qq(dst, src, vec_enc); 5295 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5296 switch(to_elem_bt) { 5297 case T_LONG: 5298 break; 5299 case T_INT: 5300 evpmovsqd(dst, dst, vec_enc); 5301 break; 5302 case T_SHORT: 5303 evpmovsqd(dst, dst, vec_enc); 5304 evpmovdw(dst, dst, vec_enc); 5305 break; 5306 case T_BYTE: 5307 evpmovsqd(dst, dst, vec_enc); 5308 evpmovdb(dst, dst, vec_enc); 5309 break; 5310 default: assert(false, "%s", type2name(to_elem_bt)); 5311 } 5312 } else { 5313 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5314 vcvttpd2dq(dst, src, vec_enc); 5315 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5316 switch(to_elem_bt) { 5317 case T_INT: 5318 break; 5319 case T_SHORT: 5320 evpmovdw(dst, dst, vec_enc); 5321 break; 5322 case T_BYTE: 5323 evpmovdb(dst, dst, vec_enc); 5324 break; 5325 default: assert(false, "%s", type2name(to_elem_bt)); 5326 } 5327 } 5328 } 5329 5330 #ifdef _LP64 5331 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5332 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5333 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5334 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5335 // and re-instantiate original MXCSR.RC mode after that. 5336 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5337 5338 mov64(tmp, julong_cast(0.5L)); 5339 evpbroadcastq(xtmp1, tmp, vec_enc); 5340 vaddpd(xtmp1, src , xtmp1, vec_enc); 5341 evcvtpd2qq(dst, xtmp1, vec_enc); 5342 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5343 double_sign_flip, vec_enc);; 5344 5345 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5346 } 5347 5348 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5349 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5350 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5351 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5352 // and re-instantiate original MXCSR.RC mode after that. 
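// The sequence below follows the scalar rounding rule floor(v + 0.5f) (Math.round
// semantics) by adding 0.5f and converting while MXCSR.RC is temporarily switched to
// round-towards-negative-infinity. A minimal scalar sketch of the same idea, assuming
// <cfenv>/<cmath> are available (illustrative only, not part of this file; the
// hypothetical helper ignores the overflow/NaN lanes that the special-cases routine
// repairs afterwards):
//
//   #include <cfenv>
//   #include <cmath>
//   static inline int round_float_sketch(float v) {
//     int old_mode = fegetround();
//     fesetround(FE_DOWNWARD);              // emulate MXCSR.RC = round towards -inf
//     int r = (int)std::rint(v + 0.5f);     // floor(v + 0.5f)
//     fesetround(old_mode);                 // restore the original rounding mode
//     return r;                             // e.g. 2.5f -> 3, -2.5f -> -2
//   }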
5353 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5354 5355 movl(tmp, jint_cast(0.5)); 5356 movq(xtmp1, tmp); 5357 vbroadcastss(xtmp1, xtmp1, vec_enc); 5358 vaddps(xtmp1, src , xtmp1, vec_enc); 5359 vcvtps2dq(dst, xtmp1, vec_enc); 5360 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5361 float_sign_flip, vec_enc); 5362 5363 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5364 } 5365 5366 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5367 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5368 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5369 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5370 // and re-instantiate original MXCSR.RC mode after that. 5371 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5372 5373 movl(tmp, jint_cast(0.5)); 5374 movq(xtmp1, tmp); 5375 vbroadcastss(xtmp1, xtmp1, vec_enc); 5376 vaddps(xtmp1, src , xtmp1, vec_enc); 5377 vcvtps2dq(dst, xtmp1, vec_enc); 5378 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5379 5380 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5381 } 5382 #endif // _LP64 5383 5384 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5385 BasicType from_elem_bt, BasicType to_elem_bt) { 5386 switch (from_elem_bt) { 5387 case T_BYTE: 5388 switch (to_elem_bt) { 5389 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5390 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5391 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5392 default: ShouldNotReachHere(); 5393 } 5394 break; 5395 case T_SHORT: 5396 switch (to_elem_bt) { 5397 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5398 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5399 default: ShouldNotReachHere(); 5400 } 5401 break; 5402 case T_INT: 5403 assert(to_elem_bt == T_LONG, ""); 5404 vpmovzxdq(dst, src, vlen_enc); 5405 break; 5406 default: 5407 ShouldNotReachHere(); 5408 } 5409 } 5410 5411 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5412 BasicType from_elem_bt, BasicType to_elem_bt) { 5413 switch (from_elem_bt) { 5414 case T_BYTE: 5415 switch (to_elem_bt) { 5416 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5417 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5418 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5419 default: ShouldNotReachHere(); 5420 } 5421 break; 5422 case T_SHORT: 5423 switch (to_elem_bt) { 5424 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5425 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5426 default: ShouldNotReachHere(); 5427 } 5428 break; 5429 case T_INT: 5430 assert(to_elem_bt == T_LONG, ""); 5431 vpmovsxdq(dst, src, vlen_enc); 5432 break; 5433 default: 5434 ShouldNotReachHere(); 5435 } 5436 } 5437 5438 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5439 BasicType dst_bt, BasicType src_bt, int vlen) { 5440 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5441 assert(vlen_enc != AVX_512bit, ""); 5442 5443 int dst_bt_size = type2aelembytes(dst_bt); 5444 int src_bt_size = type2aelembytes(src_bt); 5445 if (dst_bt_size > src_bt_size) { 5446 switch (dst_bt_size / src_bt_size) { 5447 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5448 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5449 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5450 default: ShouldNotReachHere(); 5451 } 5452 } else { 5453 assert(dst_bt_size < src_bt_size, ""); 5454 switch (src_bt_size / dst_bt_size) { 5455 case 2: { 5456 if (vlen_enc == AVX_128bit) { 5457 vpacksswb(dst, src, src, vlen_enc); 5458 } else { 5459 vpacksswb(dst, src, src, vlen_enc); 5460 vpermq(dst, dst, 0x08, vlen_enc); 5461 } 5462 break; 5463 } 5464 case 4: { 5465 if (vlen_enc == AVX_128bit) { 5466 vpackssdw(dst, src, src, vlen_enc); 5467 vpacksswb(dst, dst, dst, vlen_enc); 5468 } else { 5469 vpackssdw(dst, src, src, vlen_enc); 5470 vpermq(dst, dst, 0x08, vlen_enc); 5471 vpacksswb(dst, dst, dst, AVX_128bit); 5472 } 5473 break; 5474 } 5475 case 8: { 5476 if (vlen_enc == AVX_128bit) { 5477 vpshufd(dst, src, 0x08, vlen_enc); 5478 vpackssdw(dst, dst, dst, vlen_enc); 5479 vpacksswb(dst, dst, dst, vlen_enc); 5480 } else { 5481 vpshufd(dst, src, 0x08, vlen_enc); 5482 vpermq(dst, dst, 0x08, vlen_enc); 5483 vpackssdw(dst, dst, dst, AVX_128bit); 5484 vpacksswb(dst, dst, dst, AVX_128bit); 5485 } 5486 break; 5487 } 5488 default: ShouldNotReachHere(); 5489 } 5490 } 5491 } 5492 5493 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5494 bool merge, BasicType bt, int vlen_enc) { 5495 if (bt == T_INT) { 5496 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5497 } else { 5498 assert(bt == T_LONG, ""); 5499 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5500 } 5501 } 5502 5503 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5504 bool merge, BasicType bt, int vlen_enc) { 5505 if (bt == T_INT) { 5506 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5507 } else { 5508 assert(bt == T_LONG, ""); 5509 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5510 } 5511 } 5512 5513 #ifdef _LP64 5514 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5515 Register rtmp2, XMMRegister xtmp, int mask_len, 5516 int vec_enc) { 5517 int index = 0; 5518 int vindex = 0; 5519 mov64(rtmp1, 0x0101010101010101L); 5520 pdepq(rtmp1, src, rtmp1); 5521 if (mask_len > 8) { 5522 movq(rtmp2, src); 5523 vpxor(xtmp, xtmp, xtmp, vec_enc); 5524 movq(xtmp, rtmp1); 5525 } 5526 movq(dst, rtmp1); 5527 5528 mask_len -= 8; 5529 while (mask_len > 0) { 5530 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5531 index++; 5532 if ((index % 2) == 0) { 5533 pxor(xtmp, xtmp); 5534 } 5535 mov64(rtmp1, 0x0101010101010101L); 5536 shrq(rtmp2, 8); 5537 pdepq(rtmp1, rtmp2, rtmp1); 5538 pinsrq(xtmp, rtmp1, index % 2); 5539 vindex = index / 2; 5540 if (vindex) { 5541 // Write entire 16 byte vector when both 64 bit 5542 // lanes are update to save redundant instructions. 
5543 if (index % 2) { 5544 vinsertf128(dst, dst, xtmp, vindex); 5545 } 5546 } else { 5547 vmovdqu(dst, xtmp); 5548 } 5549 mask_len -= 8; 5550 } 5551 } 5552 5553 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5554 switch(opc) { 5555 case Op_VectorMaskTrueCount: 5556 popcntq(dst, tmp); 5557 break; 5558 case Op_VectorMaskLastTrue: 5559 if (VM_Version::supports_lzcnt()) { 5560 lzcntq(tmp, tmp); 5561 movl(dst, 63); 5562 subl(dst, tmp); 5563 } else { 5564 movl(dst, -1); 5565 bsrq(tmp, tmp); 5566 cmov32(Assembler::notZero, dst, tmp); 5567 } 5568 break; 5569 case Op_VectorMaskFirstTrue: 5570 if (VM_Version::supports_bmi1()) { 5571 if (masklen < 32) { 5572 orl(tmp, 1 << masklen); 5573 tzcntl(dst, tmp); 5574 } else if (masklen == 32) { 5575 tzcntl(dst, tmp); 5576 } else { 5577 assert(masklen == 64, ""); 5578 tzcntq(dst, tmp); 5579 } 5580 } else { 5581 if (masklen < 32) { 5582 orl(tmp, 1 << masklen); 5583 bsfl(dst, tmp); 5584 } else { 5585 assert(masklen == 32 || masklen == 64, ""); 5586 movl(dst, masklen); 5587 if (masklen == 32) { 5588 bsfl(tmp, tmp); 5589 } else { 5590 bsfq(tmp, tmp); 5591 } 5592 cmov32(Assembler::notZero, dst, tmp); 5593 } 5594 } 5595 break; 5596 case Op_VectorMaskToLong: 5597 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5598 break; 5599 default: assert(false, "Unhandled mask operation"); 5600 } 5601 } 5602 5603 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5604 int masklen, int masksize, int vec_enc) { 5605 assert(VM_Version::supports_popcnt(), ""); 5606 5607 if(VM_Version::supports_avx512bw()) { 5608 kmovql(tmp, mask); 5609 } else { 5610 assert(masklen <= 16, ""); 5611 kmovwl(tmp, mask); 5612 } 5613 5614 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5615 // operations needs to be clipped. 5616 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5617 andq(tmp, (1 << masklen) - 1); 5618 } 5619 5620 vector_mask_operation_helper(opc, dst, tmp, masklen); 5621 } 5622 5623 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5624 Register tmp, int masklen, BasicType bt, int vec_enc) { 5625 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5626 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5627 assert(VM_Version::supports_popcnt(), ""); 5628 5629 bool need_clip = false; 5630 switch(bt) { 5631 case T_BOOLEAN: 5632 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5633 vpxor(xtmp, xtmp, xtmp, vec_enc); 5634 vpsubb(xtmp, xtmp, mask, vec_enc); 5635 vpmovmskb(tmp, xtmp, vec_enc); 5636 need_clip = masklen < 16; 5637 break; 5638 case T_BYTE: 5639 vpmovmskb(tmp, mask, vec_enc); 5640 need_clip = masklen < 16; 5641 break; 5642 case T_SHORT: 5643 vpacksswb(xtmp, mask, mask, vec_enc); 5644 if (masklen >= 16) { 5645 vpermpd(xtmp, xtmp, 8, vec_enc); 5646 } 5647 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5648 need_clip = masklen < 16; 5649 break; 5650 case T_INT: 5651 case T_FLOAT: 5652 vmovmskps(tmp, mask, vec_enc); 5653 need_clip = masklen < 4; 5654 break; 5655 case T_LONG: 5656 case T_DOUBLE: 5657 vmovmskpd(tmp, mask, vec_enc); 5658 need_clip = masklen < 2; 5659 break; 5660 default: assert(false, "Unhandled type, %s", type2name(bt)); 5661 } 5662 5663 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5664 // operations needs to be clipped. 
5665 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5666 // need_clip implies masklen < 32 5667 andq(tmp, (1 << masklen) - 1); 5668 } 5669 5670 vector_mask_operation_helper(opc, dst, tmp, masklen); 5671 } 5672 5673 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5674 Register rtmp2, int mask_len) { 5675 kmov(rtmp1, src); 5676 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5677 mov64(rtmp2, -1L); 5678 pextq(rtmp2, rtmp2, rtmp1); 5679 kmov(dst, rtmp2); 5680 } 5681 5682 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5683 XMMRegister mask, Register rtmp, Register rscratch, 5684 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5685 int vec_enc) { 5686 assert(type2aelembytes(bt) >= 4, ""); 5687 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5688 address compress_perm_table = nullptr; 5689 address expand_perm_table = nullptr; 5690 if (type2aelembytes(bt) == 8) { 5691 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5692 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5693 vmovmskpd(rtmp, mask, vec_enc); 5694 } else { 5695 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5696 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5697 vmovmskps(rtmp, mask, vec_enc); 5698 } 5699 shlq(rtmp, 5); // for 32 byte permute row. 5700 if (opcode == Op_CompressV) { 5701 lea(rscratch, ExternalAddress(compress_perm_table)); 5702 } else { 5703 lea(rscratch, ExternalAddress(expand_perm_table)); 5704 } 5705 addptr(rtmp, rscratch); 5706 vmovdqu(permv, Address(rtmp)); 5707 vpermps(dst, permv, src, Assembler::AVX_256bit); 5708 vpxor(xtmp, xtmp, xtmp, vec_enc); 5709 // Blend the result with zero vector using permute mask, each column entry 5710 // in a permute table row contains either a valid permute index or a -1 (default) 5711 // value, this can potentially be used as a blending mask after 5712 // compressing/expanding the source vector lanes. 
5713 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5714 } 5715 5716 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5717 bool merge, BasicType bt, int vec_enc) { 5718 if (opcode == Op_CompressV) { 5719 switch(bt) { 5720 case T_BYTE: 5721 evpcompressb(dst, mask, src, merge, vec_enc); 5722 break; 5723 case T_CHAR: 5724 case T_SHORT: 5725 evpcompressw(dst, mask, src, merge, vec_enc); 5726 break; 5727 case T_INT: 5728 evpcompressd(dst, mask, src, merge, vec_enc); 5729 break; 5730 case T_FLOAT: 5731 evcompressps(dst, mask, src, merge, vec_enc); 5732 break; 5733 case T_LONG: 5734 evpcompressq(dst, mask, src, merge, vec_enc); 5735 break; 5736 case T_DOUBLE: 5737 evcompresspd(dst, mask, src, merge, vec_enc); 5738 break; 5739 default: 5740 fatal("Unsupported type %s", type2name(bt)); 5741 break; 5742 } 5743 } else { 5744 assert(opcode == Op_ExpandV, ""); 5745 switch(bt) { 5746 case T_BYTE: 5747 evpexpandb(dst, mask, src, merge, vec_enc); 5748 break; 5749 case T_CHAR: 5750 case T_SHORT: 5751 evpexpandw(dst, mask, src, merge, vec_enc); 5752 break; 5753 case T_INT: 5754 evpexpandd(dst, mask, src, merge, vec_enc); 5755 break; 5756 case T_FLOAT: 5757 evexpandps(dst, mask, src, merge, vec_enc); 5758 break; 5759 case T_LONG: 5760 evpexpandq(dst, mask, src, merge, vec_enc); 5761 break; 5762 case T_DOUBLE: 5763 evexpandpd(dst, mask, src, merge, vec_enc); 5764 break; 5765 default: 5766 fatal("Unsupported type %s", type2name(bt)); 5767 break; 5768 } 5769 } 5770 } 5771 #endif 5772 5773 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5774 KRegister ktmp1, int vec_enc) { 5775 if (opcode == Op_SignumVD) { 5776 vsubpd(dst, zero, one, vec_enc); 5777 // if src < 0 ? -1 : 1 5778 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5779 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5780 // if src == NaN, -0.0 or 0.0 return src. 5781 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5782 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5783 } else { 5784 assert(opcode == Op_SignumVF, ""); 5785 vsubps(dst, zero, one, vec_enc); 5786 // if src < 0 ? -1 : 1 5787 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5788 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5789 // if src == NaN, -0.0 or 0.0 return src. 5790 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5791 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5792 } 5793 } 5794 5795 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5796 XMMRegister xtmp1, int vec_enc) { 5797 if (opcode == Op_SignumVD) { 5798 vsubpd(dst, zero, one, vec_enc); 5799 // if src < 0 ? -1 : 1 5800 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5801 // if src == NaN, -0.0 or 0.0 return src. 5802 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5803 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5804 } else { 5805 assert(opcode == Op_SignumVF, ""); 5806 vsubps(dst, zero, one, vec_enc); 5807 // if src < 0 ? -1 : 1 5808 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5809 // if src == NaN, -0.0 or 0.0 return src. 
5810 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5811 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5812 } 5813 } 5814 5815 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5816 if (VM_Version::supports_avx512bw()) { 5817 if (mask_len > 32) { 5818 kmovql(dst, src); 5819 } else { 5820 kmovdl(dst, src); 5821 if (mask_len != 32) { 5822 kshiftrdl(dst, dst, 32 - mask_len); 5823 } 5824 } 5825 } else { 5826 assert(mask_len <= 16, ""); 5827 kmovwl(dst, src); 5828 if (mask_len != 16) { 5829 kshiftrwl(dst, dst, 16 - mask_len); 5830 } 5831 } 5832 } 5833 5834 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5835 int lane_size = type2aelembytes(bt); 5836 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5837 if ((is_LP64 || lane_size < 8) && 5838 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5839 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5840 movptr(rtmp, imm32); 5841 switch(lane_size) { 5842 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5843 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5844 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5845 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5846 default : fatal("Unsupported lane size %d", lane_size); 5847 break; 5848 } 5849 } else { 5850 movptr(rtmp, imm32); 5851 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5852 switch(lane_size) { 5853 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5854 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5855 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5856 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5857 default : fatal("Unsupported lane size %d", lane_size); 5858 break; 5859 } 5860 } 5861 } 5862 5863 // 5864 // Following is lookup table based popcount computation algorithm:- 5865 // Index Bit set count 5866 // [ 0000 -> 0, 5867 // 0001 -> 1, 5868 // 0010 -> 1, 5869 // 0011 -> 2, 5870 // 0100 -> 1, 5871 // 0101 -> 2, 5872 // 0110 -> 2, 5873 // 0111 -> 3, 5874 // 1000 -> 1, 5875 // 1001 -> 2, 5876 // 1010 -> 2, 5877 // 1011 -> 3, 5878 // 1100 -> 2, 5879 // 1101 -> 3, 1110 -> 3, 5880 // 1111 -> 4 ] 5881 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5882 // shuffle indices for lookup table access. 5883 // b. Right shift each byte of vector lane by 4 positions. 5884 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5885 // shuffle indices for lookup table access. 5886 // d. Add the bitset count of upper and lower 4 bits of each byte. 5887 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5888 // count of all the bytes of a quadword. 5889 // f. Perform step e. for upper 128bit vector lane. 5890 // g. Pack the bitset count of quadwords back to double word. 5891 // h. Unpacking and packing operations are not needed for 64bit vector lane.
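// A minimal scalar sketch of steps a-d for one byte, assuming a 16-entry nibble table
// (illustrative only; the helper name is hypothetical, and the table loaded from
// StubRoutines::x86::vector_popcount_lut() plays the same role for vpshufb):
//
//   static const uint8_t NIBBLE_POPCNT_LUT[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//   static inline int popcount_byte_sketch(uint8_t b) {
//     return NIBBLE_POPCNT_LUT[b & 0x0F]      // step a: lower-nibble lookup
//          + NIBBLE_POPCNT_LUT[b >> 4];       // steps b+c: upper-nibble lookup
//   }                                         // step d: sum of the two lookups
//
// The vector routines below perform the same two lookups in parallel for every byte
// lane, using the nibbles as vpshufb shuffle indices into the table.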
5892 5893 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5894 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5895 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5896 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5897 vpsrlw(dst, src, 4, vec_enc); 5898 vpand(dst, dst, xtmp1, vec_enc); 5899 vpand(xtmp1, src, xtmp1, vec_enc); 5900 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5901 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5902 vpshufb(dst, xtmp2, dst, vec_enc); 5903 vpaddb(dst, dst, xtmp1, vec_enc); 5904 } 5905 5906 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5907 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5908 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5909 // Following code is as per steps e,f,g and h of above algorithm. 5910 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5911 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5912 vpsadbw(dst, dst, xtmp2, vec_enc); 5913 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5914 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5915 vpackuswb(dst, xtmp1, dst, vec_enc); 5916 } 5917 5918 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5919 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5920 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5921 // Add the popcount of upper and lower bytes of word. 5922 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5923 vpsrlw(dst, xtmp1, 8, vec_enc); 5924 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5925 vpaddw(dst, dst, xtmp1, vec_enc); 5926 } 5927 5928 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5929 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5930 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5931 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5932 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5933 } 5934 5935 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5936 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5937 switch(bt) { 5938 case T_LONG: 5939 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5940 break; 5941 case T_INT: 5942 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5943 break; 5944 case T_CHAR: 5945 case T_SHORT: 5946 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5947 break; 5948 case T_BYTE: 5949 case T_BOOLEAN: 5950 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5951 break; 5952 default: 5953 fatal("Unsupported type %s", type2name(bt)); 5954 break; 5955 } 5956 } 5957 5958 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5959 KRegister mask, bool merge, int vec_enc) { 5960 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5961 switch(bt) { 5962 case T_LONG: 5963 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5964 evpopcntq(dst, mask, src, merge, vec_enc); 5965 break; 5966 case T_INT: 5967 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5968 evpopcntd(dst, mask, src, merge, vec_enc); 5969 break; 5970 case T_CHAR: 5971 case T_SHORT: 5972 assert(VM_Version::supports_avx512_bitalg(), ""); 5973 evpopcntw(dst, mask, src, merge, vec_enc); 5974 break; 5975 case T_BYTE: 5976 case T_BOOLEAN: 5977 assert(VM_Version::supports_avx512_bitalg(), ""); 5978 evpopcntb(dst, mask, 
src, merge, vec_enc); 5979 break; 5980 default: 5981 fatal("Unsupported type %s", type2name(bt)); 5982 break; 5983 } 5984 } 5985 5986 #ifndef _LP64 5987 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5988 assert(VM_Version::supports_avx512bw(), ""); 5989 kmovdl(tmp, src); 5990 kunpckdql(dst, tmp, tmp); 5991 } 5992 #endif 5993 5994 // Bit reversal algorithm first reverses the bits of each byte followed by 5995 // a byte level reversal for multi-byte primitive types (short/int/long). 5996 // Algorithm performs a lookup table access to get reverse bit sequence 5997 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5998 // is obtained by swapping the reverse bit sequences of upper and lower 5999 // nibble of a byte. 6000 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6001 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6002 if (VM_Version::supports_avx512vlbw()) { 6003 6004 // Get the reverse bit sequence of lower nibble of each byte. 6005 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6006 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6007 evpandq(dst, xtmp2, src, vec_enc); 6008 vpshufb(dst, xtmp1, dst, vec_enc); 6009 vpsllq(dst, dst, 4, vec_enc); 6010 6011 // Get the reverse bit sequence of upper nibble of each byte. 6012 vpandn(xtmp2, xtmp2, src, vec_enc); 6013 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6014 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6015 6016 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6017 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6018 evporq(xtmp2, dst, xtmp2, vec_enc); 6019 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6020 6021 } else if(vec_enc == Assembler::AVX_512bit) { 6022 // Shift based bit reversal. 6023 assert(bt == T_LONG || bt == T_INT, ""); 6024 6025 // Swap lower and upper nibble of each byte. 6026 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6027 6028 // Swap two least and most significant bits of each nibble. 6029 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6030 6031 // Swap adjacent pair of bits. 6032 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6033 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6034 6035 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6036 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6037 } else { 6038 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6039 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6040 6041 // Get the reverse bit sequence of lower nibble of each byte. 6042 vpand(dst, xtmp2, src, vec_enc); 6043 vpshufb(dst, xtmp1, dst, vec_enc); 6044 vpsllq(dst, dst, 4, vec_enc); 6045 6046 // Get the reverse bit sequence of upper nibble of each byte. 6047 vpandn(xtmp2, xtmp2, src, vec_enc); 6048 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6049 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6050 6051 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6052 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
6053 vpor(xtmp2, dst, xtmp2, vec_enc); 6054 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6055 } 6056 } 6057 6058 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6059 XMMRegister xtmp, Register rscratch) { 6060 assert(VM_Version::supports_gfni(), ""); 6061 assert(rscratch != noreg || always_reachable(mask), "missing"); 6062 6063 // Galois field instruction based bit reversal based on following algorithm. 6064 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6065 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6066 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6067 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6068 } 6069 6070 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6071 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6072 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6073 evpandq(dst, xtmp1, src, vec_enc); 6074 vpsllq(dst, dst, nbits, vec_enc); 6075 vpandn(xtmp1, xtmp1, src, vec_enc); 6076 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6077 evporq(dst, dst, xtmp1, vec_enc); 6078 } 6079 6080 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6081 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6082 // Shift based bit reversal. 6083 assert(VM_Version::supports_evex(), ""); 6084 switch(bt) { 6085 case T_LONG: 6086 // Swap upper and lower double word of each quad word. 6087 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6088 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6089 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6090 break; 6091 case T_INT: 6092 // Swap upper and lower word of each double word. 6093 evprord(xtmp1, k0, src, 16, true, vec_enc); 6094 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6095 break; 6096 case T_CHAR: 6097 case T_SHORT: 6098 // Swap upper and lower byte of each word. 6099 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6100 break; 6101 case T_BYTE: 6102 evmovdquq(dst, k0, src, true, vec_enc); 6103 break; 6104 default: 6105 fatal("Unsupported type %s", type2name(bt)); 6106 break; 6107 } 6108 } 6109 6110 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6111 if (bt == T_BYTE) { 6112 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6113 evmovdquq(dst, k0, src, true, vec_enc); 6114 } else { 6115 vmovdqu(dst, src); 6116 } 6117 return; 6118 } 6119 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6120 // pre-computed shuffle indices. 
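// For example, reversing the bytes of every 32-bit element requires, per 128-bit lane,
// shuffle indices of the following shape (illustrative constant; the actual masks are
// the StubRoutines tables loaded in the switch below):
//
//   static const uint8_t REVERSE_INT_BYTES[16] = {  3, 2, 1, 0,   7, 6, 5, 4,
//                                                  11,10, 9, 8,  15,14,13,12 };
//
// so that dst_byte[i] = src_byte[REVERSE_INT_BYTES[i]], which is what the final
// vpshufb applies within each 128-bit lane.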
6121 switch(bt) { 6122 case T_LONG: 6123 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6124 break; 6125 case T_INT: 6126 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6127 break; 6128 case T_CHAR: 6129 case T_SHORT: 6130 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6131 break; 6132 default: 6133 fatal("Unsupported type %s", type2name(bt)); 6134 break; 6135 } 6136 vpshufb(dst, src, dst, vec_enc); 6137 } 6138 6139 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6140 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6141 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6142 assert(is_integral_type(bt), ""); 6143 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6144 assert(VM_Version::supports_avx512cd(), ""); 6145 switch(bt) { 6146 case T_LONG: 6147 evplzcntq(dst, ktmp, src, merge, vec_enc); 6148 break; 6149 case T_INT: 6150 evplzcntd(dst, ktmp, src, merge, vec_enc); 6151 break; 6152 case T_SHORT: 6153 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6154 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6155 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6156 vpunpckhwd(dst, xtmp1, src, vec_enc); 6157 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6158 vpackusdw(dst, xtmp2, dst, vec_enc); 6159 break; 6160 case T_BYTE: 6161 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6162 // accessing the lookup table. 6163 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6164 // accessing the lookup table. 6165 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6166 assert(VM_Version::supports_avx512bw(), ""); 6167 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6168 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6169 vpand(xtmp2, dst, src, vec_enc); 6170 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6171 vpsrlw(xtmp3, src, 4, vec_enc); 6172 vpand(xtmp3, dst, xtmp3, vec_enc); 6173 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6174 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6175 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6176 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6177 break; 6178 default: 6179 fatal("Unsupported type %s", type2name(bt)); 6180 break; 6181 } 6182 } 6183 6184 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6185 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6186 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6187 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6188 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6189 // accessing the lookup table. 6190 vpand(dst, xtmp2, src, vec_enc); 6191 vpshufb(dst, xtmp1, dst, vec_enc); 6192 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6193 // accessing the lookup table. 6194 vpsrlw(xtmp3, src, 4, vec_enc); 6195 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6196 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6197 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
6198 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6199 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6200 vpaddb(dst, dst, xtmp2, vec_enc); 6201 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6202 } 6203 6204 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6205 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6206 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6207 // Add zero counts of lower byte and upper byte of a word if 6208 // upper byte holds a zero value. 6209 vpsrlw(xtmp3, src, 8, vec_enc); 6210 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6211 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6212 vpsllw(xtmp2, dst, 8, vec_enc); 6213 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6214 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6215 vpsrlw(dst, dst, 8, vec_enc); 6216 } 6217 6218 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6219 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6220 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6221 // hence biased exponent can be used to compute leading zero count as per 6222 // following formula:- 6223 // LZCNT = 32 - (biased_exp - 127) 6224 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6225 6226 // Broadcast 0xFF 6227 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6228 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6229 6230 // Extract biased exponent. 6231 vcvtdq2ps(dst, src, vec_enc); 6232 vpsrld(dst, dst, 23, vec_enc); 6233 vpand(dst, dst, xtmp1, vec_enc); 6234 6235 // Broadcast 127. 6236 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6237 // Exponent = biased_exp - 127 6238 vpsubd(dst, dst, xtmp1, vec_enc); 6239 6240 // Exponent = Exponent + 1 6241 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6242 vpaddd(dst, dst, xtmp3, vec_enc); 6243 6244 // Replace -ve exponent with zero, exponent is -ve when src 6245 // lane contains a zero value. 6246 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6247 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6248 6249 // Rematerialize broadcast 32. 6250 vpslld(xtmp1, xtmp3, 5, vec_enc); 6251 // Exponent is 32 if corresponding source lane contains max_int value. 6252 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6253 // LZCNT = 32 - exponent 6254 vpsubd(dst, xtmp1, dst, vec_enc); 6255 6256 // Replace LZCNT with a value 1 if corresponding source lane 6257 // contains max_int value. 6258 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6259 6260 // Replace biased_exp with 0 if source lane value is less than zero. 6261 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6262 vblendvps(dst, dst, xtmp2, src, vec_enc); 6263 } 6264 6265 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6266 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6267 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6268 // Add zero counts of lower word and upper word of a double word if 6269 // upper word holds a zero value. 6270 vpsrld(xtmp3, src, 16, vec_enc); 6271 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6272 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6273 vpslld(xtmp2, dst, 16, vec_enc); 6274 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6275 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6276 vpsrld(dst, dst, 16, vec_enc); 6277 // Add zero counts of lower doubleword and upper doubleword of a 6278 // quadword if upper doubleword holds a zero value. 
6279 vpsrlq(xtmp3, src, 32, vec_enc); 6280 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6281 vpsllq(xtmp2, dst, 32, vec_enc); 6282 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6283 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6284 vpsrlq(dst, dst, 32, vec_enc); 6285 } 6286 6287 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6288 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6289 Register rtmp, int vec_enc) { 6290 assert(is_integral_type(bt), "unexpected type"); 6291 assert(vec_enc < Assembler::AVX_512bit, ""); 6292 switch(bt) { 6293 case T_LONG: 6294 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6295 break; 6296 case T_INT: 6297 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6298 break; 6299 case T_SHORT: 6300 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6301 break; 6302 case T_BYTE: 6303 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6304 break; 6305 default: 6306 fatal("Unsupported type %s", type2name(bt)); 6307 break; 6308 } 6309 } 6310 6311 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6312 switch(bt) { 6313 case T_BYTE: 6314 vpsubb(dst, src1, src2, vec_enc); 6315 break; 6316 case T_SHORT: 6317 vpsubw(dst, src1, src2, vec_enc); 6318 break; 6319 case T_INT: 6320 vpsubd(dst, src1, src2, vec_enc); 6321 break; 6322 case T_LONG: 6323 vpsubq(dst, src1, src2, vec_enc); 6324 break; 6325 default: 6326 fatal("Unsupported type %s", type2name(bt)); 6327 break; 6328 } 6329 } 6330 6331 // Trailing zero count computation is based on leading zero count operation as per 6332 // following equation. All AVX3 targets support AVX512CD feature which offers 6333 // direct vector instruction to compute leading zero count. 
6334 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6335 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6336 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6337 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6338 assert(is_integral_type(bt), "");
6339 // xtmp = -1
6340 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6341 // xtmp = xtmp + src
6342 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6343 // xtmp = xtmp & ~src
6344 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6345 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6346 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6347 vpsub(bt, dst, xtmp4, dst, vec_enc);
6348 }
6349
6350 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
6351 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6352 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6353 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6354 assert(is_integral_type(bt), "");
6355 // xtmp = 0
6356 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6357 // xtmp = 0 - src
6358 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6359 // xtmp = xtmp | src
6360 vpor(xtmp3, xtmp3, src, vec_enc);
6361 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6362 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6363 vpsub(bt, dst, xtmp1, dst, vec_enc);
6364 }
6365
6366 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6367 Label done;
6368 Label neg_divisor_fastpath;
6369 cmpl(divisor, 0);
6370 jccb(Assembler::less, neg_divisor_fastpath);
6371 xorl(rdx, rdx);
6372 divl(divisor);
6373 jmpb(done);
6374 bind(neg_divisor_fastpath);
6375 // Fastpath for divisor < 0:
6376 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6377 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6378 movl(rdx, rax);
6379 subl(rdx, divisor);
6380 if (VM_Version::supports_bmi1()) {
6381 andnl(rax, rdx, rax);
6382 } else {
6383 notl(rdx);
6384 andl(rax, rdx);
6385 }
6386 shrl(rax, 31);
6387 bind(done);
6388 }
6389
6390 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6391 Label done;
6392 Label neg_divisor_fastpath;
6393 cmpl(divisor, 0);
6394 jccb(Assembler::less, neg_divisor_fastpath);
6395 xorl(rdx, rdx);
6396 divl(divisor);
6397 jmpb(done);
6398 bind(neg_divisor_fastpath);
6399 // Fastpath when divisor < 0:
6400 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6401 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6402 movl(rdx, rax);
6403 subl(rax, divisor);
6404 if (VM_Version::supports_bmi1()) {
6405 andnl(rax, rax, rdx);
6406 } else {
6407 notl(rax);
6408 andl(rax, rdx);
6409 }
6410 sarl(rax, 31);
6411 andl(rax, divisor);
6412 subl(rdx, rax);
6413 bind(done);
6414 }
6415
6416 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6417 Label done;
6418 Label neg_divisor_fastpath;
6419
6420 cmpl(divisor, 0);
6421 jccb(Assembler::less, neg_divisor_fastpath);
6422 xorl(rdx, rdx);
6423 divl(divisor);
6424 jmpb(done);
6425 bind(neg_divisor_fastpath);
6426 // Fastpath for divisor < 0:
6427 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6428 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6429 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6430 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6431 movl(rdx, rax); 6432 subl(rax, divisor); 6433 if (VM_Version::supports_bmi1()) { 6434 andnl(rax, rax, rdx); 6435 } else { 6436 notl(rax); 6437 andl(rax, rdx); 6438 } 6439 movl(tmp, rax); 6440 shrl(rax, 31); // quotient 6441 sarl(tmp, 31); 6442 andl(tmp, divisor); 6443 subl(rdx, tmp); // remainder 6444 bind(done); 6445 } 6446 6447 #ifdef _LP64 6448 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6449 XMMRegister xtmp2, Register rtmp) { 6450 if(VM_Version::supports_gfni()) { 6451 // Galois field instruction based bit reversal based on following algorithm. 6452 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6453 mov64(rtmp, 0x8040201008040201L); 6454 movq(xtmp1, src); 6455 movq(xtmp2, rtmp); 6456 gf2p8affineqb(xtmp1, xtmp2, 0); 6457 movq(dst, xtmp1); 6458 } else { 6459 // Swap even and odd numbered bits. 6460 movl(rtmp, src); 6461 andl(rtmp, 0x55555555); 6462 shll(rtmp, 1); 6463 movl(dst, src); 6464 andl(dst, 0xAAAAAAAA); 6465 shrl(dst, 1); 6466 orl(dst, rtmp); 6467 6468 // Swap LSB and MSB 2 bits of each nibble. 6469 movl(rtmp, dst); 6470 andl(rtmp, 0x33333333); 6471 shll(rtmp, 2); 6472 andl(dst, 0xCCCCCCCC); 6473 shrl(dst, 2); 6474 orl(dst, rtmp); 6475 6476 // Swap LSB and MSB 4 bits of each byte. 6477 movl(rtmp, dst); 6478 andl(rtmp, 0x0F0F0F0F); 6479 shll(rtmp, 4); 6480 andl(dst, 0xF0F0F0F0); 6481 shrl(dst, 4); 6482 orl(dst, rtmp); 6483 } 6484 bswapl(dst); 6485 } 6486 6487 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6488 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6489 if(VM_Version::supports_gfni()) { 6490 // Galois field instruction based bit reversal based on following algorithm. 6491 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6492 mov64(rtmp1, 0x8040201008040201L); 6493 movq(xtmp1, src); 6494 movq(xtmp2, rtmp1); 6495 gf2p8affineqb(xtmp1, xtmp2, 0); 6496 movq(dst, xtmp1); 6497 } else { 6498 // Swap even and odd numbered bits. 6499 movq(rtmp1, src); 6500 mov64(rtmp2, 0x5555555555555555L); 6501 andq(rtmp1, rtmp2); 6502 shlq(rtmp1, 1); 6503 movq(dst, src); 6504 notq(rtmp2); 6505 andq(dst, rtmp2); 6506 shrq(dst, 1); 6507 orq(dst, rtmp1); 6508 6509 // Swap LSB and MSB 2 bits of each nibble. 6510 movq(rtmp1, dst); 6511 mov64(rtmp2, 0x3333333333333333L); 6512 andq(rtmp1, rtmp2); 6513 shlq(rtmp1, 2); 6514 notq(rtmp2); 6515 andq(dst, rtmp2); 6516 shrq(dst, 2); 6517 orq(dst, rtmp1); 6518 6519 // Swap LSB and MSB 4 bits of each byte. 
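// Illustrative scalar sketch (not emitted code): the three swap stages plus the
// trailing bswapq of this non-GFNI path correspond to
//   x = ((x & 0x5555555555555555ULL) << 1) | ((x >> 1) & 0x5555555555555555ULL);
//   x = ((x & 0x3333333333333333ULL) << 2) | ((x >> 2) & 0x3333333333333333ULL);
//   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
//   x = byte_swap(x);  // reversing byte order completes the bit reversal
// The first two stages have been emitted above; the code below performs the
// nibble swap before the shared bswapq finishes the reversal.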
6520 movq(rtmp1, dst); 6521 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6522 andq(rtmp1, rtmp2); 6523 shlq(rtmp1, 4); 6524 notq(rtmp2); 6525 andq(dst, rtmp2); 6526 shrq(dst, 4); 6527 orq(dst, rtmp1); 6528 } 6529 bswapq(dst); 6530 } 6531 6532 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6533 Label done; 6534 Label neg_divisor_fastpath; 6535 cmpq(divisor, 0); 6536 jccb(Assembler::less, neg_divisor_fastpath); 6537 xorl(rdx, rdx); 6538 divq(divisor); 6539 jmpb(done); 6540 bind(neg_divisor_fastpath); 6541 // Fastpath for divisor < 0: 6542 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6543 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6544 movq(rdx, rax); 6545 subq(rdx, divisor); 6546 if (VM_Version::supports_bmi1()) { 6547 andnq(rax, rdx, rax); 6548 } else { 6549 notq(rdx); 6550 andq(rax, rdx); 6551 } 6552 shrq(rax, 63); 6553 bind(done); 6554 } 6555 6556 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6557 Label done; 6558 Label neg_divisor_fastpath; 6559 cmpq(divisor, 0); 6560 jccb(Assembler::less, neg_divisor_fastpath); 6561 xorq(rdx, rdx); 6562 divq(divisor); 6563 jmp(done); 6564 bind(neg_divisor_fastpath); 6565 // Fastpath when divisor < 0: 6566 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6567 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6568 movq(rdx, rax); 6569 subq(rax, divisor); 6570 if (VM_Version::supports_bmi1()) { 6571 andnq(rax, rax, rdx); 6572 } else { 6573 notq(rax); 6574 andq(rax, rdx); 6575 } 6576 sarq(rax, 63); 6577 andq(rax, divisor); 6578 subq(rdx, rax); 6579 bind(done); 6580 } 6581 6582 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6583 Label done; 6584 Label neg_divisor_fastpath; 6585 cmpq(divisor, 0); 6586 jccb(Assembler::less, neg_divisor_fastpath); 6587 xorq(rdx, rdx); 6588 divq(divisor); 6589 jmp(done); 6590 bind(neg_divisor_fastpath); 6591 // Fastpath for divisor < 0: 6592 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6593 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6594 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6595 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6596 movq(rdx, rax); 6597 subq(rax, divisor); 6598 if (VM_Version::supports_bmi1()) { 6599 andnq(rax, rax, rdx); 6600 } else { 6601 notq(rax); 6602 andq(rax, rdx); 6603 } 6604 movq(tmp, rax); 6605 shrq(rax, 63); // quotient 6606 sarq(tmp, 63); 6607 andq(tmp, divisor); 6608 subq(rdx, tmp); // remainder 6609 bind(done); 6610 } 6611 #endif 6612 6613 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6614 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6615 int vlen_enc) { 6616 assert(VM_Version::supports_avx512bw(), ""); 6617 // Byte shuffles are inlane operations and indices are determined using 6618 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6619 // normalized to index range 0-15. This makes sure that all the multiples 6620 // of an index value are placed at same relative position in 128 bit 6621 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6622 // will be 16th element in their respective 128 bit lanes. 
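// Illustrative sketch (not emitted code): per destination byte i of a 512-bit
// vector, the selection implemented below is
//   lane   = shuffle[i] >> 4;    // which 128-bit lane of src supplies the byte
//   idx    = shuffle[i] & 0xF;   // position inside that lane
//   dst[i] = src[lane * 16 + idx];
// Each of the four steps broadcasts one source lane, builds a k-mask for
// "lane == L" by range-checking the shuffle indices, and merges the in-lane
// vpshufb result into dst under that mask.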
6623 movl(rtmp, 16);
6624 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6625
6626 // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
6627 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6628 // original shuffle indices and move the shuffled lanes corresponding to true
6629 // mask to destination vector.
6630 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6631 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6632 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6633
6634 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6635 // and broadcasting second 128 bit lane.
6636 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6637 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6638 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6639 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6640 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6641
6642 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6643 // and broadcasting third 128 bit lane.
6644 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6645 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6646 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6647 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6648 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6649
6650 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6651 // and broadcasting fourth 128 bit lane.
6652 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6653 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6654 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6655 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6656 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6657 }
6658
6659 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6660 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6661 if (vlen_enc == AVX_128bit) {
6662 vpermilps(dst, src, shuffle, vlen_enc);
6663 } else if (bt == T_INT) {
6664 vpermd(dst, shuffle, src, vlen_enc);
6665 } else {
6666 assert(bt == T_FLOAT, "");
6667 vpermps(dst, shuffle, src, vlen_enc);
6668 }
6669 }
6670
6671 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6672 switch(elem_bt) {
6673 case T_BYTE:
6674 if (ideal_opc == Op_SaturatingAddV) {
6675 vpaddsb(dst, src1, src2, vlen_enc);
6676 } else {
6677 assert(ideal_opc == Op_SaturatingSubV, "");
6678 vpsubsb(dst, src1, src2, vlen_enc);
6679 }
6680 break;
6681 case T_SHORT:
6682 if (ideal_opc == Op_SaturatingAddV) {
6683 vpaddsw(dst, src1, src2, vlen_enc);
6684 } else {
6685 assert(ideal_opc == Op_SaturatingSubV, "");
6686 vpsubsw(dst, src1, src2, vlen_enc);
6687 }
6688 break;
6689 default:
6690 fatal("Unsupported type %s", type2name(elem_bt));
6691 break;
6692 }
6693 }
6694
6695 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6696 switch(elem_bt) {
6697 case T_BYTE:
6698 if (ideal_opc == Op_SaturatingAddV) {
6699 vpaddusb(dst, src1, src2, vlen_enc);
6700 } else {
6701 assert(ideal_opc == Op_SaturatingSubV, "");
6702 vpsubusb(dst, src1, src2, vlen_enc);
6703 }
6704 break;
6705 case T_SHORT:
6706 if (ideal_opc == Op_SaturatingAddV) {
6707 vpaddusw(dst, src1, src2, vlen_enc);
6708 } else {
6709 assert(ideal_opc == Op_SaturatingSubV, "");
6710 vpsubusw(dst, src1, src2, vlen_enc);
6711 }
6712 break;
6713 default:
6714 fatal("Unsupported type %s", type2name(elem_bt));
6715 break;
6716 }
6717 }
6718
6719 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6720 XMMRegister src2, KRegister ktmp, int vlen_enc) {
6721 // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6722 // overflow_mask = Inp1 <u Inp2
6723 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6724 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6725 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6726 }
6727
6728 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6729 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6730 // Emulate unsigned comparison using signed comparison
6731 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
6732 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6733 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6734 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6735
6736 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6737
6738 // Res = INP1 - INP2 (non-commutative and non-associative)
6739 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6740 // Res = Mask ? Zero : Res
6741 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6742 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6743 }
6744
6745 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6746 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6747 // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6748 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6749 // Res = Signed Add INP1, INP2
6750 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6751 // T1 = SRC1 | SRC2
6752 vpor(xtmp1, src1, src2, vlen_enc);
6753 // Max_Unsigned = -1
6754 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6755 // Unsigned compare: Mask = Res <u T1
6756 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6757 // res = Mask ? Max_Unsigned : Res
6758 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6759 }
6760
6761 //
6762 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6763 // unsigned addition operation.
6764 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6765 //
6766 // We empirically determined its semantic equivalence to the following reduced expression
6767 // overflow_mask = (a + b) <u (a | b)
6768 //
6769 // and also verified it through the Alive2 solver.
6770 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6771 //
6772
6773 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6774 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6775 // Res = Signed Add INP1, INP2
6776 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6777 // Compute T1 = INP1 | INP2
6778 vpor(xtmp3, src1, src2, vlen_enc);
6779 // xtmp2 = Minimum signed value.
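// Added note (a sketch of the comparison trick, not original code): AVX2 lacks
// unsigned doubleword/quadword compares, so the check (a + b) <u (a | b) from
// the block comment above is emulated with a signed compare after biasing both
// sides by MIN_VALUE, using the identity
//   a <u b  <==>  (a + MIN_VALUE) <s (b + MIN_VALUE)
// which is why MIN_VALUE is broadcast and added to both T1 and Res below.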
6780 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6781 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6782 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6783 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6784 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6785 // Compute overflow detection mask = Res<1> <s T1 6786 if (elem_bt == T_INT) { 6787 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6788 } else { 6789 assert(elem_bt == T_LONG, ""); 6790 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6791 } 6792 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6793 } 6794 6795 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6796 int vlen_enc, bool xtmp2_hold_M1) { 6797 if (VM_Version::supports_avx512dq()) { 6798 evpmovq2m(ktmp, src, vlen_enc); 6799 } else { 6800 assert(VM_Version::supports_evex(), ""); 6801 if (!xtmp2_hold_M1) { 6802 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6803 } 6804 evpsraq(xtmp1, src, 63, vlen_enc); 6805 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6806 } 6807 } 6808 6809 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6810 int vlen_enc, bool xtmp2_hold_M1) { 6811 if (VM_Version::supports_avx512dq()) { 6812 evpmovd2m(ktmp, src, vlen_enc); 6813 } else { 6814 assert(VM_Version::supports_evex(), ""); 6815 if (!xtmp2_hold_M1) { 6816 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6817 } 6818 vpsrad(xtmp1, src, 31, vlen_enc); 6819 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6820 } 6821 } 6822 6823 6824 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6825 if (elem_bt == T_LONG) { 6826 if (VM_Version::supports_evex()) { 6827 evpsraq(dst, src, 63, vlen_enc); 6828 } else { 6829 vpsrad(dst, src, 31, vlen_enc); 6830 vpshufd(dst, dst, 0xF5, vlen_enc); 6831 } 6832 } else { 6833 assert(elem_bt == T_INT, ""); 6834 vpsrad(dst, src, 31, vlen_enc); 6835 } 6836 } 6837 6838 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6839 if (compute_allones) { 6840 if (vlen_enc == Assembler::AVX_512bit) { 6841 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6842 } else { 6843 vpcmpeqq(allones, allones, allones, vlen_enc); 6844 } 6845 } 6846 if (elem_bt == T_LONG) { 6847 vpsrlq(dst, allones, 1, vlen_enc); 6848 } else { 6849 assert(elem_bt == T_INT, ""); 6850 vpsrld(dst, allones, 1, vlen_enc); 6851 } 6852 } 6853 6854 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6855 if (compute_allones) { 6856 if (vlen_enc == Assembler::AVX_512bit) { 6857 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6858 } else { 6859 vpcmpeqq(allones, allones, allones, vlen_enc); 6860 } 6861 } 6862 if (elem_bt == T_LONG) { 6863 vpsllq(dst, allones, 63, vlen_enc); 6864 } else { 6865 assert(elem_bt == T_INT, ""); 6866 vpslld(dst, allones, 31, vlen_enc); 6867 } 6868 } 6869 6870 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6871 Assembler::ComparisonPredicate cond, int vlen_enc) { 6872 switch(elem_bt) { 6873 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6874 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6875 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6876 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6877 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break;
6878 }
6879 }
6880
6881 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6882 switch(elem_bt) {
6883 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6884 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6885 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6886 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6887 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6888 }
6889 }
6890
6891 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6892 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6893 if (elem_bt == T_LONG) {
6894 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6895 } else {
6896 assert(elem_bt == T_INT, "");
6897 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6898 }
6899 }
6900
6901 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6902 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6903 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6904 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6905 // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6906 // Overflow detection based on Hacker's Delight section 2-13.
6907 if (ideal_opc == Op_SaturatingAddV) {
6908 // res = src1 + src2
6909 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6910 // Overflow occurs if the result's polarity does not match that of the equal-polarity inputs.
6911 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6912 vpxor(xtmp1, dst, src1, vlen_enc);
6913 vpxor(xtmp2, dst, src2, vlen_enc);
6914 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6915 } else {
6916 assert(ideal_opc == Op_SaturatingSubV, "");
6917 // res = src1 - src2
6918 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6919 // Overflow occurs when both inputs have opposite polarity and the
6920 // result polarity does not match the first input polarity.
6921 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6922 vpxor(xtmp1, src1, src2, vlen_enc);
6923 vpxor(xtmp2, dst, src1, vlen_enc);
6924 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6925 }
6926
6927 // Compute overflow detection mask.
6928 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6929 // Note: xtmp1 holds -1 in all its lanes after the above call.
6930
6931 // Compute mask based on first input polarity.
6932 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6933
6934 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6935 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6936
6937 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6938 // set bits in the first input polarity mask hold a min value.
6939 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6940 // Blend destination lanes with saturated values using overflow detection mask.
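// Illustrative scalar sketch (not emitted code): for a single 32-bit lane the
// saturating-add flavor of this sequence is
//   int32_t res   = src1 + src2;                          // wrapping add
//   bool overflow = ((res ^ src1) & (res ^ src2)) < 0;    // Hacker's Delight 2-13
//   int32_t sat   = (src1 < 0) ? INT_MIN : INT_MAX;       // saturate toward src1's sign
//   dst           = overflow ? sat : res;
// The evpblend below is the final "overflow ? sat : res" select, with ktmp1 as
// the overflow mask and xtmp2 holding the per-lane saturation value.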
6941 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6942 }
6943
6944
6945 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6946 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6947 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6948 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6949 // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6950 // Overflow detection based on Hacker's Delight section 2-13.
6951 if (ideal_opc == Op_SaturatingAddV) {
6952 // res = src1 + src2
6953 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6954 // Overflow occurs if the result's polarity does not match that of the equal-polarity inputs.
6955 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6956 vpxor(xtmp1, dst, src1, vlen_enc);
6957 vpxor(xtmp2, dst, src2, vlen_enc);
6958 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6959 } else {
6960 assert(ideal_opc == Op_SaturatingSubV, "");
6961 // res = src1 - src2
6962 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6963 // Overflow occurs when both inputs have opposite polarity and the
6964 // result polarity does not match the first input polarity.
6965 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6966 vpxor(xtmp1, src1, src2, vlen_enc);
6967 vpxor(xtmp2, dst, src1, vlen_enc);
6968 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6969 }
6970
6971 // Sign-extend to compute overflow detection mask.
6972 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6973
6974 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6975 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6976 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6977
6978 // Compose saturating min/max vector using first input polarity mask.
6979 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6980 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6981
6982 // Blend result with saturating vector using overflow detection mask.
6983 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6984 } 6985 6986 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6987 switch(elem_bt) { 6988 case T_BYTE: 6989 if (ideal_opc == Op_SaturatingAddV) { 6990 vpaddsb(dst, src1, src2, vlen_enc); 6991 } else { 6992 assert(ideal_opc == Op_SaturatingSubV, ""); 6993 vpsubsb(dst, src1, src2, vlen_enc); 6994 } 6995 break; 6996 case T_SHORT: 6997 if (ideal_opc == Op_SaturatingAddV) { 6998 vpaddsw(dst, src1, src2, vlen_enc); 6999 } else { 7000 assert(ideal_opc == Op_SaturatingSubV, ""); 7001 vpsubsw(dst, src1, src2, vlen_enc); 7002 } 7003 break; 7004 default: 7005 fatal("Unsupported type %s", type2name(elem_bt)); 7006 break; 7007 } 7008 } 7009 7010 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7011 switch(elem_bt) { 7012 case T_BYTE: 7013 if (ideal_opc == Op_SaturatingAddV) { 7014 vpaddusb(dst, src1, src2, vlen_enc); 7015 } else { 7016 assert(ideal_opc == Op_SaturatingSubV, ""); 7017 vpsubusb(dst, src1, src2, vlen_enc); 7018 } 7019 break; 7020 case T_SHORT: 7021 if (ideal_opc == Op_SaturatingAddV) { 7022 vpaddusw(dst, src1, src2, vlen_enc); 7023 } else { 7024 assert(ideal_opc == Op_SaturatingSubV, ""); 7025 vpsubusw(dst, src1, src2, vlen_enc); 7026 } 7027 break; 7028 default: 7029 fatal("Unsupported type %s", type2name(elem_bt)); 7030 break; 7031 } 7032 } 7033 7034 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7035 XMMRegister src2, int vlen_enc) { 7036 switch(elem_bt) { 7037 case T_BYTE: 7038 evpermi2b(dst, src1, src2, vlen_enc); 7039 break; 7040 case T_SHORT: 7041 evpermi2w(dst, src1, src2, vlen_enc); 7042 break; 7043 case T_INT: 7044 evpermi2d(dst, src1, src2, vlen_enc); 7045 break; 7046 case T_LONG: 7047 evpermi2q(dst, src1, src2, vlen_enc); 7048 break; 7049 case T_FLOAT: 7050 evpermi2ps(dst, src1, src2, vlen_enc); 7051 break; 7052 case T_DOUBLE: 7053 evpermi2pd(dst, src1, src2, vlen_enc); 7054 break; 7055 default: 7056 fatal("Unsupported type %s", type2name(elem_bt)); 7057 break; 7058 } 7059 } 7060 7061 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7062 if (is_unsigned) { 7063 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7064 } else { 7065 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7066 } 7067 } 7068 7069 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7070 if (is_unsigned) { 7071 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7072 } else { 7073 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7074 } 7075 }