/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
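    // (Note: with -XX:+PreserveFramePointer, rbp is kept as a conventional
    // frame pointer chained to the caller's frame rather than being treated
    // as an ordinary allocatable register, so native profilers and debuggers
    // can walk compiled frames.)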
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from the actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Updates tmpReg
    jcc(Assembler::equal, COUNT);  // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* tagged with markWord::monitor_value.

  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
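  // At this point rax (tmpReg) is 0 and boxReg holds this thread's
  // monitor_owner_id: the cmpxchg above tried to install that id into
  // monitor->_owner iff _owner was null. ZF == 1 means we acquired the
  // monitor; otherwise rax now holds the current owner, which is compared
  // against our id below to detect a recursive lock.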
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                  // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                       // Set ZF = 1 (success) for recursive lock, denoting locking success
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);   // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
  }
  xorl(tmpReg, tmpReg);                 // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame, the interpreter throws IMSX (IllegalMonitorStateException).
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
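// Illustrative sketch (not part of the emitted code): a cmpFastLock site as
// C2 ultimately lays it out looks roughly like
//
//     fast_lock(obj, box, rax, scr, ...)   // leaves ZF = 1 on success
//     jne   slow_case                      // ZF = 0 -> runtime monitorenter
//   continue:                              // ZF = 1 -> lock held, fall through
//
// The labels "slow_case"/"continue" are only for illustration; the actual
// branch and the runtime call are emitted from the .ad rules and the
// slow-path stubs, not from this file.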

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
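  // tmpReg still holds the mark word value, i.e. the ObjectMonitor* tagged
  // with markWord::monitor_value; the andptr below strips that tag so a raw
  // ObjectMonitor* is published in the thread.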
429 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 430 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 431 432 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 433 jmpb (DONE_LABEL); 434 435 bind (LSuccess); 436 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 437 jmpb (DONE_LABEL); 438 439 if (LockingMode == LM_LEGACY) { 440 bind (Stacked); 441 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 442 lock(); 443 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 444 // Intentional fall-thru into DONE_LABEL 445 } 446 447 bind(DONE_LABEL); 448 449 // ZFlag == 1 count in fast path 450 // ZFlag == 0 count in slow path 451 jccb(Assembler::notZero, NO_COUNT); 452 453 bind(COUNT); 454 455 if (LockingMode == LM_LEGACY) { 456 // Count monitors in fast path 457 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 458 } 459 460 xorl(tmpReg, tmpReg); // Set ZF == 1 461 462 bind(NO_COUNT); 463 } 464 465 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 466 Register t, Register thread) { 467 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 468 assert(rax_reg == rax, "Used for CAS"); 469 assert_different_registers(obj, box, rax_reg, t, thread); 470 471 // Handle inflated monitor. 472 Label inflated; 473 // Finish fast lock successfully. ZF value is irrelevant. 474 Label locked; 475 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 476 Label slow_path; 477 478 if (UseObjectMonitorTable) { 479 // Clear cache in case fast locking succeeds or we need to take the slow-path. 480 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 481 } 482 483 if (DiagnoseSyncOnValueBasedClasses != 0) { 484 load_klass(rax_reg, obj, t); 485 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 486 jcc(Assembler::notZero, slow_path); 487 } 488 489 const Register mark = t; 490 491 { // Lightweight Lock 492 493 Label push; 494 495 const Register top = UseObjectMonitorTable ? rax_reg : box; 496 497 // Load the mark. 498 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 499 500 // Prefetch top. 501 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 502 503 // Check for monitor (0b10). 504 testptr(mark, markWord::monitor_value); 505 jcc(Assembler::notZero, inflated); 506 507 // Check if lock-stack is full. 508 cmpl(top, LockStack::end_offset() - 1); 509 jcc(Assembler::greater, slow_path); 510 511 // Check if recursive. 512 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 513 jccb(Assembler::equal, push); 514 515 // Try to lock. Transition lock bits 0b01 => 0b00 516 movptr(rax_reg, mark); 517 orptr(rax_reg, markWord::unlocked_value); 518 andptr(mark, ~(int32_t)markWord::unlocked_value); 519 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 520 jcc(Assembler::notEqual, slow_path); 521 522 if (UseObjectMonitorTable) { 523 // Need to reload top, clobbered by CAS. 524 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 525 } 526 bind(push); 527 // After successful lock, push object on lock-stack. 528 movptr(Address(thread, top), obj); 529 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 530 jmpb(locked); 531 } 532 533 { // Handle inflated monitor. 
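    // We get here when the mark word has the monitor bit set. Without
    // UseObjectMonitorTable the (tagged) ObjectMonitor* was already loaded as
    // part of the mark; with the table enabled the monitor is looked up in
    // the per-thread om_cache below, falling back to the slow path on a
    // cache miss.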
534 bind(inflated); 535 536 const Register monitor = t; 537 538 if (!UseObjectMonitorTable) { 539 assert(mark == monitor, "should be the same here"); 540 } else { 541 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 542 // Fetch ObjectMonitor* from the cache or take the slow-path. 543 Label monitor_found; 544 545 // Load cache address 546 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 547 548 const int num_unrolled = 2; 549 for (int i = 0; i < num_unrolled; i++) { 550 cmpptr(obj, Address(t)); 551 jccb(Assembler::equal, monitor_found); 552 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 553 } 554 555 Label loop; 556 557 // Search for obj in cache. 558 bind(loop); 559 560 // Check for match. 561 cmpptr(obj, Address(t)); 562 jccb(Assembler::equal, monitor_found); 563 564 // Search until null encountered, guaranteed _null_sentinel at end. 565 cmpptr(Address(t), 1); 566 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 567 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 568 jmpb(loop); 569 570 // Cache hit. 571 bind(monitor_found); 572 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 573 } 574 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 575 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 576 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 577 578 Label monitor_locked; 579 // Lock the monitor. 580 581 if (UseObjectMonitorTable) { 582 // Cache the monitor for unlock before trashing box. On failure to acquire 583 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 584 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 585 } 586 587 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 588 xorptr(rax_reg, rax_reg); 589 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 590 lock(); cmpxchgptr(box, owner_address); 591 jccb(Assembler::equal, monitor_locked); 592 593 // Check if recursive. 594 cmpptr(box, rax_reg); 595 jccb(Assembler::notEqual, slow_path); 596 597 // Recursive. 598 increment(recursions_address); 599 600 bind(monitor_locked); 601 } 602 603 bind(locked); 604 // Set ZF = 1 605 xorl(rax_reg, rax_reg); 606 607 #ifdef ASSERT 608 // Check that locked label is reached with ZF set. 609 Label zf_correct; 610 Label zf_bad_zero; 611 jcc(Assembler::zero, zf_correct); 612 jmp(zf_bad_zero); 613 #endif 614 615 bind(slow_path); 616 #ifdef ASSERT 617 // Check that slow_path label is reached with ZF not set. 618 jcc(Assembler::notZero, zf_correct); 619 stop("Fast Lock ZF != 0"); 620 bind(zf_bad_zero); 621 stop("Fast Lock ZF != 1"); 622 bind(zf_correct); 623 #endif 624 // C2 uses the value of ZF to determine the continuation. 625 } 626 627 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 628 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 629 assert(reg_rax == rax, "Used for CAS"); 630 assert_different_registers(obj, reg_rax, t); 631 632 // Handle inflated monitor. 633 Label inflated, inflated_check_lock_stack; 634 // Finish fast unlock successfully. MUST jump with ZF == 1 635 Label unlocked, slow_path; 636 637 const Register mark = t; 638 const Register monitor = t; 639 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 640 const Register box = reg_rax; 641 642 Label dummy; 643 C2FastUnlockLightweightStub* stub = nullptr; 644 645 if (!Compile::current()->output()->in_scratch_emit_size()) { 646 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 647 Compile::current()->output()->add_stub(stub); 648 } 649 650 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 651 652 { // Lightweight Unlock 653 654 // Load top. 655 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 656 657 if (!UseObjectMonitorTable) { 658 // Prefetch mark. 659 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 660 } 661 662 // Check if obj is top of lock-stack. 663 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 664 // Top of lock stack was not obj. Must be monitor. 665 jcc(Assembler::notEqual, inflated_check_lock_stack); 666 667 // Pop lock-stack. 668 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 669 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 670 671 // Check if recursive. 672 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 673 jcc(Assembler::equal, unlocked); 674 675 // We elide the monitor check, let the CAS fail instead. 676 677 if (UseObjectMonitorTable) { 678 // Load mark. 679 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 680 } 681 682 // Try to unlock. Transition lock bits 0b00 => 0b01 683 movptr(reg_rax, mark); 684 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 685 orptr(mark, markWord::unlocked_value); 686 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 687 jcc(Assembler::notEqual, push_and_slow_path); 688 jmp(unlocked); 689 } 690 691 692 { // Handle inflated monitor. 693 bind(inflated_check_lock_stack); 694 #ifdef ASSERT 695 Label check_done; 696 subl(top, oopSize); 697 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 698 jcc(Assembler::below, check_done); 699 cmpptr(obj, Address(thread, top)); 700 jccb(Assembler::notEqual, inflated_check_lock_stack); 701 stop("Fast Unlock lock on stack"); 702 bind(check_done); 703 if (UseObjectMonitorTable) { 704 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 705 } 706 testptr(mark, markWord::monitor_value); 707 jccb(Assembler::notZero, inflated); 708 stop("Fast Unlock not monitor"); 709 #endif 710 711 bind(inflated); 712 713 if (!UseObjectMonitorTable) { 714 assert(mark == monitor, "should be the same here"); 715 } else { 716 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 717 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 718 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 719 cmpptr(monitor, alignof(ObjectMonitor*)); 720 jcc(Assembler::below, slow_path); 721 } 722 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 723 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 724 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 725 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 726 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 727 728 Label recursive; 729 730 // Check if recursive. 731 cmpptr(recursions_address, 0); 732 jccb(Assembler::notZero, recursive); 733 734 // Set owner to null. 
735 // Release to satisfy the JMM 736 movptr(owner_address, NULL_WORD); 737 // We need a full fence after clearing owner to avoid stranding. 738 // StoreLoad achieves this. 739 membar(StoreLoad); 740 741 // Check if the entry_list is empty. 742 cmpptr(entry_list_address, NULL_WORD); 743 jccb(Assembler::zero, unlocked); // If so we are done. 744 745 // Check if there is a successor. 746 cmpptr(succ_address, NULL_WORD); 747 jccb(Assembler::notZero, unlocked); // If so we are done. 748 749 // Save the monitor pointer in the current thread, so we can try to 750 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 751 if (!UseObjectMonitorTable) { 752 andptr(monitor, ~(int32_t)markWord::monitor_value); 753 } 754 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 755 756 orl(t, 1); // Fast Unlock ZF = 0 757 jmpb(slow_path); 758 759 // Recursive unlock. 760 bind(recursive); 761 decrement(recursions_address); 762 } 763 764 bind(unlocked); 765 xorl(t, t); // Fast Unlock ZF = 1 766 767 #ifdef ASSERT 768 // Check that unlocked label is reached with ZF set. 769 Label zf_correct; 770 Label zf_bad_zero; 771 jcc(Assembler::zero, zf_correct); 772 jmp(zf_bad_zero); 773 #endif 774 775 bind(slow_path); 776 if (stub != nullptr) { 777 bind(stub->slow_path_continuation()); 778 } 779 #ifdef ASSERT 780 // Check that stub->continuation() label is reached with ZF not set. 781 jcc(Assembler::notZero, zf_correct); 782 stop("Fast Unlock ZF != 0"); 783 bind(zf_bad_zero); 784 stop("Fast Unlock ZF != 1"); 785 bind(zf_correct); 786 #endif 787 // C2 uses the value of ZF to determine the continuation. 788 } 789 790 //------------------------------------------------------------------------------------------- 791 // Generic instructions support for use in .ad files C2 code generation 792 793 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 794 if (dst != src) { 795 movdqu(dst, src); 796 } 797 if (opcode == Op_AbsVD) { 798 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 799 } else { 800 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 801 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 802 } 803 } 804 805 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 806 if (opcode == Op_AbsVD) { 807 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 808 } else { 809 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 810 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 811 } 812 } 813 814 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 815 if (dst != src) { 816 movdqu(dst, src); 817 } 818 if (opcode == Op_AbsVF) { 819 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 820 } else { 821 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 822 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 823 } 824 } 825 826 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 827 if (opcode == Op_AbsVF) { 828 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 829 } else { 830 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 831 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 832 } 833 } 834 835 void 
C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 836 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 837 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 838 839 if (opcode == Op_MinV) { 840 if (elem_bt == T_BYTE) { 841 pminsb(dst, src); 842 } else if (elem_bt == T_SHORT) { 843 pminsw(dst, src); 844 } else if (elem_bt == T_INT) { 845 pminsd(dst, src); 846 } else { 847 assert(elem_bt == T_LONG, "required"); 848 assert(tmp == xmm0, "required"); 849 assert_different_registers(dst, src, tmp); 850 movdqu(xmm0, dst); 851 pcmpgtq(xmm0, src); 852 blendvpd(dst, src); // xmm0 as mask 853 } 854 } else { // opcode == Op_MaxV 855 if (elem_bt == T_BYTE) { 856 pmaxsb(dst, src); 857 } else if (elem_bt == T_SHORT) { 858 pmaxsw(dst, src); 859 } else if (elem_bt == T_INT) { 860 pmaxsd(dst, src); 861 } else { 862 assert(elem_bt == T_LONG, "required"); 863 assert(tmp == xmm0, "required"); 864 assert_different_registers(dst, src, tmp); 865 movdqu(xmm0, src); 866 pcmpgtq(xmm0, dst); 867 blendvpd(dst, src); // xmm0 as mask 868 } 869 } 870 } 871 872 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 873 XMMRegister src1, Address src2, int vlen_enc) { 874 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 875 if (opcode == Op_UMinV) { 876 switch(elem_bt) { 877 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 878 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 879 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 880 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 881 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 882 } 883 } else { 884 assert(opcode == Op_UMaxV, "required"); 885 switch(elem_bt) { 886 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 887 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 888 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 889 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 890 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 891 } 892 } 893 } 894 895 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 896 // For optimality, leverage a full vector width of 512 bits 897 // for operations over smaller vector sizes on AVX512 targets. 898 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 899 if (opcode == Op_UMaxV) { 900 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 901 } else { 902 assert(opcode == Op_UMinV, "required"); 903 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 904 } 905 } else { 906 // T1 = -1 907 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 908 // T1 = -1 << 63 909 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 910 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 911 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 912 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 913 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 914 // Mask = T2 > T1 915 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 916 if (opcode == Op_UMaxV) { 917 // Res = Mask ? Src2 : Src1 918 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 919 } else { 920 // Res = Mask ? 
Src1 : Src2 921 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 922 } 923 } 924 } 925 926 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 927 XMMRegister src1, XMMRegister src2, int vlen_enc) { 928 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 929 if (opcode == Op_UMinV) { 930 switch(elem_bt) { 931 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 932 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 933 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 934 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 935 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 936 } 937 } else { 938 assert(opcode == Op_UMaxV, "required"); 939 switch(elem_bt) { 940 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 941 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 942 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 943 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 944 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 945 } 946 } 947 } 948 949 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 950 XMMRegister dst, XMMRegister src1, XMMRegister src2, 951 int vlen_enc) { 952 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 953 954 if (opcode == Op_MinV) { 955 if (elem_bt == T_BYTE) { 956 vpminsb(dst, src1, src2, vlen_enc); 957 } else if (elem_bt == T_SHORT) { 958 vpminsw(dst, src1, src2, vlen_enc); 959 } else if (elem_bt == T_INT) { 960 vpminsd(dst, src1, src2, vlen_enc); 961 } else { 962 assert(elem_bt == T_LONG, "required"); 963 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 964 vpminsq(dst, src1, src2, vlen_enc); 965 } else { 966 assert_different_registers(dst, src1, src2); 967 vpcmpgtq(dst, src1, src2, vlen_enc); 968 vblendvpd(dst, src1, src2, dst, vlen_enc); 969 } 970 } 971 } else { // opcode == Op_MaxV 972 if (elem_bt == T_BYTE) { 973 vpmaxsb(dst, src1, src2, vlen_enc); 974 } else if (elem_bt == T_SHORT) { 975 vpmaxsw(dst, src1, src2, vlen_enc); 976 } else if (elem_bt == T_INT) { 977 vpmaxsd(dst, src1, src2, vlen_enc); 978 } else { 979 assert(elem_bt == T_LONG, "required"); 980 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 981 vpmaxsq(dst, src1, src2, vlen_enc); 982 } else { 983 assert_different_registers(dst, src1, src2); 984 vpcmpgtq(dst, src1, src2, vlen_enc); 985 vblendvpd(dst, src2, src1, dst, vlen_enc); 986 } 987 } 988 } 989 } 990 991 // Float/Double min max 992 993 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 994 XMMRegister dst, XMMRegister a, XMMRegister b, 995 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 996 int vlen_enc) { 997 assert(UseAVX > 0, "required"); 998 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 999 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1000 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1001 assert_different_registers(a, tmp, atmp, btmp); 1002 assert_different_registers(b, tmp, atmp, btmp); 1003 1004 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1005 bool is_double_word = is_double_word_type(elem_bt); 1006 1007 /* Note on 'non-obvious' assembly sequence: 1008 * 1009 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1010 * and Java on how they handle floats: 1011 * a. 
-0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1012 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1013 * 1014 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1015 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1016 * (only useful when signs differ, noop otherwise) 1017 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1018 1019 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1020 * btmp = (b < +0.0) ? a : b 1021 * atmp = (b < +0.0) ? b : a 1022 * Tmp = Max_Float(atmp , btmp) 1023 * Res = (atmp == NaN) ? atmp : Tmp 1024 */ 1025 1026 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1027 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1028 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1029 XMMRegister mask; 1030 1031 if (!is_double_word && is_min) { 1032 mask = a; 1033 vblend = &MacroAssembler::vblendvps; 1034 vmaxmin = &MacroAssembler::vminps; 1035 vcmp = &MacroAssembler::vcmpps; 1036 } else if (!is_double_word && !is_min) { 1037 mask = b; 1038 vblend = &MacroAssembler::vblendvps; 1039 vmaxmin = &MacroAssembler::vmaxps; 1040 vcmp = &MacroAssembler::vcmpps; 1041 } else if (is_double_word && is_min) { 1042 mask = a; 1043 vblend = &MacroAssembler::vblendvpd; 1044 vmaxmin = &MacroAssembler::vminpd; 1045 vcmp = &MacroAssembler::vcmppd; 1046 } else { 1047 assert(is_double_word && !is_min, "sanity"); 1048 mask = b; 1049 vblend = &MacroAssembler::vblendvpd; 1050 vmaxmin = &MacroAssembler::vmaxpd; 1051 vcmp = &MacroAssembler::vcmppd; 1052 } 1053 1054 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1055 XMMRegister maxmin, scratch; 1056 if (dst == btmp) { 1057 maxmin = btmp; 1058 scratch = tmp; 1059 } else { 1060 maxmin = tmp; 1061 scratch = btmp; 1062 } 1063 1064 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1065 if (precompute_mask && !is_double_word) { 1066 vpsrad(tmp, mask, 32, vlen_enc); 1067 mask = tmp; 1068 } else if (precompute_mask && is_double_word) { 1069 vpxor(tmp, tmp, tmp, vlen_enc); 1070 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1071 mask = tmp; 1072 } 1073 1074 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1075 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1076 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1077 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1078 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1079 } 1080 1081 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1082 XMMRegister dst, XMMRegister a, XMMRegister b, 1083 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1084 int vlen_enc) { 1085 assert(UseAVX > 2, "required"); 1086 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1087 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1088 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1089 assert_different_registers(dst, a, atmp, btmp); 1090 assert_different_registers(dst, b, atmp, btmp); 1091 1092 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1093 bool is_double_word = is_double_word_type(elem_bt); 1094 bool merge = true; 1095 1096 if (!is_double_word && is_min) { 1097 evpmovd2m(ktmp, a, vlen_enc); 
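    // evpmovd2m copies the sign bit of each 32-bit lane of 'a' into ktmp, so
    // set mask bits mark the lanes where 'a' is negative (including -0.0);
    // the two blends below use that mask to bias negative values toward the
    // operand that vminps returns when its inputs compare equal.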
1098 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1099 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1100 vminps(dst, atmp, btmp, vlen_enc); 1101 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1102 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1103 } else if (!is_double_word && !is_min) { 1104 evpmovd2m(ktmp, b, vlen_enc); 1105 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1106 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1107 vmaxps(dst, atmp, btmp, vlen_enc); 1108 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1109 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1110 } else if (is_double_word && is_min) { 1111 evpmovq2m(ktmp, a, vlen_enc); 1112 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1113 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1114 vminpd(dst, atmp, btmp, vlen_enc); 1115 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1116 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1117 } else { 1118 assert(is_double_word && !is_min, "sanity"); 1119 evpmovq2m(ktmp, b, vlen_enc); 1120 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1121 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1122 vmaxpd(dst, atmp, btmp, vlen_enc); 1123 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1124 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1125 } 1126 } 1127 1128 // Float/Double signum 1129 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1130 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1131 1132 Label DONE_LABEL; 1133 1134 if (opcode == Op_SignumF) { 1135 ucomiss(dst, zero); 1136 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1137 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1138 movflt(dst, one); 1139 jcc(Assembler::above, DONE_LABEL); 1140 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1141 } else if (opcode == Op_SignumD) { 1142 ucomisd(dst, zero); 1143 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1144 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1145 movdbl(dst, one); 1146 jcc(Assembler::above, DONE_LABEL); 1147 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1148 } 1149 1150 bind(DONE_LABEL); 1151 } 1152 1153 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1154 if (sign) { 1155 pmovsxbw(dst, src); 1156 } else { 1157 pmovzxbw(dst, src); 1158 } 1159 } 1160 1161 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1162 if (sign) { 1163 vpmovsxbw(dst, src, vector_len); 1164 } else { 1165 vpmovzxbw(dst, src, vector_len); 1166 } 1167 } 1168 1169 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1170 if (sign) { 1171 vpmovsxbd(dst, src, vector_len); 1172 } else { 1173 vpmovzxbd(dst, src, vector_len); 1174 } 1175 } 1176 1177 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1178 if (sign) { 1179 vpmovsxwd(dst, src, vector_len); 1180 } else { 1181 vpmovzxwd(dst, src, vector_len); 1182 } 1183 } 1184 1185 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1186 int shift, int vector_len) { 1187 if (opcode == Op_RotateLeftV) { 1188 if (etype == T_INT) { 1189 evprold(dst, src, shift, 
vector_len); 1190 } else { 1191 assert(etype == T_LONG, "expected type T_LONG"); 1192 evprolq(dst, src, shift, vector_len); 1193 } 1194 } else { 1195 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1196 if (etype == T_INT) { 1197 evprord(dst, src, shift, vector_len); 1198 } else { 1199 assert(etype == T_LONG, "expected type T_LONG"); 1200 evprorq(dst, src, shift, vector_len); 1201 } 1202 } 1203 } 1204 1205 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1206 XMMRegister shift, int vector_len) { 1207 if (opcode == Op_RotateLeftV) { 1208 if (etype == T_INT) { 1209 evprolvd(dst, src, shift, vector_len); 1210 } else { 1211 assert(etype == T_LONG, "expected type T_LONG"); 1212 evprolvq(dst, src, shift, vector_len); 1213 } 1214 } else { 1215 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1216 if (etype == T_INT) { 1217 evprorvd(dst, src, shift, vector_len); 1218 } else { 1219 assert(etype == T_LONG, "expected type T_LONG"); 1220 evprorvq(dst, src, shift, vector_len); 1221 } 1222 } 1223 } 1224 1225 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1226 if (opcode == Op_RShiftVI) { 1227 psrad(dst, shift); 1228 } else if (opcode == Op_LShiftVI) { 1229 pslld(dst, shift); 1230 } else { 1231 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1232 psrld(dst, shift); 1233 } 1234 } 1235 1236 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1237 switch (opcode) { 1238 case Op_RShiftVI: psrad(dst, shift); break; 1239 case Op_LShiftVI: pslld(dst, shift); break; 1240 case Op_URShiftVI: psrld(dst, shift); break; 1241 1242 default: assert(false, "%s", NodeClassNames[opcode]); 1243 } 1244 } 1245 1246 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1247 if (opcode == Op_RShiftVI) { 1248 vpsrad(dst, nds, shift, vector_len); 1249 } else if (opcode == Op_LShiftVI) { 1250 vpslld(dst, nds, shift, vector_len); 1251 } else { 1252 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1253 vpsrld(dst, nds, shift, vector_len); 1254 } 1255 } 1256 1257 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1258 switch (opcode) { 1259 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1260 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1261 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1262 1263 default: assert(false, "%s", NodeClassNames[opcode]); 1264 } 1265 } 1266 1267 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1268 switch (opcode) { 1269 case Op_RShiftVB: // fall-through 1270 case Op_RShiftVS: psraw(dst, shift); break; 1271 1272 case Op_LShiftVB: // fall-through 1273 case Op_LShiftVS: psllw(dst, shift); break; 1274 1275 case Op_URShiftVS: // fall-through 1276 case Op_URShiftVB: psrlw(dst, shift); break; 1277 1278 default: assert(false, "%s", NodeClassNames[opcode]); 1279 } 1280 } 1281 1282 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1283 switch (opcode) { 1284 case Op_RShiftVB: // fall-through 1285 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1286 1287 case Op_LShiftVB: // fall-through 1288 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1289 1290 case Op_URShiftVS: // fall-through 1291 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 
1292 1293 default: assert(false, "%s", NodeClassNames[opcode]); 1294 } 1295 } 1296 1297 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1298 switch (opcode) { 1299 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1300 case Op_LShiftVL: psllq(dst, shift); break; 1301 case Op_URShiftVL: psrlq(dst, shift); break; 1302 1303 default: assert(false, "%s", NodeClassNames[opcode]); 1304 } 1305 } 1306 1307 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1308 if (opcode == Op_RShiftVL) { 1309 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1310 } else if (opcode == Op_LShiftVL) { 1311 psllq(dst, shift); 1312 } else { 1313 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1314 psrlq(dst, shift); 1315 } 1316 } 1317 1318 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1319 switch (opcode) { 1320 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1321 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1322 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1323 1324 default: assert(false, "%s", NodeClassNames[opcode]); 1325 } 1326 } 1327 1328 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1329 if (opcode == Op_RShiftVL) { 1330 evpsraq(dst, nds, shift, vector_len); 1331 } else if (opcode == Op_LShiftVL) { 1332 vpsllq(dst, nds, shift, vector_len); 1333 } else { 1334 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1335 vpsrlq(dst, nds, shift, vector_len); 1336 } 1337 } 1338 1339 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1340 switch (opcode) { 1341 case Op_RShiftVB: // fall-through 1342 case Op_RShiftVS: // fall-through 1343 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1344 1345 case Op_LShiftVB: // fall-through 1346 case Op_LShiftVS: // fall-through 1347 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1348 1349 case Op_URShiftVB: // fall-through 1350 case Op_URShiftVS: // fall-through 1351 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1352 1353 default: assert(false, "%s", NodeClassNames[opcode]); 1354 } 1355 } 1356 1357 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1358 switch (opcode) { 1359 case Op_RShiftVB: // fall-through 1360 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1361 1362 case Op_LShiftVB: // fall-through 1363 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1364 1365 case Op_URShiftVB: // fall-through 1366 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1367 1368 default: assert(false, "%s", NodeClassNames[opcode]); 1369 } 1370 } 1371 1372 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1373 assert(UseAVX >= 2, "required"); 1374 switch (opcode) { 1375 case Op_RShiftVL: { 1376 if (UseAVX > 2) { 1377 assert(tmp == xnoreg, "not used"); 1378 if (!VM_Version::supports_avx512vl()) { 1379 vlen_enc = Assembler::AVX_512bit; 1380 } 1381 evpsravq(dst, src, shift, vlen_enc); 1382 } else { 1383 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1384 vpsrlvq(dst, src, shift, vlen_enc); 1385 vpsrlvq(tmp, tmp, shift, vlen_enc); 1386 vpxor(dst, dst, tmp, 
vlen_enc); 1387 vpsubq(dst, dst, tmp, vlen_enc); 1388 } 1389 break; 1390 } 1391 case Op_LShiftVL: { 1392 assert(tmp == xnoreg, "not used"); 1393 vpsllvq(dst, src, shift, vlen_enc); 1394 break; 1395 } 1396 case Op_URShiftVL: { 1397 assert(tmp == xnoreg, "not used"); 1398 vpsrlvq(dst, src, shift, vlen_enc); 1399 break; 1400 } 1401 default: assert(false, "%s", NodeClassNames[opcode]); 1402 } 1403 } 1404 1405 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1406 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1407 assert(opcode == Op_LShiftVB || 1408 opcode == Op_RShiftVB || 1409 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1410 bool sign = (opcode != Op_URShiftVB); 1411 assert(vector_len == 0, "required"); 1412 vextendbd(sign, dst, src, 1); 1413 vpmovzxbd(vtmp, shift, 1); 1414 varshiftd(opcode, dst, dst, vtmp, 1); 1415 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1416 vextracti128_high(vtmp, dst); 1417 vpackusdw(dst, dst, vtmp, 0); 1418 } 1419 1420 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1421 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1422 assert(opcode == Op_LShiftVB || 1423 opcode == Op_RShiftVB || 1424 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1425 bool sign = (opcode != Op_URShiftVB); 1426 int ext_vector_len = vector_len + 1; 1427 vextendbw(sign, dst, src, ext_vector_len); 1428 vpmovzxbw(vtmp, shift, ext_vector_len); 1429 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1430 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1431 if (vector_len == 0) { 1432 vextracti128_high(vtmp, dst); 1433 vpackuswb(dst, dst, vtmp, vector_len); 1434 } else { 1435 vextracti64x4_high(vtmp, dst); 1436 vpackuswb(dst, dst, vtmp, vector_len); 1437 vpermq(dst, dst, 0xD8, vector_len); 1438 } 1439 } 1440 1441 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1442 switch(typ) { 1443 case T_BYTE: 1444 pinsrb(dst, val, idx); 1445 break; 1446 case T_SHORT: 1447 pinsrw(dst, val, idx); 1448 break; 1449 case T_INT: 1450 pinsrd(dst, val, idx); 1451 break; 1452 case T_LONG: 1453 pinsrq(dst, val, idx); 1454 break; 1455 default: 1456 assert(false,"Should not reach here."); 1457 break; 1458 } 1459 } 1460 1461 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1462 switch(typ) { 1463 case T_BYTE: 1464 vpinsrb(dst, src, val, idx); 1465 break; 1466 case T_SHORT: 1467 vpinsrw(dst, src, val, idx); 1468 break; 1469 case T_INT: 1470 vpinsrd(dst, src, val, idx); 1471 break; 1472 case T_LONG: 1473 vpinsrq(dst, src, val, idx); 1474 break; 1475 default: 1476 assert(false,"Should not reach here."); 1477 break; 1478 } 1479 } 1480 1481 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1482 XMMRegister dst, Register base, 1483 Register idx_base, 1484 Register offset, Register mask, 1485 Register mask_idx, Register rtmp, 1486 int vlen_enc) { 1487 vpxor(dst, dst, dst, vlen_enc); 1488 if (elem_bt == T_SHORT) { 1489 for (int i = 0; i < 4; i++) { 1490 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1491 Label skip_load; 1492 btq(mask, mask_idx); 1493 jccb(Assembler::carryClear, skip_load); 1494 movl(rtmp, Address(idx_base, i * 4)); 1495 if (offset != noreg) { 1496 addl(rtmp, offset); 1497 } 1498 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1499 bind(skip_load); 1500 incq(mask_idx); 1501 } 1502 } else { 1503 assert(elem_bt == T_BYTE, ""); 1504 for (int i = 0; i < 8; i++) { 1505 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1506 Label skip_load; 1507 btq(mask, mask_idx); 1508 jccb(Assembler::carryClear, skip_load); 1509 movl(rtmp, Address(idx_base, i * 4)); 1510 if (offset != noreg) { 1511 addl(rtmp, offset); 1512 } 1513 pinsrb(dst, Address(base, rtmp), i); 1514 bind(skip_load); 1515 incq(mask_idx); 1516 } 1517 } 1518 } 1519 1520 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1521 Register base, Register idx_base, 1522 Register offset, Register rtmp, 1523 int vlen_enc) { 1524 vpxor(dst, dst, dst, vlen_enc); 1525 if (elem_bt == T_SHORT) { 1526 for (int i = 0; i < 4; i++) { 1527 // dst[i] = src[offset + idx_base[i]] 1528 movl(rtmp, Address(idx_base, i * 4)); 1529 if (offset != noreg) { 1530 addl(rtmp, offset); 1531 } 1532 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1533 } 1534 } else { 1535 assert(elem_bt == T_BYTE, ""); 1536 for (int i = 0; i < 8; i++) { 1537 // dst[i] = src[offset + idx_base[i]] 1538 movl(rtmp, Address(idx_base, i * 4)); 1539 if (offset != noreg) { 1540 addl(rtmp, offset); 1541 } 1542 pinsrb(dst, Address(base, rtmp), i); 1543 } 1544 } 1545 } 1546 1547 /* 1548 * Gather using hybrid algorithm, first partially unroll scalar loop 1549 * to accumulate values from gather indices into a quad-word(64bit) slice. 1550 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1551 * permutation to place the slice into appropriate vector lane 1552 * locations in destination vector. Following pseudo code describes the 1553 * algorithm in detail: 1554 * 1555 * DST_VEC = ZERO_VEC 1556 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1557 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1558 * FOREACH_ITER: 1559 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1560 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1561 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1562 * PERM_INDEX = PERM_INDEX - TWO_VEC 1563 * 1564 * With each iteration, doubleword permute indices (0,1) corresponding 1565 * to gathered quadword gets right shifted by two lane positions. 
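 *
 * Illustrative walk-through (a sketch only, not part of the emitted code):
 * for T_SHORT with vector_len == 16, each iteration gathers four 16-bit
 * values into one 64-bit slice. The first iteration's PERM_INDEX is the
 * identity, so that slice is placed in doublewords {0,1} of the permuted
 * vector; after PERM_INDEX is decremented by TWO_VEC the next slice lands
 * in doublewords {2,3}, and so on until all vector_len elements have been
 * accumulated into DST_VEC.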
1566 * 1567 */ 1568 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1569 Register base, Register idx_base, 1570 Register offset, Register mask, 1571 XMMRegister xtmp1, XMMRegister xtmp2, 1572 XMMRegister temp_dst, Register rtmp, 1573 Register mask_idx, Register length, 1574 int vector_len, int vlen_enc) { 1575 Label GATHER8_LOOP; 1576 assert(is_subword_type(elem_ty), ""); 1577 movl(length, vector_len); 1578 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1579 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1580 vallones(xtmp2, vlen_enc); 1581 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1582 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1583 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1584 1585 bind(GATHER8_LOOP); 1586 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1587 if (mask == noreg) { 1588 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1589 } else { 1590 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1591 } 1592 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1593 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1594 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1595 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1596 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1597 vpor(dst, dst, temp_dst, vlen_enc); 1598 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1599 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1600 jcc(Assembler::notEqual, GATHER8_LOOP); 1601 } 1602 1603 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1604 switch(typ) { 1605 case T_INT: 1606 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1607 break; 1608 case T_FLOAT: 1609 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1610 break; 1611 case T_LONG: 1612 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1613 break; 1614 case T_DOUBLE: 1615 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1616 break; 1617 default: 1618 assert(false,"Should not reach here."); 1619 break; 1620 } 1621 } 1622 1623 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1624 switch(typ) { 1625 case T_INT: 1626 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1627 break; 1628 case T_FLOAT: 1629 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1630 break; 1631 case T_LONG: 1632 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1633 break; 1634 case T_DOUBLE: 1635 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1636 break; 1637 default: 1638 assert(false,"Should not reach here."); 1639 break; 1640 } 1641 } 1642 1643 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1644 switch(typ) { 1645 case T_INT: 1646 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1647 break; 1648 case T_FLOAT: 1649 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1650 break; 1651 case T_LONG: 1652 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1653 break; 1654 case T_DOUBLE: 1655 
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1656 break; 1657 default: 1658 assert(false,"Should not reach here."); 1659 break; 1660 } 1661 } 1662 1663 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1664 if (vlen_in_bytes <= 16) { 1665 pxor (dst, dst); 1666 psubb(dst, src); 1667 switch (elem_bt) { 1668 case T_BYTE: /* nothing to do */ break; 1669 case T_SHORT: pmovsxbw(dst, dst); break; 1670 case T_INT: pmovsxbd(dst, dst); break; 1671 case T_FLOAT: pmovsxbd(dst, dst); break; 1672 case T_LONG: pmovsxbq(dst, dst); break; 1673 case T_DOUBLE: pmovsxbq(dst, dst); break; 1674 1675 default: assert(false, "%s", type2name(elem_bt)); 1676 } 1677 } else { 1678 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1679 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1680 1681 vpxor (dst, dst, dst, vlen_enc); 1682 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1683 1684 switch (elem_bt) { 1685 case T_BYTE: /* nothing to do */ break; 1686 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1687 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1688 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1689 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1690 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1691 1692 default: assert(false, "%s", type2name(elem_bt)); 1693 } 1694 } 1695 } 1696 1697 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1698 if (novlbwdq) { 1699 vpmovsxbd(xtmp, src, vlen_enc); 1700 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1701 Assembler::eq, true, vlen_enc, noreg); 1702 } else { 1703 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1704 vpsubb(xtmp, xtmp, src, vlen_enc); 1705 evpmovb2m(dst, xtmp, vlen_enc); 1706 } 1707 } 1708 1709 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1710 if (is_integral_type(bt)) { 1711 switch (vlen_in_bytes) { 1712 case 4: movdl(dst, src); break; 1713 case 8: movq(dst, src); break; 1714 case 16: movdqu(dst, src); break; 1715 case 32: vmovdqu(dst, src); break; 1716 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1717 default: ShouldNotReachHere(); 1718 } 1719 } else { 1720 switch (vlen_in_bytes) { 1721 case 4: movflt(dst, src); break; 1722 case 8: movdbl(dst, src); break; 1723 case 16: movups(dst, src); break; 1724 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1725 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1726 default: ShouldNotReachHere(); 1727 } 1728 } 1729 } 1730 1731 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1732 assert(rscratch != noreg || always_reachable(src), "missing"); 1733 1734 if (reachable(src)) { 1735 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1736 } else { 1737 lea(rscratch, src); 1738 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1739 } 1740 } 1741 1742 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1743 int vlen_enc = vector_length_encoding(vlen); 1744 if (VM_Version::supports_avx()) { 1745 if (bt == T_LONG) { 1746 if (VM_Version::supports_avx2()) { 1747 vpbroadcastq(dst, src, vlen_enc); 1748 } else { 1749 vmovddup(dst, src, vlen_enc); 1750 } 1751 } else if (bt == T_DOUBLE) { 1752 if (vlen_enc != 
Assembler::AVX_128bit) { 1753 vbroadcastsd(dst, src, vlen_enc, noreg); 1754 } else { 1755 vmovddup(dst, src, vlen_enc); 1756 } 1757 } else { 1758 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1759 vpbroadcastd(dst, src, vlen_enc); 1760 } else { 1761 vbroadcastss(dst, src, vlen_enc); 1762 } 1763 } 1764 } else if (VM_Version::supports_sse3()) { 1765 movddup(dst, src); 1766 } else { 1767 load_vector(bt, dst, src, vlen); 1768 } 1769 } 1770 1771 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1772 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1773 int offset = exact_log2(type2aelembytes(bt)) << 6; 1774 if (is_floating_point_type(bt)) { 1775 offset += 128; 1776 } 1777 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1778 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1779 } 1780 1781 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1782 1783 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1784 int vector_len = Assembler::AVX_128bit; 1785 1786 switch (opcode) { 1787 case Op_AndReductionV: pand(dst, src); break; 1788 case Op_OrReductionV: por (dst, src); break; 1789 case Op_XorReductionV: pxor(dst, src); break; 1790 case Op_MinReductionV: 1791 switch (typ) { 1792 case T_BYTE: pminsb(dst, src); break; 1793 case T_SHORT: pminsw(dst, src); break; 1794 case T_INT: pminsd(dst, src); break; 1795 case T_LONG: assert(UseAVX > 2, "required"); 1796 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1797 default: assert(false, "wrong type"); 1798 } 1799 break; 1800 case Op_MaxReductionV: 1801 switch (typ) { 1802 case T_BYTE: pmaxsb(dst, src); break; 1803 case T_SHORT: pmaxsw(dst, src); break; 1804 case T_INT: pmaxsd(dst, src); break; 1805 case T_LONG: assert(UseAVX > 2, "required"); 1806 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1807 default: assert(false, "wrong type"); 1808 } 1809 break; 1810 case Op_AddReductionVF: addss(dst, src); break; 1811 case Op_AddReductionVD: addsd(dst, src); break; 1812 case Op_AddReductionVI: 1813 switch (typ) { 1814 case T_BYTE: paddb(dst, src); break; 1815 case T_SHORT: paddw(dst, src); break; 1816 case T_INT: paddd(dst, src); break; 1817 default: assert(false, "wrong type"); 1818 } 1819 break; 1820 case Op_AddReductionVL: paddq(dst, src); break; 1821 case Op_MulReductionVF: mulss(dst, src); break; 1822 case Op_MulReductionVD: mulsd(dst, src); break; 1823 case Op_MulReductionVI: 1824 switch (typ) { 1825 case T_SHORT: pmullw(dst, src); break; 1826 case T_INT: pmulld(dst, src); break; 1827 default: assert(false, "wrong type"); 1828 } 1829 break; 1830 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1831 evpmullq(dst, dst, src, vector_len); break; 1832 default: assert(false, "wrong opcode"); 1833 } 1834 } 1835 1836 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1837 switch (opcode) { 1838 case Op_AddReductionVF: addps(dst, src); break; 1839 case Op_AddReductionVD: addpd(dst, src); break; 1840 case Op_MulReductionVF: mulps(dst, src); break; 1841 case Op_MulReductionVD: mulpd(dst, src); break; 1842 default: assert(false, "%s", NodeClassNames[opcode]); 1843 } 1844 } 1845 1846 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1847 int vector_len = Assembler::AVX_256bit; 1848 1849 switch (opcode) { 1850 
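    // Note: unlike the (mostly destructive) two-operand SSE forms used by
    // reduce_operation_128 above, the AVX encodings below are non-destructive
    // three-operand forms: dst = src1 OP src2.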
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1851 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1852 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1853 case Op_MinReductionV: 1854 switch (typ) { 1855 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1856 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1857 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1858 case T_LONG: assert(UseAVX > 2, "required"); 1859 vpminsq(dst, src1, src2, vector_len); break; 1860 default: assert(false, "wrong type"); 1861 } 1862 break; 1863 case Op_MaxReductionV: 1864 switch (typ) { 1865 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1866 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1867 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1868 case T_LONG: assert(UseAVX > 2, "required"); 1869 vpmaxsq(dst, src1, src2, vector_len); break; 1870 default: assert(false, "wrong type"); 1871 } 1872 break; 1873 case Op_AddReductionVI: 1874 switch (typ) { 1875 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1876 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1877 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1878 default: assert(false, "wrong type"); 1879 } 1880 break; 1881 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1882 case Op_MulReductionVI: 1883 switch (typ) { 1884 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1885 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1886 default: assert(false, "wrong type"); 1887 } 1888 break; 1889 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1890 default: assert(false, "wrong opcode"); 1891 } 1892 } 1893 1894 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1895 int vector_len = Assembler::AVX_256bit; 1896 1897 switch (opcode) { 1898 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1899 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1900 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1901 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1902 default: assert(false, "%s", NodeClassNames[opcode]); 1903 } 1904 } 1905 1906 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1907 XMMRegister dst, XMMRegister src, 1908 XMMRegister vtmp1, XMMRegister vtmp2) { 1909 switch (opcode) { 1910 case Op_AddReductionVF: 1911 case Op_MulReductionVF: 1912 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1913 break; 1914 1915 case Op_AddReductionVD: 1916 case Op_MulReductionVD: 1917 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1918 break; 1919 1920 default: assert(false, "wrong opcode"); 1921 } 1922 } 1923 1924 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1925 XMMRegister dst, XMMRegister src, 1926 XMMRegister vtmp1, XMMRegister vtmp2) { 1927 switch (opcode) { 1928 case Op_AddReductionVF: 1929 case Op_MulReductionVF: 1930 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1931 break; 1932 1933 case Op_AddReductionVD: 1934 case Op_MulReductionVD: 1935 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1936 break; 1937 1938 default: assert(false, "%s", NodeClassNames[opcode]); 1939 } 1940 } 1941 1942 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1943 Register dst, Register src1, XMMRegister src2, 1944 XMMRegister vtmp1, XMMRegister vtmp2) { 1945 switch (vlen) { 1946 case 8: reduce8B 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1947 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1948 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1949 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1950 1951 default: assert(false, "wrong vector length"); 1952 } 1953 } 1954 1955 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1956 Register dst, Register src1, XMMRegister src2, 1957 XMMRegister vtmp1, XMMRegister vtmp2) { 1958 switch (vlen) { 1959 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1960 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1961 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1962 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1963 1964 default: assert(false, "wrong vector length"); 1965 } 1966 } 1967 1968 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1969 Register dst, Register src1, XMMRegister src2, 1970 XMMRegister vtmp1, XMMRegister vtmp2) { 1971 switch (vlen) { 1972 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1973 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1974 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1975 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1976 1977 default: assert(false, "wrong vector length"); 1978 } 1979 } 1980 1981 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1982 Register dst, Register src1, XMMRegister src2, 1983 XMMRegister vtmp1, XMMRegister vtmp2) { 1984 switch (vlen) { 1985 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1986 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1987 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1988 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1989 1990 default: assert(false, "wrong vector length"); 1991 } 1992 } 1993 1994 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1995 Register dst, Register src1, XMMRegister src2, 1996 XMMRegister vtmp1, XMMRegister vtmp2) { 1997 switch (vlen) { 1998 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1999 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2000 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2001 2002 default: assert(false, "wrong vector length"); 2003 } 2004 } 2005 2006 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2007 switch (vlen) { 2008 case 2: 2009 assert(vtmp2 == xnoreg, ""); 2010 reduce2F(opcode, dst, src, vtmp1); 2011 break; 2012 case 4: 2013 assert(vtmp2 == xnoreg, ""); 2014 reduce4F(opcode, dst, src, vtmp1); 2015 break; 2016 case 8: 2017 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2018 break; 2019 case 16: 2020 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2021 break; 2022 default: assert(false, "wrong vector length"); 2023 } 2024 } 2025 2026 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2027 switch (vlen) { 2028 case 2: 2029 assert(vtmp2 == xnoreg, ""); 2030 reduce2D(opcode, dst, src, vtmp1); 2031 break; 2032 case 4: 2033 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2034 break; 2035 case 8: 2036 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2037 break; 2038 default: assert(false, "wrong vector length"); 2039 } 2040 } 2041 2042 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2043 switch (vlen) { 2044 case 2: 2045 assert(vtmp1 == xnoreg, ""); 2046 assert(vtmp2 == xnoreg, ""); 2047 unorderedReduce2F(opcode, dst, src); 2048 break; 2049 case 4: 2050 assert(vtmp2 == xnoreg, ""); 2051 unorderedReduce4F(opcode, dst, src, vtmp1); 2052 break; 2053 case 8: 2054 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2055 break; 2056 case 16: 2057 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2058 break; 2059 default: assert(false, "wrong vector length"); 2060 } 2061 } 2062 2063 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2064 switch (vlen) { 2065 case 2: 2066 assert(vtmp1 == xnoreg, ""); 2067 assert(vtmp2 == xnoreg, ""); 2068 unorderedReduce2D(opcode, dst, src); 2069 break; 2070 case 4: 2071 assert(vtmp2 == xnoreg, ""); 2072 unorderedReduce4D(opcode, dst, src, vtmp1); 2073 break; 2074 case 8: 2075 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2076 break; 2077 default: assert(false, "wrong vector length"); 2078 } 2079 } 2080 2081 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2082 if (opcode == Op_AddReductionVI) { 2083 if (vtmp1 != src2) { 2084 movdqu(vtmp1, src2); 2085 } 2086 phaddd(vtmp1, vtmp1); 2087 } else { 2088 pshufd(vtmp1, src2, 0x1); 2089 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2090 } 2091 movdl(vtmp2, src1); 2092 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2093 movdl(dst, vtmp1); 2094 } 2095 2096 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2097 if (opcode == Op_AddReductionVI) { 2098 if (vtmp1 != src2) { 2099 movdqu(vtmp1, src2); 2100 } 2101 phaddd(vtmp1, src2); 2102 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2103 } else { 2104 pshufd(vtmp2, src2, 0xE); 2105 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2106 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 if (opcode == Op_AddReductionVI) { 2112 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2113 vextracti128_high(vtmp2, vtmp1); 2114 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2115 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2116 } else { 2117 vextracti128_high(vtmp1, src2); 2118 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2119 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2120 } 2121 } 2122 2123 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2124 vextracti64x4_high(vtmp2, src2); 2125 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2126 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2127 } 2128 2129 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2130 pshufd(vtmp2, src2, 0x1); 2131 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2132 movdqu(vtmp1, vtmp2); 2133 psrldq(vtmp1, 2); 2134 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2135 movdqu(vtmp2, vtmp1); 2136 psrldq(vtmp2, 1); 2137 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2138 movdl(vtmp2, src1); 2139 pmovsxbd(vtmp1, vtmp1); 2140 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2141 pextrb(dst, vtmp1, 0x0); 2142 movsbl(dst, dst); 2143 
} 2144 2145 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2146 pshufd(vtmp1, src2, 0xE); 2147 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2148 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2149 } 2150 2151 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2152 vextracti128_high(vtmp2, src2); 2153 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2154 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2155 } 2156 2157 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2158 vextracti64x4_high(vtmp1, src2); 2159 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2160 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2161 } 2162 2163 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2164 pmovsxbw(vtmp2, src2); 2165 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2166 } 2167 2168 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2169 if (UseAVX > 1) { 2170 int vector_len = Assembler::AVX_256bit; 2171 vpmovsxbw(vtmp1, src2, vector_len); 2172 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2173 } else { 2174 pmovsxbw(vtmp2, src2); 2175 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2176 pshufd(vtmp2, src2, 0x1); 2177 pmovsxbw(vtmp2, src2); 2178 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2179 } 2180 } 2181 2182 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2183 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2184 int vector_len = Assembler::AVX_512bit; 2185 vpmovsxbw(vtmp1, src2, vector_len); 2186 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2187 } else { 2188 assert(UseAVX >= 2,"Should not reach here."); 2189 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2190 vextracti128_high(vtmp2, src2); 2191 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2192 } 2193 } 2194 2195 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2196 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2197 vextracti64x4_high(vtmp2, src2); 2198 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2199 } 2200 2201 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2202 if (opcode == Op_AddReductionVI) { 2203 if (vtmp1 != src2) { 2204 movdqu(vtmp1, src2); 2205 } 2206 phaddw(vtmp1, vtmp1); 2207 phaddw(vtmp1, vtmp1); 2208 } else { 2209 pshufd(vtmp2, src2, 0x1); 2210 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2211 movdqu(vtmp1, vtmp2); 2212 psrldq(vtmp1, 2); 2213 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2214 } 2215 movdl(vtmp2, src1); 2216 pmovsxwd(vtmp1, vtmp1); 2217 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2218 pextrw(dst, vtmp1, 0x0); 2219 movswl(dst, dst); 2220 } 2221 2222 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2223 if (opcode == Op_AddReductionVI) { 2224 if (vtmp1 != src2) { 2225 movdqu(vtmp1, src2); 2226 } 2227 phaddw(vtmp1, src2); 2228 } else { 2229 pshufd(vtmp1, src2, 0xE); 2230 
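    // vtmp1 now holds the upper 64 bits of src2 in its low half; the combine
    // below folds the two halves of the vector together.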
reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2231 } 2232 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2233 } 2234 2235 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2236 if (opcode == Op_AddReductionVI) { 2237 int vector_len = Assembler::AVX_256bit; 2238 vphaddw(vtmp2, src2, src2, vector_len); 2239 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2240 } else { 2241 vextracti128_high(vtmp2, src2); 2242 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2243 } 2244 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2245 } 2246 2247 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2248 int vector_len = Assembler::AVX_256bit; 2249 vextracti64x4_high(vtmp1, src2); 2250 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2251 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2252 } 2253 2254 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2255 pshufd(vtmp2, src2, 0xE); 2256 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2257 movdq(vtmp1, src1); 2258 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2259 movdq(dst, vtmp1); 2260 } 2261 2262 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2263 vextracti128_high(vtmp1, src2); 2264 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2265 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2266 } 2267 2268 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2269 vextracti64x4_high(vtmp2, src2); 2270 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2271 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2272 } 2273 2274 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2275 mov64(temp, -1L); 2276 bzhiq(temp, temp, len); 2277 kmovql(dst, temp); 2278 } 2279 2280 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2281 reduce_operation_128(T_FLOAT, opcode, dst, src); 2282 pshufd(vtmp, src, 0x1); 2283 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2284 } 2285 2286 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2287 reduce2F(opcode, dst, src, vtmp); 2288 pshufd(vtmp, src, 0x2); 2289 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2290 pshufd(vtmp, src, 0x3); 2291 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2292 } 2293 2294 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2295 reduce4F(opcode, dst, src, vtmp2); 2296 vextractf128_high(vtmp2, src); 2297 reduce4F(opcode, dst, vtmp2, vtmp1); 2298 } 2299 2300 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2301 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2302 vextracti64x4_high(vtmp1, src); 2303 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2304 } 2305 2306 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2307 pshufd(dst, src, 0x1); 2308 reduce_operation_128(T_FLOAT, opcode, dst, src); 2309 } 2310 2311 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2312 pshufd(vtmp, src, 0xE); 2313 
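  // vtmp now holds the upper two floats of src in its low half; the fold
  // below combines them pairwise with the lower two floats.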
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2314 unorderedReduce2F(opcode, dst, vtmp); 2315 } 2316 2317 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2318 vextractf128_high(vtmp1, src); 2319 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2320 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2321 } 2322 2323 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2324 vextractf64x4_high(vtmp2, src); 2325 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2326 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2327 } 2328 2329 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2330 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2331 pshufd(vtmp, src, 0xE); 2332 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2333 } 2334 2335 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2336 reduce2D(opcode, dst, src, vtmp2); 2337 vextractf128_high(vtmp2, src); 2338 reduce2D(opcode, dst, vtmp2, vtmp1); 2339 } 2340 2341 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2342 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2343 vextracti64x4_high(vtmp1, src); 2344 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2345 } 2346 2347 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2348 pshufd(dst, src, 0xE); 2349 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2350 } 2351 2352 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2353 vextractf128_high(vtmp, src); 2354 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2355 unorderedReduce2D(opcode, dst, vtmp); 2356 } 2357 2358 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2359 vextractf64x4_high(vtmp2, src); 2360 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2361 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2362 } 2363 2364 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2365 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2366 } 2367 2368 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2369 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2370 } 2371 2372 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2373 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2374 } 2375 2376 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2377 int vec_enc) { 2378 switch(elem_bt) { 2379 case T_INT: 2380 case T_FLOAT: 2381 vmaskmovps(dst, src, mask, vec_enc); 2382 break; 2383 case T_LONG: 2384 case T_DOUBLE: 2385 vmaskmovpd(dst, src, mask, vec_enc); 2386 break; 2387 default: 2388 fatal("Unsupported type %s", type2name(elem_bt)); 2389 break; 2390 } 2391 } 2392 2393 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2394 int vec_enc) { 2395 switch(elem_bt) { 2396 case T_INT: 2397 case T_FLOAT: 2398 
vmaskmovps(dst, src, mask, vec_enc); 2399 break; 2400 case T_LONG: 2401 case T_DOUBLE: 2402 vmaskmovpd(dst, src, mask, vec_enc); 2403 break; 2404 default: 2405 fatal("Unsupported type %s", type2name(elem_bt)); 2406 break; 2407 } 2408 } 2409 2410 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2411 XMMRegister dst, XMMRegister src, 2412 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2413 XMMRegister xmm_0, XMMRegister xmm_1) { 2414 const int permconst[] = {1, 14}; 2415 XMMRegister wsrc = src; 2416 XMMRegister wdst = xmm_0; 2417 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2418 2419 int vlen_enc = Assembler::AVX_128bit; 2420 if (vlen == 16) { 2421 vlen_enc = Assembler::AVX_256bit; 2422 } 2423 2424 for (int i = log2(vlen) - 1; i >=0; i--) { 2425 if (i == 0 && !is_dst_valid) { 2426 wdst = dst; 2427 } 2428 if (i == 3) { 2429 vextracti64x4_high(wtmp, wsrc); 2430 } else if (i == 2) { 2431 vextracti128_high(wtmp, wsrc); 2432 } else { // i = [0,1] 2433 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2434 } 2435 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2436 wsrc = wdst; 2437 vlen_enc = Assembler::AVX_128bit; 2438 } 2439 if (is_dst_valid) { 2440 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2441 } 2442 } 2443 2444 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2445 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2446 XMMRegister xmm_0, XMMRegister xmm_1) { 2447 XMMRegister wsrc = src; 2448 XMMRegister wdst = xmm_0; 2449 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2450 int vlen_enc = Assembler::AVX_128bit; 2451 if (vlen == 8) { 2452 vlen_enc = Assembler::AVX_256bit; 2453 } 2454 for (int i = log2(vlen) - 1; i >=0; i--) { 2455 if (i == 0 && !is_dst_valid) { 2456 wdst = dst; 2457 } 2458 if (i == 1) { 2459 vextracti128_high(wtmp, wsrc); 2460 } else if (i == 2) { 2461 vextracti64x4_high(wtmp, wsrc); 2462 } else { 2463 assert(i == 0, "%d", i); 2464 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2465 } 2466 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2467 wsrc = wdst; 2468 vlen_enc = Assembler::AVX_128bit; 2469 } 2470 if (is_dst_valid) { 2471 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2472 } 2473 } 2474 2475 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2476 switch (bt) { 2477 case T_BYTE: pextrb(dst, src, idx); break; 2478 case T_SHORT: pextrw(dst, src, idx); break; 2479 case T_INT: pextrd(dst, src, idx); break; 2480 case T_LONG: pextrq(dst, src, idx); break; 2481 2482 default: 2483 assert(false,"Should not reach here."); 2484 break; 2485 } 2486 } 2487 2488 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2489 int esize = type2aelembytes(typ); 2490 int elem_per_lane = 16/esize; 2491 int lane = elemindex / elem_per_lane; 2492 int eindex = elemindex % elem_per_lane; 2493 2494 if (lane >= 2) { 2495 assert(UseAVX > 2, "required"); 2496 vextractf32x4(dst, src, lane & 3); 2497 return dst; 2498 } else if (lane > 0) { 2499 assert(UseAVX > 0, "required"); 2500 vextractf128(dst, src, lane); 2501 return dst; 2502 } else { 2503 return src; 2504 } 2505 } 2506 2507 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2508 if (typ == T_BYTE) { 2509 movsbl(dst, dst); 2510 } else if (typ == T_SHORT) { 2511 movswl(dst, dst); 2512 } 2513 } 2514 2515 void 
C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2516 int esize = type2aelembytes(typ); 2517 int elem_per_lane = 16/esize; 2518 int eindex = elemindex % elem_per_lane; 2519 assert(is_integral_type(typ),"required"); 2520 2521 if (eindex == 0) { 2522 if (typ == T_LONG) { 2523 movq(dst, src); 2524 } else { 2525 movdl(dst, src); 2526 movsxl(typ, dst); 2527 } 2528 } else { 2529 extract(typ, dst, src, eindex); 2530 movsxl(typ, dst); 2531 } 2532 } 2533 2534 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2535 int esize = type2aelembytes(typ); 2536 int elem_per_lane = 16/esize; 2537 int eindex = elemindex % elem_per_lane; 2538 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2539 2540 if (eindex == 0) { 2541 movq(dst, src); 2542 } else { 2543 if (typ == T_FLOAT) { 2544 if (UseAVX == 0) { 2545 movdqu(dst, src); 2546 shufps(dst, dst, eindex); 2547 } else { 2548 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2549 } 2550 } else { 2551 if (UseAVX == 0) { 2552 movdqu(dst, src); 2553 psrldq(dst, eindex*esize); 2554 } else { 2555 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2556 } 2557 movq(dst, dst); 2558 } 2559 } 2560 // Zero upper bits 2561 if (typ == T_FLOAT) { 2562 if (UseAVX == 0) { 2563 assert(vtmp != xnoreg, "required."); 2564 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2565 pand(dst, vtmp); 2566 } else { 2567 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2568 } 2569 } 2570 } 2571 2572 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2573 switch(typ) { 2574 case T_BYTE: 2575 case T_BOOLEAN: 2576 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2577 break; 2578 case T_SHORT: 2579 case T_CHAR: 2580 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2581 break; 2582 case T_INT: 2583 case T_FLOAT: 2584 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2585 break; 2586 case T_LONG: 2587 case T_DOUBLE: 2588 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2589 break; 2590 default: 2591 assert(false,"Should not reach here."); 2592 break; 2593 } 2594 } 2595 2596 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2597 assert(rscratch != noreg || always_reachable(src2), "missing"); 2598 2599 switch(typ) { 2600 case T_BOOLEAN: 2601 case T_BYTE: 2602 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2603 break; 2604 case T_CHAR: 2605 case T_SHORT: 2606 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2607 break; 2608 case T_INT: 2609 case T_FLOAT: 2610 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2611 break; 2612 case T_LONG: 2613 case T_DOUBLE: 2614 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2615 break; 2616 default: 2617 assert(false,"Should not reach here."); 2618 break; 2619 } 2620 } 2621 2622 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2623 switch(typ) { 2624 case T_BYTE: 2625 
evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2626 break; 2627 case T_SHORT: 2628 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2629 break; 2630 case T_INT: 2631 case T_FLOAT: 2632 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2633 break; 2634 case T_LONG: 2635 case T_DOUBLE: 2636 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2637 break; 2638 default: 2639 assert(false,"Should not reach here."); 2640 break; 2641 } 2642 } 2643 2644 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2645 assert(vlen_in_bytes <= 32, ""); 2646 int esize = type2aelembytes(bt); 2647 if (vlen_in_bytes == 32) { 2648 assert(vtmp == xnoreg, "required."); 2649 if (esize >= 4) { 2650 vtestps(src1, src2, AVX_256bit); 2651 } else { 2652 vptest(src1, src2, AVX_256bit); 2653 } 2654 return; 2655 } 2656 if (vlen_in_bytes < 16) { 2657 // Duplicate the lower part to fill the whole register, 2658 // Don't need to do so for src2 2659 assert(vtmp != xnoreg, "required"); 2660 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2661 pshufd(vtmp, src1, shuffle_imm); 2662 } else { 2663 assert(vtmp == xnoreg, "required"); 2664 vtmp = src1; 2665 } 2666 if (esize >= 4 && VM_Version::supports_avx()) { 2667 vtestps(vtmp, src2, AVX_128bit); 2668 } else { 2669 ptest(vtmp, src2); 2670 } 2671 } 2672 2673 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2674 #ifdef ASSERT 2675 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2676 bool is_bw_supported = VM_Version::supports_avx512bw(); 2677 if (is_bw && !is_bw_supported) { 2678 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2679 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2680 "XMM register should be 0-15"); 2681 } 2682 #endif // ASSERT 2683 switch (elem_bt) { 2684 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2685 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2686 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2687 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2688 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2689 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2690 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2691 } 2692 } 2693 2694 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2695 assert(UseAVX >= 2, "required"); 2696 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2697 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2698 if ((UseAVX > 2) && 2699 (!is_bw || VM_Version::supports_avx512bw()) && 2700 (!is_vl || VM_Version::supports_avx512vl())) { 2701 switch (elem_bt) { 2702 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2703 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2704 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2705 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2706 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2707 } 2708 } else { 2709 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2710 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2711 switch (elem_bt) { 2712 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2713 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2714 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2715 
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2716 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2717 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2718 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2719 } 2720 } 2721 } 2722 2723 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2724 switch (to_elem_bt) { 2725 case T_SHORT: 2726 vpmovsxbw(dst, src, vlen_enc); 2727 break; 2728 case T_INT: 2729 vpmovsxbd(dst, src, vlen_enc); 2730 break; 2731 case T_FLOAT: 2732 vpmovsxbd(dst, src, vlen_enc); 2733 vcvtdq2ps(dst, dst, vlen_enc); 2734 break; 2735 case T_LONG: 2736 vpmovsxbq(dst, src, vlen_enc); 2737 break; 2738 case T_DOUBLE: { 2739 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2740 vpmovsxbd(dst, src, mid_vlen_enc); 2741 vcvtdq2pd(dst, dst, vlen_enc); 2742 break; 2743 } 2744 default: 2745 fatal("Unsupported type %s", type2name(to_elem_bt)); 2746 break; 2747 } 2748 } 2749 2750 //------------------------------------------------------------------------------------------- 2751 2752 // IndexOf for constant substrings with size >= 8 chars 2753 // which don't need to be loaded through stack. 2754 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2755 Register cnt1, Register cnt2, 2756 int int_cnt2, Register result, 2757 XMMRegister vec, Register tmp, 2758 int ae) { 2759 ShortBranchVerifier sbv(this); 2760 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2761 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2762 2763 // This method uses the pcmpestri instruction with bound registers 2764 // inputs: 2765 // xmm - substring 2766 // rax - substring length (elements count) 2767 // mem - scanned string 2768 // rdx - string length (elements count) 2769 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2770 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2771 // outputs: 2772 // rcx - matched index in string 2773 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2774 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2775 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2776 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2777 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2778 2779 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2780 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2781 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2782 2783 // Note, inline_string_indexOf() generates checks: 2784 // if (substr.count > string.count) return -1; 2785 // if (substr.count == 0) return 0; 2786 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2787 2788 // Load substring. 
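  // For the UL case the first eight Latin-1 substring bytes are zero-extended
  // to 16-bit chars (pmovzxbw) so they can be compared against the UTF-16
  // string; otherwise the first 16 bytes of the substring are loaded as-is.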
2789 if (ae == StrIntrinsicNode::UL) {
2790 pmovzxbw(vec, Address(str2, 0));
2791 } else {
2792 movdqu(vec, Address(str2, 0));
2793 }
2794 movl(cnt2, int_cnt2);
2795 movptr(result, str1); // string addr
2796
2797 if (int_cnt2 > stride) {
2798 jmpb(SCAN_TO_SUBSTR);
2799
2800 // Reload substr for rescan, this code
2801 // is executed only for large substrings (> 8 chars)
2802 bind(RELOAD_SUBSTR);
2803 if (ae == StrIntrinsicNode::UL) {
2804 pmovzxbw(vec, Address(str2, 0));
2805 } else {
2806 movdqu(vec, Address(str2, 0));
2807 }
2808 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2809
2810 bind(RELOAD_STR);
2811 // We came here after the beginning of the substring was
2812 // matched but the rest of it was not, so we need to search
2813 // again. Start from the next element after the previous match.
2814
2815 // cnt2 is number of substring remaining elements and
2816 // cnt1 is number of string remaining elements when cmp failed.
2817 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2818 subl(cnt1, cnt2);
2819 addl(cnt1, int_cnt2);
2820 movl(cnt2, int_cnt2); // Now restore cnt2
2821
2822 decrementl(cnt1); // Shift to next element
2823 cmpl(cnt1, cnt2);
2824 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2825
2826 addptr(result, (1<<scale1));
2827
2828 } // (int_cnt2 > 8)
2829
2830 // Scan string for start of substr in 16-byte vectors
2831 bind(SCAN_TO_SUBSTR);
2832 pcmpestri(vec, Address(result, 0), mode);
2833 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2834 subl(cnt1, stride);
2835 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2836 cmpl(cnt1, cnt2);
2837 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2838 addptr(result, 16);
2839 jmpb(SCAN_TO_SUBSTR);
2840
2841 // Found a potential substr
2842 bind(FOUND_CANDIDATE);
2843 // Matched whole vector if first element matched (tmp(rcx) == 0).
2844 if (int_cnt2 == stride) {
2845 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2846 } else { // int_cnt2 > 8
2847 jccb(Assembler::overflow, FOUND_SUBSTR);
2848 }
2849 // After pcmpestri tmp(rcx) contains matched element index
2850 // Compute start addr of substr
2851 lea(result, Address(result, tmp, scale1));
2852
2853 // Make sure string is still long enough
2854 subl(cnt1, tmp);
2855 cmpl(cnt1, cnt2);
2856 if (int_cnt2 == stride) {
2857 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2858 } else { // int_cnt2 > 8
2859 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2860 }
2861 // Left less than substring.
2862
2863 bind(RET_NOT_FOUND);
2864 movl(result, -1);
2865 jmp(EXIT);
2866
2867 if (int_cnt2 > stride) {
2868 // This code is optimized for the case when the whole substring
2869 // is matched if its head is matched.
2870 bind(MATCH_SUBSTR_HEAD);
2871 pcmpestri(vec, Address(result, 0), mode);
2872 // Reload only the string if it does not match
2873 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2874
2875 Label CONT_SCAN_SUBSTR;
2876 // Compare the rest of substring (> 8 chars).
2877 bind(FOUND_SUBSTR);
2878 // First 8 chars are already matched.
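      // Turn cnt2 into a negative count of the substring elements that still
      // have to be compared; it is advanced by 'stride' per iteration and the
      // scan below stops once it becomes non-negative.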
2879 negptr(cnt2); 2880 addptr(cnt2, stride); 2881 2882 bind(SCAN_SUBSTR); 2883 subl(cnt1, stride); 2884 cmpl(cnt2, -stride); // Do not read beyond substring 2885 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2886 // Back-up strings to avoid reading beyond substring: 2887 // cnt1 = cnt1 - cnt2 + 8 2888 addl(cnt1, cnt2); // cnt2 is negative 2889 addl(cnt1, stride); 2890 movl(cnt2, stride); negptr(cnt2); 2891 bind(CONT_SCAN_SUBSTR); 2892 if (int_cnt2 < (int)G) { 2893 int tail_off1 = int_cnt2<<scale1; 2894 int tail_off2 = int_cnt2<<scale2; 2895 if (ae == StrIntrinsicNode::UL) { 2896 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2897 } else { 2898 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2899 } 2900 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2901 } else { 2902 // calculate index in register to avoid integer overflow (int_cnt2*2) 2903 movl(tmp, int_cnt2); 2904 addptr(tmp, cnt2); 2905 if (ae == StrIntrinsicNode::UL) { 2906 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2907 } else { 2908 movdqu(vec, Address(str2, tmp, scale2, 0)); 2909 } 2910 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2911 } 2912 // Need to reload strings pointers if not matched whole vector 2913 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2914 addptr(cnt2, stride); 2915 jcc(Assembler::negative, SCAN_SUBSTR); 2916 // Fall through if found full substring 2917 2918 } // (int_cnt2 > 8) 2919 2920 bind(RET_FOUND); 2921 // Found result if we matched full small substring. 2922 // Compute substr offset 2923 subptr(result, str1); 2924 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2925 shrl(result, 1); // index 2926 } 2927 bind(EXIT); 2928 2929 } // string_indexofC8 2930 2931 // Small strings are loaded through stack if they cross page boundary. 2932 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2933 Register cnt1, Register cnt2, 2934 int int_cnt2, Register result, 2935 XMMRegister vec, Register tmp, 2936 int ae) { 2937 ShortBranchVerifier sbv(this); 2938 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2939 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2940 2941 // 2942 // int_cnt2 is length of small (< 8 chars) constant substring 2943 // or (-1) for non constant substring in which case its length 2944 // is in cnt2 register. 2945 // 2946 // Note, inline_string_indexOf() generates checks: 2947 // if (substr.count > string.count) return -1; 2948 // if (substr.count == 0) return 0; 2949 // 2950 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2951 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2952 // This method uses the pcmpestri instruction with bound registers 2953 // inputs: 2954 // xmm - substring 2955 // rax - substring length (elements count) 2956 // mem - scanned string 2957 // rdx - string length (elements count) 2958 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2959 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2960 // outputs: 2961 // rcx - matched index in string 2962 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2963 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2964 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2965 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2966 2967 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2968 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2969 FOUND_CANDIDATE; 2970 2971 { //======================================================== 2972 // We don't know where these strings are located 2973 // and we can't read beyond them. Load them through stack. 2974 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2975 2976 movptr(tmp, rsp); // save old SP 2977 2978 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2979 if (int_cnt2 == (1>>scale2)) { // One byte 2980 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2981 load_unsigned_byte(result, Address(str2, 0)); 2982 movdl(vec, result); // move 32 bits 2983 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2984 // Not enough header space in 32-bit VM: 12+3 = 15. 2985 movl(result, Address(str2, -1)); 2986 shrl(result, 8); 2987 movdl(vec, result); // move 32 bits 2988 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2989 load_unsigned_short(result, Address(str2, 0)); 2990 movdl(vec, result); // move 32 bits 2991 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2992 movdl(vec, Address(str2, 0)); // move 32 bits 2993 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2994 movq(vec, Address(str2, 0)); // move 64 bits 2995 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2996 // Array header size is 12 bytes in 32-bit VM 2997 // + 6 bytes for 3 chars == 18 bytes, 2998 // enough space to load vec and shift. 2999 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3000 if (ae == StrIntrinsicNode::UL) { 3001 int tail_off = int_cnt2-8; 3002 pmovzxbw(vec, Address(str2, tail_off)); 3003 psrldq(vec, -2*tail_off); 3004 } 3005 else { 3006 int tail_off = int_cnt2*(1<<scale2); 3007 movdqu(vec, Address(str2, tail_off-16)); 3008 psrldq(vec, 16-tail_off); 3009 } 3010 } 3011 } else { // not constant substring 3012 cmpl(cnt2, stride); 3013 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3014 3015 // We can read beyond string if srt+16 does not cross page boundary 3016 // since heaps are aligned and mapped by pages. 3017 assert(os::vm_page_size() < (int)G, "default page should be small"); 3018 movl(result, str2); // We need only low 32 bits 3019 andl(result, ((int)os::vm_page_size()-1)); 3020 cmpl(result, ((int)os::vm_page_size()-16)); 3021 jccb(Assembler::belowEqual, CHECK_STR); 3022 3023 // Move small strings to stack to allow load 16 bytes into vec. 3024 subptr(rsp, 16); 3025 int stk_offset = wordSize-(1<<scale2); 3026 push(cnt2); 3027 3028 bind(COPY_SUBSTR); 3029 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3030 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3031 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3032 } else if (ae == StrIntrinsicNode::UU) { 3033 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3034 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3035 } 3036 decrement(cnt2); 3037 jccb(Assembler::notZero, COPY_SUBSTR); 3038 3039 pop(cnt2); 3040 movptr(str2, rsp); // New substring address 3041 } // non constant 3042 3043 bind(CHECK_STR); 3044 cmpl(cnt1, stride); 3045 jccb(Assembler::aboveEqual, BIG_STRINGS); 3046 3047 // Check cross page boundary. 
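    // Same page-boundary trick as for the substring above: if
    // (str1 & (os::vm_page_size() - 1)) <= os::vm_page_size() - 16, a 16-byte
    // load from str1 cannot cross into an unmapped page, so the string does
    // not need to be copied to the stack.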
3048 movl(result, str1); // We need only low 32 bits 3049 andl(result, ((int)os::vm_page_size()-1)); 3050 cmpl(result, ((int)os::vm_page_size()-16)); 3051 jccb(Assembler::belowEqual, BIG_STRINGS); 3052 3053 subptr(rsp, 16); 3054 int stk_offset = -(1<<scale1); 3055 if (int_cnt2 < 0) { // not constant 3056 push(cnt2); 3057 stk_offset += wordSize; 3058 } 3059 movl(cnt2, cnt1); 3060 3061 bind(COPY_STR); 3062 if (ae == StrIntrinsicNode::LL) { 3063 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3064 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3065 } else { 3066 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3067 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3068 } 3069 decrement(cnt2); 3070 jccb(Assembler::notZero, COPY_STR); 3071 3072 if (int_cnt2 < 0) { // not constant 3073 pop(cnt2); 3074 } 3075 movptr(str1, rsp); // New string address 3076 3077 bind(BIG_STRINGS); 3078 // Load substring. 3079 if (int_cnt2 < 0) { // -1 3080 if (ae == StrIntrinsicNode::UL) { 3081 pmovzxbw(vec, Address(str2, 0)); 3082 } else { 3083 movdqu(vec, Address(str2, 0)); 3084 } 3085 push(cnt2); // substr count 3086 push(str2); // substr addr 3087 push(str1); // string addr 3088 } else { 3089 // Small (< 8 chars) constant substrings are loaded already. 3090 movl(cnt2, int_cnt2); 3091 } 3092 push(tmp); // original SP 3093 3094 } // Finished loading 3095 3096 //======================================================== 3097 // Start search 3098 // 3099 3100 movptr(result, str1); // string addr 3101 3102 if (int_cnt2 < 0) { // Only for non constant substring 3103 jmpb(SCAN_TO_SUBSTR); 3104 3105 // SP saved at sp+0 3106 // String saved at sp+1*wordSize 3107 // Substr saved at sp+2*wordSize 3108 // Substr count saved at sp+3*wordSize 3109 3110 // Reload substr for rescan, this code 3111 // is executed only for large substrings (> 8 chars) 3112 bind(RELOAD_SUBSTR); 3113 movptr(str2, Address(rsp, 2*wordSize)); 3114 movl(cnt2, Address(rsp, 3*wordSize)); 3115 if (ae == StrIntrinsicNode::UL) { 3116 pmovzxbw(vec, Address(str2, 0)); 3117 } else { 3118 movdqu(vec, Address(str2, 0)); 3119 } 3120 // We came here after the beginning of the substring was 3121 // matched but the rest of it was not, so we need to search 3122 // again. Start from the next element after the previous match. 3123 subptr(str1, result); // Restore counter 3124 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3125 shrl(str1, 1); 3126 } 3127 addl(cnt1, str1); 3128 decrementl(cnt1); // Shift to next element 3129 cmpl(cnt1, cnt2); 3130 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3131 3132 addptr(result, (1<<scale1)); 3133 } // non constant 3134 3135 // Scan string for start of substr in 16-byte vectors 3136 bind(SCAN_TO_SUBSTR); 3137 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3138 pcmpestri(vec, Address(result, 0), mode); 3139 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3140 subl(cnt1, stride); 3141 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3142 cmpl(cnt1, cnt2); 3143 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3144 addptr(result, 16); 3145 3146 bind(ADJUST_STR); 3147 cmpl(cnt1, stride); // Do not read beyond string 3148 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3149 // Back-up string to avoid reading beyond string.
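// Illustrative sketch (not part of the emitted code): when fewer than a full
// stride of elements remains, the scan window is pulled back so that the next
// 16-byte load ends exactly at the last element, roughly
//
//   result = result + cnt1 * (1 << scale1) - 16;   // what the lea below computes
//   cnt1   = stride;
//
// Re-scanning a few already-visited elements is harmless for a search.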
3150 lea(result, Address(result, cnt1, scale1, -16)); 3151 movl(cnt1, stride); 3152 jmpb(SCAN_TO_SUBSTR); 3153 3154 // Found a potential substr 3155 bind(FOUND_CANDIDATE); 3156 // After pcmpestri tmp(rcx) contains matched element index 3157 3158 // Make sure string is still long enough 3159 subl(cnt1, tmp); 3160 cmpl(cnt1, cnt2); 3161 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3162 // Left less than substring. 3163 3164 bind(RET_NOT_FOUND); 3165 movl(result, -1); 3166 jmp(CLEANUP); 3167 3168 bind(FOUND_SUBSTR); 3169 // Compute start addr of substr 3170 lea(result, Address(result, tmp, scale1)); 3171 if (int_cnt2 > 0) { // Constant substring 3172 // Repeat search for small substring (< 8 chars) 3173 // from new point without reloading substring. 3174 // Have to check that we don't read beyond string. 3175 cmpl(tmp, stride-int_cnt2); 3176 jccb(Assembler::greater, ADJUST_STR); 3177 // Fall through if matched whole substring. 3178 } else { // non constant 3179 assert(int_cnt2 == -1, "should be != 0"); 3180 3181 addl(tmp, cnt2); 3182 // Found result if we matched whole substring. 3183 cmpl(tmp, stride); 3184 jcc(Assembler::lessEqual, RET_FOUND); 3185 3186 // Repeat search for small substring (<= 8 chars) 3187 // from new point 'str1' without reloading substring. 3188 cmpl(cnt2, stride); 3189 // Have to check that we don't read beyond string. 3190 jccb(Assembler::lessEqual, ADJUST_STR); 3191 3192 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3193 // Compare the rest of the substring (> 8 chars). 3194 movptr(str1, result); 3195 3196 cmpl(tmp, cnt2); 3197 // First 8 chars are already matched. 3198 jccb(Assembler::equal, CHECK_NEXT); 3199 3200 bind(SCAN_SUBSTR); 3201 pcmpestri(vec, Address(str1, 0), mode); 3202 // Need to reload string pointers if not matched whole vector 3203 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3204 3205 bind(CHECK_NEXT); 3206 subl(cnt2, stride); 3207 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3208 addptr(str1, 16); 3209 if (ae == StrIntrinsicNode::UL) { 3210 addptr(str2, 8); 3211 } else { 3212 addptr(str2, 16); 3213 } 3214 subl(cnt1, stride); 3215 cmpl(cnt2, stride); // Do not read beyond substring 3216 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3217 // Back-up strings to avoid reading beyond substring.
3218 3219 if (ae == StrIntrinsicNode::UL) { 3220 lea(str2, Address(str2, cnt2, scale2, -8)); 3221 lea(str1, Address(str1, cnt2, scale1, -16)); 3222 } else { 3223 lea(str2, Address(str2, cnt2, scale2, -16)); 3224 lea(str1, Address(str1, cnt2, scale1, -16)); 3225 } 3226 subl(cnt1, cnt2); 3227 movl(cnt2, stride); 3228 addl(cnt1, stride); 3229 bind(CONT_SCAN_SUBSTR); 3230 if (ae == StrIntrinsicNode::UL) { 3231 pmovzxbw(vec, Address(str2, 0)); 3232 } else { 3233 movdqu(vec, Address(str2, 0)); 3234 } 3235 jmp(SCAN_SUBSTR); 3236 3237 bind(RET_FOUND_LONG); 3238 movptr(str1, Address(rsp, wordSize)); 3239 } // non constant 3240 3241 bind(RET_FOUND); 3242 // Compute substr offset 3243 subptr(result, str1); 3244 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3245 shrl(result, 1); // index 3246 } 3247 bind(CLEANUP); 3248 pop(rsp); // restore SP 3249 3250 } // string_indexof 3251 3252 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3253 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3254 ShortBranchVerifier sbv(this); 3255 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3256 3257 int stride = 8; 3258 3259 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3260 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3261 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3262 FOUND_SEQ_CHAR, DONE_LABEL; 3263 3264 movptr(result, str1); 3265 if (UseAVX >= 2) { 3266 cmpl(cnt1, stride); 3267 jcc(Assembler::less, SCAN_TO_CHAR); 3268 cmpl(cnt1, 2*stride); 3269 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3270 movdl(vec1, ch); 3271 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3272 vpxor(vec2, vec2); 3273 movl(tmp, cnt1); 3274 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3275 andl(cnt1,0x0000000F); //tail count (in chars) 3276 3277 bind(SCAN_TO_16_CHAR_LOOP); 3278 vmovdqu(vec3, Address(result, 0)); 3279 vpcmpeqw(vec3, vec3, vec1, 1); 3280 vptest(vec2, vec3); 3281 jcc(Assembler::carryClear, FOUND_CHAR); 3282 addptr(result, 32); 3283 subl(tmp, 2*stride); 3284 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3285 jmp(SCAN_TO_8_CHAR); 3286 bind(SCAN_TO_8_CHAR_INIT); 3287 movdl(vec1, ch); 3288 pshuflw(vec1, vec1, 0x00); 3289 pshufd(vec1, vec1, 0); 3290 pxor(vec2, vec2); 3291 } 3292 bind(SCAN_TO_8_CHAR); 3293 cmpl(cnt1, stride); 3294 jcc(Assembler::less, SCAN_TO_CHAR); 3295 if (UseAVX < 2) { 3296 movdl(vec1, ch); 3297 pshuflw(vec1, vec1, 0x00); 3298 pshufd(vec1, vec1, 0); 3299 pxor(vec2, vec2); 3300 } 3301 movl(tmp, cnt1); 3302 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3303 andl(cnt1,0x00000007); //tail count (in chars) 3304 3305 bind(SCAN_TO_8_CHAR_LOOP); 3306 movdqu(vec3, Address(result, 0)); 3307 pcmpeqw(vec3, vec1); 3308 ptest(vec2, vec3); 3309 jcc(Assembler::carryClear, FOUND_CHAR); 3310 addptr(result, 16); 3311 subl(tmp, stride); 3312 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3313 bind(SCAN_TO_CHAR); 3314 testl(cnt1, cnt1); 3315 jcc(Assembler::zero, RET_NOT_FOUND); 3316 bind(SCAN_TO_CHAR_LOOP); 3317 load_unsigned_short(tmp, Address(result, 0)); 3318 cmpl(ch, tmp); 3319 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3320 addptr(result, 2); 3321 subl(cnt1, 1); 3322 jccb(Assembler::zero, RET_NOT_FOUND); 3323 jmp(SCAN_TO_CHAR_LOOP); 3324 3325 bind(RET_NOT_FOUND); 3326 movl(result, -1); 3327 jmpb(DONE_LABEL); 3328 3329 bind(FOUND_CHAR); 3330 if (UseAVX >= 2) { 3331 vpmovmskb(tmp, vec3); 3332 } else { 3333 pmovmskb(tmp, vec3); 3334 } 3335 bsfl(ch, tmp); 3336 addptr(result, ch); 3337 3338 bind(FOUND_SEQ_CHAR); 3339 
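// Illustrative sketch (not part of the emitted code): at this point result
// holds the address of the matching UTF-16 element, so the returned index is
// just the byte distance from the start of the string divided by the element
// size, i.e.
//
//   index = (int)((match_addr - str1_base) / sizeof(jchar));
//
// which is what the subptr/shrl pair below computes.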
subptr(result, str1); 3340 shrl(result, 1); 3341 3342 bind(DONE_LABEL); 3343 } // string_indexof_char 3344 3345 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3346 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3347 ShortBranchVerifier sbv(this); 3348 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3349 3350 int stride = 16; 3351 3352 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3353 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3354 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3355 FOUND_SEQ_CHAR, DONE_LABEL; 3356 3357 movptr(result, str1); 3358 if (UseAVX >= 2) { 3359 cmpl(cnt1, stride); 3360 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3361 cmpl(cnt1, stride*2); 3362 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3363 movdl(vec1, ch); 3364 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3365 vpxor(vec2, vec2); 3366 movl(tmp, cnt1); 3367 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3368 andl(cnt1,0x0000001F); //tail count (in chars) 3369 3370 bind(SCAN_TO_32_CHAR_LOOP); 3371 vmovdqu(vec3, Address(result, 0)); 3372 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3373 vptest(vec2, vec3); 3374 jcc(Assembler::carryClear, FOUND_CHAR); 3375 addptr(result, 32); 3376 subl(tmp, stride*2); 3377 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3378 jmp(SCAN_TO_16_CHAR); 3379 3380 bind(SCAN_TO_16_CHAR_INIT); 3381 movdl(vec1, ch); 3382 pxor(vec2, vec2); 3383 pshufb(vec1, vec2); 3384 } 3385 3386 bind(SCAN_TO_16_CHAR); 3387 cmpl(cnt1, stride); 3388 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3389 if (UseAVX < 2) { 3390 movdl(vec1, ch); 3391 pxor(vec2, vec2); 3392 pshufb(vec1, vec2); 3393 } 3394 movl(tmp, cnt1); 3395 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3396 andl(cnt1,0x0000000F); //tail count (in bytes) 3397 3398 bind(SCAN_TO_16_CHAR_LOOP); 3399 movdqu(vec3, Address(result, 0)); 3400 pcmpeqb(vec3, vec1); 3401 ptest(vec2, vec3); 3402 jcc(Assembler::carryClear, FOUND_CHAR); 3403 addptr(result, 16); 3404 subl(tmp, stride); 3405 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
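// Illustrative scalar reference (not part of the emitted code) for what
// stringL_indexof_char computes; the 32-/16-byte vector loops above and the
// byte loop below are just widened versions of this:
//
//   static int indexof_latin1(const uint8_t* s, int len, uint8_t ch) {
//     for (int i = 0; i < len; i++) {
//       if (s[i] == ch) return i;
//     }
//     return -1;
//   }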
3406 3407 bind(SCAN_TO_CHAR_INIT); 3408 testl(cnt1, cnt1); 3409 jcc(Assembler::zero, RET_NOT_FOUND); 3410 bind(SCAN_TO_CHAR_LOOP); 3411 load_unsigned_byte(tmp, Address(result, 0)); 3412 cmpl(ch, tmp); 3413 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3414 addptr(result, 1); 3415 subl(cnt1, 1); 3416 jccb(Assembler::zero, RET_NOT_FOUND); 3417 jmp(SCAN_TO_CHAR_LOOP); 3418 3419 bind(RET_NOT_FOUND); 3420 movl(result, -1); 3421 jmpb(DONE_LABEL); 3422 3423 bind(FOUND_CHAR); 3424 if (UseAVX >= 2) { 3425 vpmovmskb(tmp, vec3); 3426 } else { 3427 pmovmskb(tmp, vec3); 3428 } 3429 bsfl(ch, tmp); 3430 addptr(result, ch); 3431 3432 bind(FOUND_SEQ_CHAR); 3433 subptr(result, str1); 3434 3435 bind(DONE_LABEL); 3436 } // stringL_indexof_char 3437 3438 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3439 switch (eltype) { 3440 case T_BOOLEAN: return sizeof(jboolean); 3441 case T_BYTE: return sizeof(jbyte); 3442 case T_SHORT: return sizeof(jshort); 3443 case T_CHAR: return sizeof(jchar); 3444 case T_INT: return sizeof(jint); 3445 default: 3446 ShouldNotReachHere(); 3447 return -1; 3448 } 3449 } 3450 3451 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3452 switch (eltype) { 3453 // T_BOOLEAN used as surrogate for unsigned byte 3454 case T_BOOLEAN: movzbl(dst, src); break; 3455 case T_BYTE: movsbl(dst, src); break; 3456 case T_SHORT: movswl(dst, src); break; 3457 case T_CHAR: movzwl(dst, src); break; 3458 case T_INT: movl(dst, src); break; 3459 default: 3460 ShouldNotReachHere(); 3461 } 3462 } 3463 3464 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3465 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3466 } 3467 3468 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3469 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3470 } 3471 3472 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3473 const int vlen = Assembler::AVX_256bit; 3474 switch (eltype) { 3475 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3476 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3477 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3478 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3479 case T_INT: 3480 // do nothing 3481 break; 3482 default: 3483 ShouldNotReachHere(); 3484 } 3485 } 3486 3487 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3488 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3489 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3490 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3491 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3492 BasicType eltype) { 3493 ShortBranchVerifier sbv(this); 3494 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3495 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3496 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3497 3498 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3499 SHORT_UNROLLED_LOOP_EXIT, 3500 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3501 UNROLLED_VECTOR_LOOP_BEGIN, 3502 END; 3503 switch (eltype) { 3504 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3505 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3506 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3507 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3508 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3509 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3510 } 3511 3512 // For "renaming" for readibility of the code 3513 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3514 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3515 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3516 3517 const int elsize = arrays_hashcode_elsize(eltype); 3518 3519 /* 3520 if (cnt1 >= 2) { 3521 if (cnt1 >= 32) { 3522 UNROLLED VECTOR LOOP 3523 } 3524 UNROLLED SCALAR LOOP 3525 } 3526 SINGLE SCALAR 3527 */ 3528 3529 cmpl(cnt1, 32); 3530 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3531 3532 // cnt1 >= 32 && generate_vectorized_loop 3533 xorl(index, index); 3534 3535 // vresult = IntVector.zero(I256); 3536 for (int idx = 0; idx < 4; idx++) { 3537 vpxor(vresult[idx], vresult[idx]); 3538 } 3539 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3540 Register bound = tmp2; 3541 Register next = tmp3; 3542 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3543 movl(next, Address(tmp2, 0)); 3544 movdl(vnext, next); 3545 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3546 3547 // index = 0; 3548 // bound = cnt1 & ~(32 - 1); 3549 movl(bound, cnt1); 3550 andl(bound, ~(32 - 1)); 3551 // for (; index < bound; index += 32) { 3552 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3553 // result *= next; 3554 imull(result, next); 3555 // loop fission to upfront the cost of fetching from memory, OOO execution 3556 // can then hopefully do a better job of prefetching 3557 for (int idx = 0; idx < 4; idx++) { 3558 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3559 } 3560 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3561 for (int idx = 0; idx < 4; idx++) { 3562 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3563 arrays_hashcode_elvcast(vtmp[idx], eltype); 3564 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3565 } 3566 // index += 32; 3567 addl(index, 32); 3568 // index < bound; 3569 cmpl(index, bound); 3570 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3571 // } 3572 3573 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3574 subl(cnt1, bound); 3575 // release bound 3576 3577 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3578 for (int idx = 0; idx < 4; idx++) { 3579 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3580 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3581 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3582 } 3583 // result += vresult.reduceLanes(ADD); 3584 for (int idx = 0; idx < 4; idx++) { 3585 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3586 } 3587 3588 // } else if (cnt1 < 32) { 3589 3590 bind(SHORT_UNROLLED_BEGIN); 3591 // int i = 1; 3592 movl(index, 1); 3593 cmpl(index, cnt1); 3594 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3595 3596 // for (; i < cnt1 ; i += 2) { 3597 bind(SHORT_UNROLLED_LOOP_BEGIN); 3598 movl(tmp3, 961); 3599 imull(result, tmp3); 3600 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3601 movl(tmp3, tmp2); 3602 shll(tmp3, 5); 3603 subl(tmp3, tmp2); 3604 addl(result, tmp3); 3605 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3606 addl(result, tmp3); 3607 addl(index, 2); 3608 cmpl(index, cnt1); 3609 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3610 3611 // } 3612 // if (i >= cnt1) { 3613 bind(SHORT_UNROLLED_LOOP_EXIT); 3614 jccb(Assembler::greater, END); 3615 movl(tmp2, result); 3616 shll(result, 5); 3617 subl(result, tmp2); 3618 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3619 addl(result, tmp3); 3620 // } 3621 bind(END); 3622 3623 BLOCK_COMMENT("} // arrays_hashcode"); 3624 3625 } // arrays_hashcode 3626 3627 // helper function for string_compare 3628 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3629 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3630 Address::ScaleFactor scale2, Register index, int ae) { 3631 if (ae == StrIntrinsicNode::LL) { 3632 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3633 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3634 } else if (ae == StrIntrinsicNode::UU) { 3635 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3636 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3637 } else { 3638 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3639 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3640 } 3641 } 3642 3643 // Compare strings, used for char[] and byte[]. 3644 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3645 Register cnt1, Register cnt2, Register result, 3646 XMMRegister vec1, int ae, KRegister mask) { 3647 ShortBranchVerifier sbv(this); 3648 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3649 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3650 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3651 int stride2x2 = 0x40; 3652 Address::ScaleFactor scale = Address::no_scale; 3653 Address::ScaleFactor scale1 = Address::no_scale; 3654 Address::ScaleFactor scale2 = Address::no_scale; 3655 3656 if (ae != StrIntrinsicNode::LL) { 3657 stride2x2 = 0x20; 3658 } 3659 3660 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3661 shrl(cnt2, 1); 3662 } 3663 // Compute the minimum of the string lengths and the 3664 // difference of the string lengths (stack). 3665 // Do the conditional move stuff 3666 movl(result, cnt1); 3667 subl(cnt1, cnt2); 3668 push(cnt1); 3669 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3670 3671 // Is the minimum length zero? 
3672 testl(cnt2, cnt2); 3673 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3674 if (ae == StrIntrinsicNode::LL) { 3675 // Load first bytes 3676 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3677 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3678 } else if (ae == StrIntrinsicNode::UU) { 3679 // Load first characters 3680 load_unsigned_short(result, Address(str1, 0)); 3681 load_unsigned_short(cnt1, Address(str2, 0)); 3682 } else { 3683 load_unsigned_byte(result, Address(str1, 0)); 3684 load_unsigned_short(cnt1, Address(str2, 0)); 3685 } 3686 subl(result, cnt1); 3687 jcc(Assembler::notZero, POP_LABEL); 3688 3689 if (ae == StrIntrinsicNode::UU) { 3690 // Divide length by 2 to get number of chars 3691 shrl(cnt2, 1); 3692 } 3693 cmpl(cnt2, 1); 3694 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3695 3696 // Check if the strings start at the same location and setup scale and stride 3697 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3698 cmpptr(str1, str2); 3699 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3700 if (ae == StrIntrinsicNode::LL) { 3701 scale = Address::times_1; 3702 stride = 16; 3703 } else { 3704 scale = Address::times_2; 3705 stride = 8; 3706 } 3707 } else { 3708 scale1 = Address::times_1; 3709 scale2 = Address::times_2; 3710 // scale not used 3711 stride = 8; 3712 } 3713 3714 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3715 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3716 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3717 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3718 Label COMPARE_TAIL_LONG; 3719 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3720 3721 int pcmpmask = 0x19; 3722 if (ae == StrIntrinsicNode::LL) { 3723 pcmpmask &= ~0x01; 3724 } 3725 3726 // Setup to compare 16-chars (32-bytes) vectors, 3727 // start from first character again because it has aligned address. 3728 if (ae == StrIntrinsicNode::LL) { 3729 stride2 = 32; 3730 } else { 3731 stride2 = 16; 3732 } 3733 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3734 adr_stride = stride << scale; 3735 } else { 3736 adr_stride1 = 8; //stride << scale1; 3737 adr_stride2 = 16; //stride << scale2; 3738 } 3739 3740 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3741 // rax and rdx are used by pcmpestri as elements counters 3742 movl(result, cnt2); 3743 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3744 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3745 3746 // fast path : compare first 2 8-char vectors. 
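// Note (assumption based on the SSE4.2 PCMPESTRI definition, not taken from
// this file): the imm8 used here decodes as
//
//   0x19 = bits[1:0] = 01 -> unsigned words   (0x18: 00 -> unsigned bytes)
//          bits[3:2] = 10 -> "equal each", element-wise comparison
//          bits[5:4] = 01 -> negative polarity, a set bit marks a mismatch
//          bit 6     = 0  -> ECX gets the index of the lowest set bit
//
// so CF == 1 ("below") after pcmpestri means a mismatching element exists and
// rcx holds its index, which is why a mismatch branches to COMPARE_INDEX_CHAR.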
3747 bind(COMPARE_16_CHARS); 3748 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3749 movdqu(vec1, Address(str1, 0)); 3750 } else { 3751 pmovzxbw(vec1, Address(str1, 0)); 3752 } 3753 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3754 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3755 3756 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3757 movdqu(vec1, Address(str1, adr_stride)); 3758 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3759 } else { 3760 pmovzxbw(vec1, Address(str1, adr_stride1)); 3761 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3762 } 3763 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3764 addl(cnt1, stride); 3765 3766 // Compare the characters at index in cnt1 3767 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3768 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3769 subl(result, cnt2); 3770 jmp(POP_LABEL); 3771 3772 // Setup the registers to start vector comparison loop 3773 bind(COMPARE_WIDE_VECTORS); 3774 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3775 lea(str1, Address(str1, result, scale)); 3776 lea(str2, Address(str2, result, scale)); 3777 } else { 3778 lea(str1, Address(str1, result, scale1)); 3779 lea(str2, Address(str2, result, scale2)); 3780 } 3781 subl(result, stride2); 3782 subl(cnt2, stride2); 3783 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3784 negptr(result); 3785 3786 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3787 bind(COMPARE_WIDE_VECTORS_LOOP); 3788 3789 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3790 cmpl(cnt2, stride2x2); 3791 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3792 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3793 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3794 3795 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3796 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3797 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3798 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3799 } else { 3800 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3801 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3802 } 3803 kortestql(mask, mask); 3804 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3805 addptr(result, stride2x2); // update since we already compared at this addr 3806 subl(cnt2, stride2x2); // and sub the size too 3807 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3808 3809 vpxor(vec1, vec1); 3810 jmpb(COMPARE_WIDE_TAIL); 3811 }//if (VM_Version::supports_avx512vlbw()) 3812 3813 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3814 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3815 vmovdqu(vec1, Address(str1, result, scale)); 3816 vpxor(vec1, Address(str2, result, scale)); 3817 } else { 3818 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3819 vpxor(vec1, Address(str2, result, scale2)); 3820 } 3821 vptest(vec1, vec1); 3822 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3823 addptr(result, stride2); 3824 subl(cnt2, stride2); 3825 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3826 // clean upper bits of YMM registers 3827 vpxor(vec1, vec1); 3828 3829 // compare 
wide vectors tail 3830 bind(COMPARE_WIDE_TAIL); 3831 testptr(result, result); 3832 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3833 3834 movl(result, stride2); 3835 movl(cnt2, result); 3836 negptr(result); 3837 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3838 3839 // Identifies the mismatching (higher or lower) 16-byte half of the 32-byte vectors. 3840 bind(VECTOR_NOT_EQUAL); 3841 // clean upper bits of YMM registers 3842 vpxor(vec1, vec1); 3843 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3844 lea(str1, Address(str1, result, scale)); 3845 lea(str2, Address(str2, result, scale)); 3846 } else { 3847 lea(str1, Address(str1, result, scale1)); 3848 lea(str2, Address(str2, result, scale2)); 3849 } 3850 jmp(COMPARE_16_CHARS); 3851 3852 // Compare tail chars, length 1 to 15 chars 3853 bind(COMPARE_TAIL_LONG); 3854 movl(cnt2, result); 3855 cmpl(cnt2, stride); 3856 jcc(Assembler::less, COMPARE_SMALL_STR); 3857 3858 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3859 movdqu(vec1, Address(str1, 0)); 3860 } else { 3861 pmovzxbw(vec1, Address(str1, 0)); 3862 } 3863 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3864 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3865 subptr(cnt2, stride); 3866 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3867 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3868 lea(str1, Address(str1, result, scale)); 3869 lea(str2, Address(str2, result, scale)); 3870 } else { 3871 lea(str1, Address(str1, result, scale1)); 3872 lea(str2, Address(str2, result, scale2)); 3873 } 3874 negptr(cnt2); 3875 jmpb(WHILE_HEAD_LABEL); 3876 3877 bind(COMPARE_SMALL_STR); 3878 } else if (UseSSE42Intrinsics) { 3879 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3880 int pcmpmask = 0x19; 3881 // Setup to compare 8-char (16-byte) vectors, 3882 // start from first character again because it has aligned address.
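// Illustrative sketch (not part of the emitted code) of the negative-index
// loop idiom used below (and in the AVX2 path above): both string pointers
// are advanced past the region being compared and the loop index starts
// negative and is stepped toward zero, so one add plus one conditional jump
// drives the loop. Roughly:
//
//   base1 += total * elem_size;
//   base2 += total * elem_size;
//   ptrdiff_t i = -total;                     // negative element index
//   for (int remaining = vec_count; remaining != 0; remaining -= stride) {
//     compare_block(base1 + i * elem_size, base2 + i * elem_size);
//     i += stride;
//   }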
3883 movl(result, cnt2); 3884 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3885 if (ae == StrIntrinsicNode::LL) { 3886 pcmpmask &= ~0x01; 3887 } 3888 jcc(Assembler::zero, COMPARE_TAIL); 3889 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3890 lea(str1, Address(str1, result, scale)); 3891 lea(str2, Address(str2, result, scale)); 3892 } else { 3893 lea(str1, Address(str1, result, scale1)); 3894 lea(str2, Address(str2, result, scale2)); 3895 } 3896 negptr(result); 3897 3898 // pcmpestri 3899 // inputs: 3900 // vec1- substring 3901 // rax - negative string length (elements count) 3902 // mem - scanned string 3903 // rdx - string length (elements count) 3904 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3905 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3906 // outputs: 3907 // rcx - first mismatched element index 3908 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3909 3910 bind(COMPARE_WIDE_VECTORS); 3911 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3912 movdqu(vec1, Address(str1, result, scale)); 3913 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3914 } else { 3915 pmovzxbw(vec1, Address(str1, result, scale1)); 3916 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3917 } 3918 // After pcmpestri cnt1(rcx) contains mismatched element index 3919 3920 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3921 addptr(result, stride); 3922 subptr(cnt2, stride); 3923 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3924 3925 // compare wide vectors tail 3926 testptr(result, result); 3927 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3928 3929 movl(cnt2, stride); 3930 movl(result, stride); 3931 negptr(result); 3932 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3933 movdqu(vec1, Address(str1, result, scale)); 3934 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3935 } else { 3936 pmovzxbw(vec1, Address(str1, result, scale1)); 3937 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3938 } 3939 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3940 3941 // Mismatched characters in the vectors 3942 bind(VECTOR_NOT_EQUAL); 3943 addptr(cnt1, result); 3944 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3945 subl(result, cnt2); 3946 jmpb(POP_LABEL); 3947 3948 bind(COMPARE_TAIL); // limit is zero 3949 movl(cnt2, result); 3950 // Fallthru to tail compare 3951 } 3952 // Shift str2 and str1 to the end of the arrays, negate min 3953 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3954 lea(str1, Address(str1, cnt2, scale)); 3955 lea(str2, Address(str2, cnt2, scale)); 3956 } else { 3957 lea(str1, Address(str1, cnt2, scale1)); 3958 lea(str2, Address(str2, cnt2, scale2)); 3959 } 3960 decrementl(cnt2); // first character was compared already 3961 negptr(cnt2); 3962 3963 // Compare the rest of the elements 3964 bind(WHILE_HEAD_LABEL); 3965 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3966 subl(result, cnt1); 3967 jccb(Assembler::notZero, POP_LABEL); 3968 increment(cnt2); 3969 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3970 3971 // Strings are equal up to min length. Return the length difference. 
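// Illustrative scalar reference (not part of the emitted code) for the
// same-encoding (LL/UU) case of string_compare: the first mismatching element
// decides the result, otherwise the length difference does (the UL/LU paths
// additionally widen bytes to chars and negate the result for UL at the end):
//
//   static int compare(const jchar* a, int alen, const jchar* b, int blen) {
//     int lim = alen < blen ? alen : blen;
//     for (int i = 0; i < lim; i++) {
//       if (a[i] != b[i]) return (int)a[i] - (int)b[i];
//     }
//     return alen - blen;
//   }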
3972 bind(LENGTH_DIFF_LABEL); 3973 pop(result); 3974 if (ae == StrIntrinsicNode::UU) { 3975 // Divide diff by 2 to get number of chars 3976 sarl(result, 1); 3977 } 3978 jmpb(DONE_LABEL); 3979 3980 if (VM_Version::supports_avx512vlbw()) { 3981 3982 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3983 3984 kmovql(cnt1, mask); 3985 notq(cnt1); 3986 bsfq(cnt2, cnt1); 3987 if (ae != StrIntrinsicNode::LL) { 3988 // Divide diff by 2 to get number of chars 3989 sarl(cnt2, 1); 3990 } 3991 addq(result, cnt2); 3992 if (ae == StrIntrinsicNode::LL) { 3993 load_unsigned_byte(cnt1, Address(str2, result)); 3994 load_unsigned_byte(result, Address(str1, result)); 3995 } else if (ae == StrIntrinsicNode::UU) { 3996 load_unsigned_short(cnt1, Address(str2, result, scale)); 3997 load_unsigned_short(result, Address(str1, result, scale)); 3998 } else { 3999 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4000 load_unsigned_byte(result, Address(str1, result, scale1)); 4001 } 4002 subl(result, cnt1); 4003 jmpb(POP_LABEL); 4004 }//if (VM_Version::supports_avx512vlbw()) 4005 4006 // Discard the stored length difference 4007 bind(POP_LABEL); 4008 pop(cnt1); 4009 4010 // That's it 4011 bind(DONE_LABEL); 4012 if(ae == StrIntrinsicNode::UL) { 4013 negl(result); 4014 } 4015 4016 } 4017 4018 // Search for Non-ASCII character (Negative byte value) in a byte array, 4019 // return the index of the first such character, otherwise the length 4020 // of the array segment searched. 4021 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4022 // @IntrinsicCandidate 4023 // public static int countPositives(byte[] ba, int off, int len) { 4024 // for (int i = off; i < off + len; i++) { 4025 // if (ba[i] < 0) { 4026 // return i - off; 4027 // } 4028 // } 4029 // return len; 4030 // } 4031 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4032 Register result, Register tmp1, 4033 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4034 // rsi: byte array 4035 // rcx: len 4036 // rax: result 4037 ShortBranchVerifier sbv(this); 4038 assert_different_registers(ary1, len, result, tmp1); 4039 assert_different_registers(vec1, vec2); 4040 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4041 4042 movl(result, len); // copy 4043 // len == 0 4044 testl(len, len); 4045 jcc(Assembler::zero, DONE); 4046 4047 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4048 VM_Version::supports_avx512vlbw() && 4049 VM_Version::supports_bmi2()) { 4050 4051 Label test_64_loop, test_tail, BREAK_LOOP; 4052 movl(tmp1, len); 4053 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4054 4055 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4056 andl(len, 0xffffffc0); // vector count (in chars) 4057 jccb(Assembler::zero, test_tail); 4058 4059 lea(ary1, Address(ary1, len, Address::times_1)); 4060 negptr(len); 4061 4062 bind(test_64_loop); 4063 // Check whether our 64 elements of size byte contain negatives 4064 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4065 kortestql(mask1, mask1); 4066 jcc(Assembler::notZero, BREAK_LOOP); 4067 4068 addptr(len, 64); 4069 jccb(Assembler::notZero, test_64_loop); 4070 4071 bind(test_tail); 4072 // bail out when there is nothing to be done 4073 testl(tmp1, -1); 4074 jcc(Assembler::zero, DONE); 4075 4076 4077 // check the tail for absense of negatives 4078 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4079 { 4080 Register tmp3_aliased = len; 4081 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4082 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4083 notq(tmp3_aliased); 4084 kmovql(mask2, tmp3_aliased); 4085 } 4086 4087 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4088 ktestq(mask1, mask2); 4089 jcc(Assembler::zero, DONE); 4090 4091 // do a full check for negative registers in the tail 4092 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4093 // ary1 already pointing to the right place 4094 jmpb(TAIL_START); 4095 4096 bind(BREAK_LOOP); 4097 // At least one byte in the last 64 byte block was negative. 4098 // Set up to look at the last 64 bytes as if they were a tail 4099 lea(ary1, Address(ary1, len, Address::times_1)); 4100 addptr(result, len); 4101 // Ignore the very last byte: if all others are positive, 4102 // it must be negative, so we can skip right to the 2+1 byte 4103 // end comparison at this point 4104 orl(result, 63); 4105 movl(len, 63); 4106 // Fallthru to tail compare 4107 } else { 4108 4109 if (UseAVX >= 2) { 4110 // With AVX2, use 32-byte vector compare 4111 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4112 4113 // Compare 32-byte vectors 4114 testl(len, 0xffffffe0); // vector count (in bytes) 4115 jccb(Assembler::zero, TAIL_START); 4116 4117 andl(len, 0xffffffe0); 4118 lea(ary1, Address(ary1, len, Address::times_1)); 4119 negptr(len); 4120 4121 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4122 movdl(vec2, tmp1); 4123 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4124 4125 bind(COMPARE_WIDE_VECTORS); 4126 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4127 vptest(vec1, vec2); 4128 jccb(Assembler::notZero, BREAK_LOOP); 4129 addptr(len, 32); 4130 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4131 4132 testl(result, 0x0000001f); // any bytes remaining? 4133 jcc(Assembler::zero, DONE); 4134 4135 // Quick test using the already prepared vector mask 4136 movl(len, result); 4137 andl(len, 0x0000001f); 4138 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4139 vptest(vec1, vec2); 4140 jcc(Assembler::zero, DONE); 4141 // There are zeros, jump to the tail to determine exactly where 4142 jmpb(TAIL_START); 4143 4144 bind(BREAK_LOOP); 4145 // At least one byte in the last 32-byte vector is negative. 4146 // Set up to look at the last 32 bytes as if they were a tail 4147 lea(ary1, Address(ary1, len, Address::times_1)); 4148 addptr(result, len); 4149 // Ignore the very last byte: if all others are positive, 4150 // it must be negative, so we can skip right to the 2+1 byte 4151 // end comparison at this point 4152 orl(result, 31); 4153 movl(len, 31); 4154 // Fallthru to tail compare 4155 } else if (UseSSE42Intrinsics) { 4156 // With SSE4.2, use double quad vector compare 4157 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4158 4159 // Compare 16-byte vectors 4160 testl(len, 0xfffffff0); // vector count (in bytes) 4161 jcc(Assembler::zero, TAIL_START); 4162 4163 andl(len, 0xfffffff0); 4164 lea(ary1, Address(ary1, len, Address::times_1)); 4165 negptr(len); 4166 4167 movl(tmp1, 0x80808080); 4168 movdl(vec2, tmp1); 4169 pshufd(vec2, vec2, 0); 4170 4171 bind(COMPARE_WIDE_VECTORS); 4172 movdqu(vec1, Address(ary1, len, Address::times_1)); 4173 ptest(vec1, vec2); 4174 jccb(Assembler::notZero, BREAK_LOOP); 4175 addptr(len, 16); 4176 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4177 4178 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4179 jcc(Assembler::zero, DONE); 4180 4181 // Quick test using the already prepared vector mask 4182 movl(len, result); 4183 andl(len, 0x0000000f); // tail count (in bytes) 4184 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4185 ptest(vec1, vec2); 4186 jcc(Assembler::zero, DONE); 4187 jmpb(TAIL_START); 4188 4189 bind(BREAK_LOOP); 4190 // At least one byte in the last 16-byte vector is negative. 4191 // Set up and look at the last 16 bytes as if they were a tail 4192 lea(ary1, Address(ary1, len, Address::times_1)); 4193 addptr(result, len); 4194 // Ignore the very last byte: if all others are positive, 4195 // it must be negative, so we can skip right to the 2+1 byte 4196 // end comparison at this point 4197 orl(result, 15); 4198 movl(len, 15); 4199 // Fallthru to tail compare 4200 } 4201 } 4202 4203 bind(TAIL_START); 4204 // Compare 4-byte vectors 4205 andl(len, 0xfffffffc); // vector count (in bytes) 4206 jccb(Assembler::zero, COMPARE_CHAR); 4207 4208 lea(ary1, Address(ary1, len, Address::times_1)); 4209 negptr(len); 4210 4211 bind(COMPARE_VECTORS); 4212 movl(tmp1, Address(ary1, len, Address::times_1)); 4213 andl(tmp1, 0x80808080); 4214 jccb(Assembler::notZero, TAIL_ADJUST); 4215 addptr(len, 4); 4216 jccb(Assembler::notZero, COMPARE_VECTORS); 4217 4218 // Compare trailing char (final 2-3 bytes), if any 4219 bind(COMPARE_CHAR); 4220 4221 testl(result, 0x2); // tail char 4222 jccb(Assembler::zero, COMPARE_BYTE); 4223 load_unsigned_short(tmp1, Address(ary1, 0)); 4224 andl(tmp1, 0x00008080); 4225 jccb(Assembler::notZero, CHAR_ADJUST); 4226 lea(ary1, Address(ary1, 2)); 4227 4228 bind(COMPARE_BYTE); 4229 testl(result, 0x1); // tail byte 4230 jccb(Assembler::zero, DONE); 4231 load_unsigned_byte(tmp1, Address(ary1, 0)); 4232 testl(tmp1, 0x00000080); 4233 jccb(Assembler::zero, DONE); 4234 subptr(result, 1); 4235 jmpb(DONE); 4236 4237 bind(TAIL_ADJUST); 4238 // there are negative bits in the last 4 byte block. 4239 // Adjust result and check the next three bytes 4240 addptr(result, len); 4241 orl(result, 3); 4242 lea(ary1, Address(ary1, len, Address::times_1)); 4243 jmpb(COMPARE_CHAR); 4244 4245 bind(CHAR_ADJUST); 4246 // We are looking at a char + optional byte tail, and found that one 4247 // of the bytes in the char is negative. Adjust the result, check the 4248 // first byte and readjust if needed. 4249 andl(result, 0xfffffffc); 4250 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4251 jccb(Assembler::notZero, DONE); 4252 addptr(result, 1); 4253 4254 // That's it 4255 bind(DONE); 4256 if (UseAVX >= 2) { 4257 // clean upper bits of YMM registers 4258 vpxor(vec1, vec1); 4259 vpxor(vec2, vec2); 4260 } 4261 } 4262 4263 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4264 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4265 Register limit, Register result, Register chr, 4266 XMMRegister vec1, XMMRegister vec2, bool is_char, 4267 KRegister mask, bool expand_ary2) { 4268 // for expand_ary2, limit is the (smaller) size of the second array. 4269 ShortBranchVerifier sbv(this); 4270 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4271 4272 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4273 "Expansion only implemented for AVX2"); 4274 4275 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4276 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4277 4278 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4279 int scaleIncr = expand_ary2 ? 8 : 16; 4280 4281 if (is_array_equ) { 4282 // Check the input args 4283 cmpoop(ary1, ary2); 4284 jcc(Assembler::equal, TRUE_LABEL); 4285 4286 // Need additional checks for arrays_equals. 4287 testptr(ary1, ary1); 4288 jcc(Assembler::zero, FALSE_LABEL); 4289 testptr(ary2, ary2); 4290 jcc(Assembler::zero, FALSE_LABEL); 4291 4292 // Check the lengths 4293 movl(limit, Address(ary1, length_offset)); 4294 cmpl(limit, Address(ary2, length_offset)); 4295 jcc(Assembler::notEqual, FALSE_LABEL); 4296 } 4297 4298 // count == 0 4299 testl(limit, limit); 4300 jcc(Assembler::zero, TRUE_LABEL); 4301 4302 if (is_array_equ) { 4303 // Load array address 4304 lea(ary1, Address(ary1, base_offset)); 4305 lea(ary2, Address(ary2, base_offset)); 4306 } 4307 4308 if (is_array_equ && is_char) { 4309 // arrays_equals when used for char[]. 4310 shll(limit, 1); // byte count != 0 4311 } 4312 movl(result, limit); // copy 4313 4314 if (UseAVX >= 2) { 4315 // With AVX2, use 32-byte vector compare 4316 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4317 4318 // Compare 32-byte vectors 4319 if (expand_ary2) { 4320 andl(result, 0x0000000f); // tail count (in bytes) 4321 andl(limit, 0xfffffff0); // vector count (in bytes) 4322 jcc(Assembler::zero, COMPARE_TAIL); 4323 } else { 4324 andl(result, 0x0000001f); // tail count (in bytes) 4325 andl(limit, 0xffffffe0); // vector count (in bytes) 4326 jcc(Assembler::zero, COMPARE_TAIL_16); 4327 } 4328 4329 lea(ary1, Address(ary1, limit, scaleFactor)); 4330 lea(ary2, Address(ary2, limit, Address::times_1)); 4331 negptr(limit); 4332 4333 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4334 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4335 4336 cmpl(limit, -64); 4337 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4338 4339 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4340 4341 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4342 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4343 kortestql(mask, mask); 4344 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4345 addptr(limit, 64); // update since we already compared at this addr 4346 cmpl(limit, -64); 4347 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4348 4349 // At this point we may still need to compare -limit+result bytes. 4350 // We could execute the next two instruction and just continue via non-wide path: 4351 // cmpl(limit, 0); 4352 // jcc(Assembler::equal, COMPARE_TAIL); // true 4353 // But since we stopped at the points ary{1,2}+limit which are 4354 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4355 // (|limit| <= 32 and result < 32), 4356 // we may just compare the last 64 bytes. 
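// Illustrative sketch (not part of the emitted code): instead of a scalar
// tail loop, the remaining (< 64) bytes are handled by re-comparing the last
// full 64-byte block of the arrays, which may overlap bytes the loop above
// already checked. Overlapping re-checks are harmless for an equality test:
//
//   // assumes total >= 64, which holds because the 64-byte loop was entered
//   bool tail_equal = memcmp(a + total - 64, b + total - 64, 64) == 0;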
4357 // 4358 addptr(result, -64); // it is safe, bc we just came from this area 4359 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4360 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4361 kortestql(mask, mask); 4362 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4363 4364 jmp(TRUE_LABEL); 4365 4366 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4367 4368 }//if (VM_Version::supports_avx512vlbw()) 4369 4370 bind(COMPARE_WIDE_VECTORS); 4371 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4372 if (expand_ary2) { 4373 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4374 } else { 4375 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4376 } 4377 vpxor(vec1, vec2); 4378 4379 vptest(vec1, vec1); 4380 jcc(Assembler::notZero, FALSE_LABEL); 4381 addptr(limit, scaleIncr * 2); 4382 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4383 4384 testl(result, result); 4385 jcc(Assembler::zero, TRUE_LABEL); 4386 4387 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4388 if (expand_ary2) { 4389 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4390 } else { 4391 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4392 } 4393 vpxor(vec1, vec2); 4394 4395 vptest(vec1, vec1); 4396 jcc(Assembler::notZero, FALSE_LABEL); 4397 jmp(TRUE_LABEL); 4398 4399 bind(COMPARE_TAIL_16); // limit is zero 4400 movl(limit, result); 4401 4402 // Compare 16-byte chunks 4403 andl(result, 0x0000000f); // tail count (in bytes) 4404 andl(limit, 0xfffffff0); // vector count (in bytes) 4405 jcc(Assembler::zero, COMPARE_TAIL); 4406 4407 lea(ary1, Address(ary1, limit, scaleFactor)); 4408 lea(ary2, Address(ary2, limit, Address::times_1)); 4409 negptr(limit); 4410 4411 bind(COMPARE_WIDE_VECTORS_16); 4412 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4413 if (expand_ary2) { 4414 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4415 } else { 4416 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4417 } 4418 pxor(vec1, vec2); 4419 4420 ptest(vec1, vec1); 4421 jcc(Assembler::notZero, FALSE_LABEL); 4422 addptr(limit, scaleIncr); 4423 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4424 4425 bind(COMPARE_TAIL); // limit is zero 4426 movl(limit, result); 4427 // Fallthru to tail compare 4428 } else if (UseSSE42Intrinsics) { 4429 // With SSE4.2, use double quad vector compare 4430 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4431 4432 // Compare 16-byte vectors 4433 andl(result, 0x0000000f); // tail count (in bytes) 4434 andl(limit, 0xfffffff0); // vector count (in bytes) 4435 jcc(Assembler::zero, COMPARE_TAIL); 4436 4437 lea(ary1, Address(ary1, limit, Address::times_1)); 4438 lea(ary2, Address(ary2, limit, Address::times_1)); 4439 negptr(limit); 4440 4441 bind(COMPARE_WIDE_VECTORS); 4442 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4443 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4444 pxor(vec1, vec2); 4445 4446 ptest(vec1, vec1); 4447 jcc(Assembler::notZero, FALSE_LABEL); 4448 addptr(limit, 16); 4449 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4450 4451 testl(result, result); 4452 jcc(Assembler::zero, TRUE_LABEL); 4453 4454 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4455 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4456 pxor(vec1, vec2); 4457 4458 ptest(vec1, vec1); 4459 jccb(Assembler::notZero, FALSE_LABEL); 4460 jmpb(TRUE_LABEL); 4461 4462 bind(COMPARE_TAIL); // limit is zero 4463 movl(limit, 
result); 4464 // Fallthru to tail compare 4465 } 4466 4467 // Compare 4-byte vectors 4468 if (expand_ary2) { 4469 testl(result, result); 4470 jccb(Assembler::zero, TRUE_LABEL); 4471 } else { 4472 andl(limit, 0xfffffffc); // vector count (in bytes) 4473 jccb(Assembler::zero, COMPARE_CHAR); 4474 } 4475 4476 lea(ary1, Address(ary1, limit, scaleFactor)); 4477 lea(ary2, Address(ary2, limit, Address::times_1)); 4478 negptr(limit); 4479 4480 bind(COMPARE_VECTORS); 4481 if (expand_ary2) { 4482 // There are no "vector" operations for bytes to shorts 4483 movzbl(chr, Address(ary2, limit, Address::times_1)); 4484 cmpw(Address(ary1, limit, Address::times_2), chr); 4485 jccb(Assembler::notEqual, FALSE_LABEL); 4486 addptr(limit, 1); 4487 jcc(Assembler::notZero, COMPARE_VECTORS); 4488 jmp(TRUE_LABEL); 4489 } else { 4490 movl(chr, Address(ary1, limit, Address::times_1)); 4491 cmpl(chr, Address(ary2, limit, Address::times_1)); 4492 jccb(Assembler::notEqual, FALSE_LABEL); 4493 addptr(limit, 4); 4494 jcc(Assembler::notZero, COMPARE_VECTORS); 4495 } 4496 4497 // Compare trailing char (final 2 bytes), if any 4498 bind(COMPARE_CHAR); 4499 testl(result, 0x2); // tail char 4500 jccb(Assembler::zero, COMPARE_BYTE); 4501 load_unsigned_short(chr, Address(ary1, 0)); 4502 load_unsigned_short(limit, Address(ary2, 0)); 4503 cmpl(chr, limit); 4504 jccb(Assembler::notEqual, FALSE_LABEL); 4505 4506 if (is_array_equ && is_char) { 4507 bind(COMPARE_BYTE); 4508 } else { 4509 lea(ary1, Address(ary1, 2)); 4510 lea(ary2, Address(ary2, 2)); 4511 4512 bind(COMPARE_BYTE); 4513 testl(result, 0x1); // tail byte 4514 jccb(Assembler::zero, TRUE_LABEL); 4515 load_unsigned_byte(chr, Address(ary1, 0)); 4516 load_unsigned_byte(limit, Address(ary2, 0)); 4517 cmpl(chr, limit); 4518 jccb(Assembler::notEqual, FALSE_LABEL); 4519 } 4520 bind(TRUE_LABEL); 4521 movl(result, 1); // return true 4522 jmpb(DONE); 4523 4524 bind(FALSE_LABEL); 4525 xorl(result, result); // return false 4526 4527 // That's it 4528 bind(DONE); 4529 if (UseAVX >= 2) { 4530 // clean upper bits of YMM registers 4531 vpxor(vec1, vec1); 4532 vpxor(vec2, vec2); 4533 } 4534 } 4535 4536 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4537 #define __ masm. 
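// Illustrative note (not part of the emitted code; based on the x86 semantics
// of cvttss2si/cvttsd2si): the truncating converts return the "integer
// indefinite" value (0x80000000, or 0x8000000000000000 for 64-bit results)
// when the input is NaN or out of range. convertF2I below compares the
// fast-path result against that sentinel and only branches into this stub in
// those rare cases, where the fixup routine applies the Java-defined
// semantics (NaN -> 0, saturate to MIN/MAX). Roughly:
//
//   long fast = hardware_truncating_convert(src);
//   if (fast != sentinel) return fast;   // common case, no call
//   return fixup(src);                   // slow path, emitted out of line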
4538 Register dst = stub.data<0>(); 4539 XMMRegister src = stub.data<1>(); 4540 address target = stub.data<2>(); 4541 __ bind(stub.entry()); 4542 __ subptr(rsp, 8); 4543 __ movdbl(Address(rsp), src); 4544 __ call(RuntimeAddress(target)); 4545 __ pop(dst); 4546 __ jmp(stub.continuation()); 4547 #undef __ 4548 } 4549 4550 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4551 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4552 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4553 4554 address slowpath_target; 4555 if (dst_bt == T_INT) { 4556 if (src_bt == T_FLOAT) { 4557 cvttss2sil(dst, src); 4558 cmpl(dst, 0x80000000); 4559 slowpath_target = StubRoutines::x86::f2i_fixup(); 4560 } else { 4561 cvttsd2sil(dst, src); 4562 cmpl(dst, 0x80000000); 4563 slowpath_target = StubRoutines::x86::d2i_fixup(); 4564 } 4565 } else { 4566 if (src_bt == T_FLOAT) { 4567 cvttss2siq(dst, src); 4568 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4569 slowpath_target = StubRoutines::x86::f2l_fixup(); 4570 } else { 4571 cvttsd2siq(dst, src); 4572 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4573 slowpath_target = StubRoutines::x86::d2l_fixup(); 4574 } 4575 } 4576 4577 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4578 jcc(Assembler::equal, stub->entry()); 4579 bind(stub->continuation()); 4580 } 4581 4582 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4583 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4584 switch(ideal_opc) { 4585 case Op_LShiftVS: 4586 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4587 case Op_LShiftVI: 4588 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4589 case Op_LShiftVL: 4590 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4591 case Op_RShiftVS: 4592 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4593 case Op_RShiftVI: 4594 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4595 case Op_RShiftVL: 4596 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4597 case Op_URShiftVS: 4598 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4599 case Op_URShiftVI: 4600 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4601 case Op_URShiftVL: 4602 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4603 case Op_RotateRightV: 4604 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4605 case Op_RotateLeftV: 4606 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4607 default: 4608 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4609 break; 4610 } 4611 } 4612 4613 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4614 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4615 if (is_unsigned) { 4616 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4617 } else { 4618 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4619 } 4620 } 4621 4622 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4623 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4624 switch (elem_bt) { 4625 case T_BYTE: 4626 if (ideal_opc == Op_SaturatingAddV) { 
4627 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4628 } else { 4629 assert(ideal_opc == Op_SaturatingSubV, ""); 4630 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4631 } 4632 break; 4633 case T_SHORT: 4634 if (ideal_opc == Op_SaturatingAddV) { 4635 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4636 } else { 4637 assert(ideal_opc == Op_SaturatingSubV, ""); 4638 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4639 } 4640 break; 4641 default: 4642 fatal("Unsupported type %s", type2name(elem_bt)); 4643 break; 4644 } 4645 } 4646 4647 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4648 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4649 switch (elem_bt) { 4650 case T_BYTE: 4651 if (ideal_opc == Op_SaturatingAddV) { 4652 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4653 } else { 4654 assert(ideal_opc == Op_SaturatingSubV, ""); 4655 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4656 } 4657 break; 4658 case T_SHORT: 4659 if (ideal_opc == Op_SaturatingAddV) { 4660 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4661 } else { 4662 assert(ideal_opc == Op_SaturatingSubV, ""); 4663 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4664 } 4665 break; 4666 default: 4667 fatal("Unsupported type %s", type2name(elem_bt)); 4668 break; 4669 } 4670 } 4671 4672 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4673 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4674 if (is_unsigned) { 4675 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4676 } else { 4677 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4678 } 4679 } 4680 4681 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4682 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4683 switch (elem_bt) { 4684 case T_BYTE: 4685 if (ideal_opc == Op_SaturatingAddV) { 4686 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4687 } else { 4688 assert(ideal_opc == Op_SaturatingSubV, ""); 4689 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4690 } 4691 break; 4692 case T_SHORT: 4693 if (ideal_opc == Op_SaturatingAddV) { 4694 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4695 } else { 4696 assert(ideal_opc == Op_SaturatingSubV, ""); 4697 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4698 } 4699 break; 4700 default: 4701 fatal("Unsupported type %s", type2name(elem_bt)); 4702 break; 4703 } 4704 } 4705 4706 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4707 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4708 switch (elem_bt) { 4709 case T_BYTE: 4710 if (ideal_opc == Op_SaturatingAddV) { 4711 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4712 } else { 4713 assert(ideal_opc == Op_SaturatingSubV, ""); 4714 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4715 } 4716 break; 4717 case T_SHORT: 4718 if (ideal_opc == Op_SaturatingAddV) { 4719 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4720 } else { 4721 assert(ideal_opc == Op_SaturatingSubV, ""); 4722 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4723 } 4724 break; 4725 default: 4726 fatal("Unsupported type %s", type2name(elem_bt)); 4727 break; 4728 } 4729 } 4730 4731 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, 
KRegister mask, XMMRegister dst, 4732 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4733 bool is_varshift) { 4734 switch (ideal_opc) { 4735 case Op_AddVB: 4736 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4737 case Op_AddVS: 4738 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4739 case Op_AddVI: 4740 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4741 case Op_AddVL: 4742 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4743 case Op_AddVF: 4744 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4745 case Op_AddVD: 4746 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4747 case Op_SubVB: 4748 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_SubVS: 4750 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_SubVI: 4752 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_SubVL: 4754 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_SubVF: 4756 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_SubVD: 4758 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4759 case Op_MulVS: 4760 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_MulVI: 4762 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_MulVL: 4764 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_MulVF: 4766 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_MulVD: 4768 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_DivVF: 4770 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_DivVD: 4772 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_SqrtVF: 4774 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_SqrtVD: 4776 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_AbsVB: 4778 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4779 case Op_AbsVS: 4780 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4781 case Op_AbsVI: 4782 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4783 case Op_AbsVL: 4784 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4785 case Op_FmaVF: 4786 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_FmaVD: 4788 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_VectorRearrange: 4790 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4791 case Op_LShiftVS: 4792 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4793 case Op_LShiftVI: 4794 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4795 case Op_LShiftVL: 4796 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4797 case Op_RShiftVS: 4798 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4799 case Op_RShiftVI: 4800 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4801 case Op_RShiftVL: 4802 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4803 case Op_URShiftVS: 4804 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4805 case Op_URShiftVI: 4806 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4807 case Op_URShiftVL: 4808 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4809 case Op_RotateLeftV: 4810 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_RotateRightV: 4812 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_MaxV: 4814 evpmaxs(eType, dst, mask, src1, src2, 
merge, vlen_enc); break; 4815 case Op_MinV: 4816 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_UMinV: 4818 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_UMaxV: 4820 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_XorV: 4822 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_OrV: 4824 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4825 case Op_AndV: 4826 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4827 default: 4828 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4829 break; 4830 } 4831 } 4832 4833 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4834 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4835 switch (ideal_opc) { 4836 case Op_AddVB: 4837 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_AddVS: 4839 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_AddVI: 4841 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_AddVL: 4843 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_AddVF: 4845 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_AddVD: 4847 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_SubVB: 4849 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_SubVS: 4851 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_SubVI: 4853 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_SubVL: 4855 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_SubVF: 4857 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_SubVD: 4859 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_MulVS: 4861 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_MulVI: 4863 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_MulVL: 4865 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_MulVF: 4867 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_MulVD: 4869 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_DivVF: 4871 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_DivVD: 4873 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4874 case Op_FmaVF: 4875 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_FmaVD: 4877 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_MaxV: 4879 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_MinV: 4881 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_UMaxV: 4883 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_UMinV: 4885 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_XorV: 4887 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4888 case Op_OrV: 4889 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4890 case Op_AndV: 4891 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4892 default: 4893 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4894 break; 4895 } 4896 } 4897 4898 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4899 KRegister src1, KRegister src2) { 4900 BasicType etype = T_ILLEGAL; 4901 switch(mask_len) { 4902 case 2: 4903 case 4: 4904 case 8: etype = T_BYTE; break; 4905 case 16: etype = T_SHORT; 
break; 4906 case 32: etype = T_INT; break; 4907 case 64: etype = T_LONG; break; 4908 default: fatal("Unsupported type"); break; 4909 } 4910 assert(etype != T_ILLEGAL, ""); 4911 switch(ideal_opc) { 4912 case Op_AndVMask: 4913 kand(etype, dst, src1, src2); break; 4914 case Op_OrVMask: 4915 kor(etype, dst, src1, src2); break; 4916 case Op_XorVMask: 4917 kxor(etype, dst, src1, src2); break; 4918 default: 4919 fatal("Unsupported masked operation"); break; 4920 } 4921 } 4922 4923 /* 4924 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4925 * If src is NaN, the result is 0. 4926 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4927 * the result is equal to the value of Integer.MIN_VALUE. 4928 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4929 * the result is equal to the value of Integer.MAX_VALUE. 4930 */ 4931 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4932 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4933 Register rscratch, AddressLiteral float_sign_flip, 4934 int vec_enc) { 4935 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4936 Label done; 4937 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4938 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4939 vptest(xtmp2, xtmp2, vec_enc); 4940 jccb(Assembler::equal, done); 4941 4942 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4943 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4944 4945 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4946 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4947 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4948 4949 // Recompute the mask for remaining special value. 4950 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4951 // Extract SRC values corresponding to TRUE mask lanes. 4952 vpand(xtmp4, xtmp2, src, vec_enc); 4953 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4954 // values are set. 
4955 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4956 4957 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4958 bind(done); 4959 } 4960 4961 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4962 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4963 Register rscratch, AddressLiteral float_sign_flip, 4964 int vec_enc) { 4965 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4966 Label done; 4967 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4968 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4969 kortestwl(ktmp1, ktmp1); 4970 jccb(Assembler::equal, done); 4971 4972 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4973 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4974 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4975 4976 kxorwl(ktmp1, ktmp1, ktmp2); 4977 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4978 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4979 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4980 bind(done); 4981 } 4982 4983 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4984 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4985 Register rscratch, AddressLiteral double_sign_flip, 4986 int vec_enc) { 4987 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4988 4989 Label done; 4990 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4991 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4992 kortestwl(ktmp1, ktmp1); 4993 jccb(Assembler::equal, done); 4994 4995 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4996 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4997 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4998 4999 kxorwl(ktmp1, ktmp1, ktmp2); 5000 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5001 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5002 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5003 bind(done); 5004 } 5005 5006 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5007 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5008 Register rscratch, AddressLiteral float_sign_flip, 5009 int vec_enc) { 5010 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5011 Label done; 5012 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5013 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5014 kortestwl(ktmp1, ktmp1); 5015 jccb(Assembler::equal, done); 5016 5017 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5018 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5019 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5020 5021 kxorwl(ktmp1, ktmp1, ktmp2); 5022 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5023 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5024 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5025 bind(done); 5026 } 5027 5028 /* 5029 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5030 * If src is NaN, the result is 0. 5031 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5032 * the result is equal to the value of Long.MIN_VALUE. 5033 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5034 * the result is equal to the value of Long.MAX_VALUE. 
5035 */ 5036 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5037 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5038 Register rscratch, AddressLiteral double_sign_flip, 5039 int vec_enc) { 5040 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5041 5042 Label done; 5043 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5044 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5045 kortestwl(ktmp1, ktmp1); 5046 jccb(Assembler::equal, done); 5047 5048 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5049 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5050 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5051 5052 kxorwl(ktmp1, ktmp1, ktmp2); 5053 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5054 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5055 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5056 bind(done); 5057 } 5058 5059 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5060 XMMRegister xtmp, int index, int vec_enc) { 5061 assert(vec_enc < Assembler::AVX_512bit, ""); 5062 if (vec_enc == Assembler::AVX_256bit) { 5063 vextractf128_high(xtmp, src); 5064 vshufps(dst, src, xtmp, index, vec_enc); 5065 } else { 5066 vshufps(dst, src, zero, index, vec_enc); 5067 } 5068 } 5069 5070 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5071 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5072 AddressLiteral float_sign_flip, int src_vec_enc) { 5073 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5074 5075 Label done; 5076 // Compare the destination lanes with float_sign_flip 5077 // value to get mask for all special values. 5078 movdqu(xtmp1, float_sign_flip, rscratch); 5079 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5080 ptest(xtmp2, xtmp2); 5081 jccb(Assembler::equal, done); 5082 5083 // Flip float_sign_flip to get max integer value. 5084 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5085 pxor(xtmp1, xtmp4); 5086 5087 // Set destination lanes corresponding to unordered source lanes to zero. 5088 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5089 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5090 5091 // Shuffle mask vector and pack lower double word from each quadword lane. 5092 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5093 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5094 5095 // Recompute the mask for remaining special value. 5096 pxor(xtmp2, xtmp3); 5097 // Extract mask corresponding to non-negative source lanes. 5098 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5099 5100 // Shuffle mask vector and pack lower double word from each quadword lane. 5101 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5102 pand(xtmp3, xtmp2); 5103 5104 // Replace destination lanes holding special value (0x80000000) with max int 5105 // if corresponding source lane holds a +ve value.
5106 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5107 bind(done); 5108 } 5109 5110 5111 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5112 XMMRegister xtmp, Register rscratch, int vec_enc) { 5113 switch(to_elem_bt) { 5114 case T_SHORT: 5115 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5116 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5117 vpackusdw(dst, dst, zero, vec_enc); 5118 if (vec_enc == Assembler::AVX_256bit) { 5119 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5120 } 5121 break; 5122 case T_BYTE: 5123 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5124 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5125 vpackusdw(dst, dst, zero, vec_enc); 5126 if (vec_enc == Assembler::AVX_256bit) { 5127 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5128 } 5129 vpackuswb(dst, dst, zero, vec_enc); 5130 break; 5131 default: assert(false, "%s", type2name(to_elem_bt)); 5132 } 5133 } 5134 5135 /* 5136 * Algorithm for vector D2L and F2I conversions:- 5137 * a) Perform vector D2L/F2I cast. 5138 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5139 * It signifies that source value could be any of the special floating point 5140 * values(NaN,-Inf,Inf,Max,-Min). 5141 * c) Set destination to zero if source is NaN value. 5142 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5143 */ 5144 5145 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5146 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5147 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5148 int to_elem_sz = type2aelembytes(to_elem_bt); 5149 assert(to_elem_sz <= 4, ""); 5150 vcvttps2dq(dst, src, vec_enc); 5151 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5152 if (to_elem_sz < 4) { 5153 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5154 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5155 } 5156 } 5157 5158 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5159 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5160 Register rscratch, int vec_enc) { 5161 int to_elem_sz = type2aelembytes(to_elem_bt); 5162 assert(to_elem_sz <= 4, ""); 5163 vcvttps2dq(dst, src, vec_enc); 5164 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5165 switch(to_elem_bt) { 5166 case T_INT: 5167 break; 5168 case T_SHORT: 5169 evpmovdw(dst, dst, vec_enc); 5170 break; 5171 case T_BYTE: 5172 evpmovdb(dst, dst, vec_enc); 5173 break; 5174 default: assert(false, "%s", type2name(to_elem_bt)); 5175 } 5176 } 5177 5178 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5179 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5180 Register rscratch, int vec_enc) { 5181 evcvttps2qq(dst, src, vec_enc); 5182 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5183 } 5184 5185 // Handling for downcasting from double to integer or sub-word types on AVX2. 5186 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5187 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5188 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5189 int to_elem_sz = type2aelembytes(to_elem_bt); 5190 assert(to_elem_sz < 8, ""); 5191 vcvttpd2dq(dst, src, vec_enc); 5192 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5193 float_sign_flip, vec_enc); 5194 if (to_elem_sz < 4) { 5195 // xtmp4 holds all zero lanes. 5196 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5197 } 5198 } 5199 5200 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5201 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5202 KRegister ktmp2, AddressLiteral sign_flip, 5203 Register rscratch, int vec_enc) { 5204 if (VM_Version::supports_avx512dq()) { 5205 evcvttpd2qq(dst, src, vec_enc); 5206 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5207 switch(to_elem_bt) { 5208 case T_LONG: 5209 break; 5210 case T_INT: 5211 evpmovsqd(dst, dst, vec_enc); 5212 break; 5213 case T_SHORT: 5214 evpmovsqd(dst, dst, vec_enc); 5215 evpmovdw(dst, dst, vec_enc); 5216 break; 5217 case T_BYTE: 5218 evpmovsqd(dst, dst, vec_enc); 5219 evpmovdb(dst, dst, vec_enc); 5220 break; 5221 default: assert(false, "%s", type2name(to_elem_bt)); 5222 } 5223 } else { 5224 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5225 vcvttpd2dq(dst, src, vec_enc); 5226 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5227 switch(to_elem_bt) { 5228 case T_INT: 5229 break; 5230 case T_SHORT: 5231 evpmovdw(dst, dst, vec_enc); 5232 break; 5233 case T_BYTE: 5234 evpmovdb(dst, dst, vec_enc); 5235 break; 5236 default: assert(false, "%s", type2name(to_elem_bt)); 5237 } 5238 } 5239 } 5240 5241 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5242 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5243 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5244 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5245 // and re-instantiate original MXCSR.RC mode after that. 5246 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5247 5248 mov64(tmp, julong_cast(0.5L)); 5249 evpbroadcastq(xtmp1, tmp, vec_enc); 5250 vaddpd(xtmp1, src , xtmp1, vec_enc); 5251 evcvtpd2qq(dst, xtmp1, vec_enc); 5252 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5253 double_sign_flip, vec_enc);; 5254 5255 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5256 } 5257 5258 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5259 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5260 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5261 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5262 // and re-instantiate original MXCSR.RC mode after that. 
5263 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5264 5265 movl(tmp, jint_cast(0.5)); 5266 movq(xtmp1, tmp); 5267 vbroadcastss(xtmp1, xtmp1, vec_enc); 5268 vaddps(xtmp1, src , xtmp1, vec_enc); 5269 vcvtps2dq(dst, xtmp1, vec_enc); 5270 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5271 float_sign_flip, vec_enc); 5272 5273 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5274 } 5275 5276 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5277 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5278 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5279 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5280 // and re-instantiate original MXCSR.RC mode after that. 5281 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5282 5283 movl(tmp, jint_cast(0.5)); 5284 movq(xtmp1, tmp); 5285 vbroadcastss(xtmp1, xtmp1, vec_enc); 5286 vaddps(xtmp1, src , xtmp1, vec_enc); 5287 vcvtps2dq(dst, xtmp1, vec_enc); 5288 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5289 5290 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5291 } 5292 5293 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5294 BasicType from_elem_bt, BasicType to_elem_bt) { 5295 switch (from_elem_bt) { 5296 case T_BYTE: 5297 switch (to_elem_bt) { 5298 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5299 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5300 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5301 default: ShouldNotReachHere(); 5302 } 5303 break; 5304 case T_SHORT: 5305 switch (to_elem_bt) { 5306 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5307 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5308 default: ShouldNotReachHere(); 5309 } 5310 break; 5311 case T_INT: 5312 assert(to_elem_bt == T_LONG, ""); 5313 vpmovzxdq(dst, src, vlen_enc); 5314 break; 5315 default: 5316 ShouldNotReachHere(); 5317 } 5318 } 5319 5320 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5321 BasicType from_elem_bt, BasicType to_elem_bt) { 5322 switch (from_elem_bt) { 5323 case T_BYTE: 5324 switch (to_elem_bt) { 5325 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5326 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5327 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5328 default: ShouldNotReachHere(); 5329 } 5330 break; 5331 case T_SHORT: 5332 switch (to_elem_bt) { 5333 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5334 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5335 default: ShouldNotReachHere(); 5336 } 5337 break; 5338 case T_INT: 5339 assert(to_elem_bt == T_LONG, ""); 5340 vpmovsxdq(dst, src, vlen_enc); 5341 break; 5342 default: 5343 ShouldNotReachHere(); 5344 } 5345 } 5346 5347 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5348 BasicType dst_bt, BasicType src_bt, int vlen) { 5349 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5350 assert(vlen_enc != AVX_512bit, ""); 5351 5352 int dst_bt_size = type2aelembytes(dst_bt); 5353 int src_bt_size = type2aelembytes(src_bt); 5354 if (dst_bt_size > src_bt_size) { 5355 switch (dst_bt_size / src_bt_size) { 5356 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5357 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5358 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5359 default: ShouldNotReachHere(); 5360 } 5361 } else { 5362 assert(dst_bt_size < src_bt_size, ""); 5363 switch (src_bt_size / dst_bt_size) { 5364 case 2: { 5365 if (vlen_enc == AVX_128bit) { 5366 vpacksswb(dst, src, src, vlen_enc); 5367 } else { 5368 vpacksswb(dst, src, src, vlen_enc); 5369 vpermq(dst, dst, 0x08, vlen_enc); 5370 } 5371 break; 5372 } 5373 case 4: { 5374 if (vlen_enc == AVX_128bit) { 5375 vpackssdw(dst, src, src, vlen_enc); 5376 vpacksswb(dst, dst, dst, vlen_enc); 5377 } else { 5378 vpackssdw(dst, src, src, vlen_enc); 5379 vpermq(dst, dst, 0x08, vlen_enc); 5380 vpacksswb(dst, dst, dst, AVX_128bit); 5381 } 5382 break; 5383 } 5384 case 8: { 5385 if (vlen_enc == AVX_128bit) { 5386 vpshufd(dst, src, 0x08, vlen_enc); 5387 vpackssdw(dst, dst, dst, vlen_enc); 5388 vpacksswb(dst, dst, dst, vlen_enc); 5389 } else { 5390 vpshufd(dst, src, 0x08, vlen_enc); 5391 vpermq(dst, dst, 0x08, vlen_enc); 5392 vpackssdw(dst, dst, dst, AVX_128bit); 5393 vpacksswb(dst, dst, dst, AVX_128bit); 5394 } 5395 break; 5396 } 5397 default: ShouldNotReachHere(); 5398 } 5399 } 5400 } 5401 5402 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5403 bool merge, BasicType bt, int vlen_enc) { 5404 if (bt == T_INT) { 5405 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5406 } else { 5407 assert(bt == T_LONG, ""); 5408 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5409 } 5410 } 5411 5412 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5413 bool merge, BasicType bt, int vlen_enc) { 5414 if (bt == T_INT) { 5415 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5416 } else { 5417 assert(bt == T_LONG, ""); 5418 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5419 } 5420 } 5421 5422 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5423 Register rtmp2, XMMRegister xtmp, int mask_len, 5424 int vec_enc) { 5425 int index = 0; 5426 int vindex = 0; 5427 mov64(rtmp1, 0x0101010101010101L); 5428 pdepq(rtmp1, src, rtmp1); 5429 if (mask_len > 8) { 5430 movq(rtmp2, src); 5431 vpxor(xtmp, xtmp, xtmp, vec_enc); 5432 movq(xtmp, rtmp1); 5433 } 5434 movq(dst, rtmp1); 5435 5436 mask_len -= 8; 5437 while (mask_len > 0) { 5438 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5439 index++; 5440 if ((index % 2) == 0) { 5441 pxor(xtmp, xtmp); 5442 } 5443 mov64(rtmp1, 0x0101010101010101L); 5444 shrq(rtmp2, 8); 5445 pdepq(rtmp1, rtmp2, rtmp1); 5446 pinsrq(xtmp, rtmp1, index % 2); 5447 vindex = index / 2; 5448 if (vindex) { 5449 // Write entire 16 byte vector when both 64 bit 5450 // lanes are update to save redundant instructions. 
5451 if (index % 2) { 5452 vinsertf128(dst, dst, xtmp, vindex); 5453 } 5454 } else { 5455 vmovdqu(dst, xtmp); 5456 } 5457 mask_len -= 8; 5458 } 5459 } 5460 5461 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5462 switch(opc) { 5463 case Op_VectorMaskTrueCount: 5464 popcntq(dst, tmp); 5465 break; 5466 case Op_VectorMaskLastTrue: 5467 if (VM_Version::supports_lzcnt()) { 5468 lzcntq(tmp, tmp); 5469 movl(dst, 63); 5470 subl(dst, tmp); 5471 } else { 5472 movl(dst, -1); 5473 bsrq(tmp, tmp); 5474 cmov32(Assembler::notZero, dst, tmp); 5475 } 5476 break; 5477 case Op_VectorMaskFirstTrue: 5478 if (VM_Version::supports_bmi1()) { 5479 if (masklen < 32) { 5480 orl(tmp, 1 << masklen); 5481 tzcntl(dst, tmp); 5482 } else if (masklen == 32) { 5483 tzcntl(dst, tmp); 5484 } else { 5485 assert(masklen == 64, ""); 5486 tzcntq(dst, tmp); 5487 } 5488 } else { 5489 if (masklen < 32) { 5490 orl(tmp, 1 << masklen); 5491 bsfl(dst, tmp); 5492 } else { 5493 assert(masklen == 32 || masklen == 64, ""); 5494 movl(dst, masklen); 5495 if (masklen == 32) { 5496 bsfl(tmp, tmp); 5497 } else { 5498 bsfq(tmp, tmp); 5499 } 5500 cmov32(Assembler::notZero, dst, tmp); 5501 } 5502 } 5503 break; 5504 case Op_VectorMaskToLong: 5505 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5506 break; 5507 default: assert(false, "Unhandled mask operation"); 5508 } 5509 } 5510 5511 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5512 int masklen, int masksize, int vec_enc) { 5513 assert(VM_Version::supports_popcnt(), ""); 5514 5515 if(VM_Version::supports_avx512bw()) { 5516 kmovql(tmp, mask); 5517 } else { 5518 assert(masklen <= 16, ""); 5519 kmovwl(tmp, mask); 5520 } 5521 5522 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5523 // operations needs to be clipped. 5524 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5525 andq(tmp, (1 << masklen) - 1); 5526 } 5527 5528 vector_mask_operation_helper(opc, dst, tmp, masklen); 5529 } 5530 5531 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5532 Register tmp, int masklen, BasicType bt, int vec_enc) { 5533 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5534 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5535 assert(VM_Version::supports_popcnt(), ""); 5536 5537 bool need_clip = false; 5538 switch(bt) { 5539 case T_BOOLEAN: 5540 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5541 vpxor(xtmp, xtmp, xtmp, vec_enc); 5542 vpsubb(xtmp, xtmp, mask, vec_enc); 5543 vpmovmskb(tmp, xtmp, vec_enc); 5544 need_clip = masklen < 16; 5545 break; 5546 case T_BYTE: 5547 vpmovmskb(tmp, mask, vec_enc); 5548 need_clip = masklen < 16; 5549 break; 5550 case T_SHORT: 5551 vpacksswb(xtmp, mask, mask, vec_enc); 5552 if (masklen >= 16) { 5553 vpermpd(xtmp, xtmp, 8, vec_enc); 5554 } 5555 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5556 need_clip = masklen < 16; 5557 break; 5558 case T_INT: 5559 case T_FLOAT: 5560 vmovmskps(tmp, mask, vec_enc); 5561 need_clip = masklen < 4; 5562 break; 5563 case T_LONG: 5564 case T_DOUBLE: 5565 vmovmskpd(tmp, mask, vec_enc); 5566 need_clip = masklen < 2; 5567 break; 5568 default: assert(false, "Unhandled type, %s", type2name(bt)); 5569 } 5570 5571 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5572 // operations needs to be clipped. 
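// For example (illustration only, mirroring the clipping below): an 8-lane byte
// mask held in a 16-byte vector yields 16 bits from vpmovmskb, of which only the
// low masklen = 8 bits are meaningful, so the code below first does
//   tmp &= (1 << 8) - 1;   // i.e. 0x00FF
// before counting or scanning the bits.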
5573 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5574 // need_clip implies masklen < 32 5575 andq(tmp, (1 << masklen) - 1); 5576 } 5577 5578 vector_mask_operation_helper(opc, dst, tmp, masklen); 5579 } 5580 5581 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5582 Register rtmp2, int mask_len) { 5583 kmov(rtmp1, src); 5584 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5585 mov64(rtmp2, -1L); 5586 pextq(rtmp2, rtmp2, rtmp1); 5587 kmov(dst, rtmp2); 5588 } 5589 5590 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5591 XMMRegister mask, Register rtmp, Register rscratch, 5592 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5593 int vec_enc) { 5594 assert(type2aelembytes(bt) >= 4, ""); 5595 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5596 address compress_perm_table = nullptr; 5597 address expand_perm_table = nullptr; 5598 if (type2aelembytes(bt) == 8) { 5599 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5600 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5601 vmovmskpd(rtmp, mask, vec_enc); 5602 } else { 5603 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5604 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5605 vmovmskps(rtmp, mask, vec_enc); 5606 } 5607 shlq(rtmp, 5); // for 32 byte permute row. 5608 if (opcode == Op_CompressV) { 5609 lea(rscratch, ExternalAddress(compress_perm_table)); 5610 } else { 5611 lea(rscratch, ExternalAddress(expand_perm_table)); 5612 } 5613 addptr(rtmp, rscratch); 5614 vmovdqu(permv, Address(rtmp)); 5615 vpermps(dst, permv, src, Assembler::AVX_256bit); 5616 vpxor(xtmp, xtmp, xtmp, vec_enc); 5617 // Blend the result with zero vector using permute mask, each column entry 5618 // in a permute table row contains either a valid permute index or a -1 (default) 5619 // value, this can potentially be used as a blending mask after 5620 // compressing/expanding the source vector lanes. 
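// For illustration only (assuming the 32-bit permute-table layout described above):
// a compress with mask 0b00000101 selects lanes 0 and 2, so the table row loaded
// into permv would look like
//   { 0, 2, -1, -1, -1, -1, -1, -1 }
// vpermps then gathers source lanes 0 and 2 into positions 0 and 1, while the -1
// entries (MSB set) pick the zero vector in the vblendvps below.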
5621 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5622 } 5623 5624 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5625 bool merge, BasicType bt, int vec_enc) { 5626 if (opcode == Op_CompressV) { 5627 switch(bt) { 5628 case T_BYTE: 5629 evpcompressb(dst, mask, src, merge, vec_enc); 5630 break; 5631 case T_CHAR: 5632 case T_SHORT: 5633 evpcompressw(dst, mask, src, merge, vec_enc); 5634 break; 5635 case T_INT: 5636 evpcompressd(dst, mask, src, merge, vec_enc); 5637 break; 5638 case T_FLOAT: 5639 evcompressps(dst, mask, src, merge, vec_enc); 5640 break; 5641 case T_LONG: 5642 evpcompressq(dst, mask, src, merge, vec_enc); 5643 break; 5644 case T_DOUBLE: 5645 evcompresspd(dst, mask, src, merge, vec_enc); 5646 break; 5647 default: 5648 fatal("Unsupported type %s", type2name(bt)); 5649 break; 5650 } 5651 } else { 5652 assert(opcode == Op_ExpandV, ""); 5653 switch(bt) { 5654 case T_BYTE: 5655 evpexpandb(dst, mask, src, merge, vec_enc); 5656 break; 5657 case T_CHAR: 5658 case T_SHORT: 5659 evpexpandw(dst, mask, src, merge, vec_enc); 5660 break; 5661 case T_INT: 5662 evpexpandd(dst, mask, src, merge, vec_enc); 5663 break; 5664 case T_FLOAT: 5665 evexpandps(dst, mask, src, merge, vec_enc); 5666 break; 5667 case T_LONG: 5668 evpexpandq(dst, mask, src, merge, vec_enc); 5669 break; 5670 case T_DOUBLE: 5671 evexpandpd(dst, mask, src, merge, vec_enc); 5672 break; 5673 default: 5674 fatal("Unsupported type %s", type2name(bt)); 5675 break; 5676 } 5677 } 5678 } 5679 5680 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5681 KRegister ktmp1, int vec_enc) { 5682 if (opcode == Op_SignumVD) { 5683 vsubpd(dst, zero, one, vec_enc); 5684 // if src < 0 ? -1 : 1 5685 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5686 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5687 // if src == NaN, -0.0 or 0.0 return src. 5688 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5689 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5690 } else { 5691 assert(opcode == Op_SignumVF, ""); 5692 vsubps(dst, zero, one, vec_enc); 5693 // if src < 0 ? -1 : 1 5694 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5695 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5696 // if src == NaN, -0.0 or 0.0 return src. 5697 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5698 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5699 } 5700 } 5701 5702 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5703 XMMRegister xtmp1, int vec_enc) { 5704 if (opcode == Op_SignumVD) { 5705 vsubpd(dst, zero, one, vec_enc); 5706 // if src < 0 ? -1 : 1 5707 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5708 // if src == NaN, -0.0 or 0.0 return src. 5709 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5710 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5711 } else { 5712 assert(opcode == Op_SignumVF, ""); 5713 vsubps(dst, zero, one, vec_enc); 5714 // if src < 0 ? -1 : 1 5715 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5716 // if src == NaN, -0.0 or 0.0 return src. 
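// Illustration of the Math.signum special cases handled by this branch (not emitted code):
//   signum(NaN)   == NaN      signum(-0.0f) == -0.0f    signum(0.0f) == 0.0f
//   signum(-3.5f) == -1.0f    signum(2.0f)  == 1.0f
// The EQ_UQ compare below is true for NaN and +/-0.0 lanes, so exactly those lanes keep src.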
5717 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5718 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5719 } 5720 } 5721 5722 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5723 if (VM_Version::supports_avx512bw()) { 5724 if (mask_len > 32) { 5725 kmovql(dst, src); 5726 } else { 5727 kmovdl(dst, src); 5728 if (mask_len != 32) { 5729 kshiftrdl(dst, dst, 32 - mask_len); 5730 } 5731 } 5732 } else { 5733 assert(mask_len <= 16, ""); 5734 kmovwl(dst, src); 5735 if (mask_len != 16) { 5736 kshiftrwl(dst, dst, 16 - mask_len); 5737 } 5738 } 5739 } 5740 5741 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5742 int lane_size = type2aelembytes(bt); 5743 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5744 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5745 movptr(rtmp, imm32); 5746 switch(lane_size) { 5747 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5748 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5749 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5750 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5751 default : fatal("Unsupported lane size %d", lane_size); 5752 break; 5753 } 5754 } else { 5755 movptr(rtmp, imm32); 5756 movq(dst, rtmp); 5757 switch(lane_size) { 5758 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5759 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5760 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5761 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5762 default : fatal("Unsupported lane size %d", lane_size); 5763 break; 5764 } 5765 } 5766 } 5767 5768 // 5769 // Following is a lookup table based popcount computation algorithm: 5770 // Index Bit set count 5771 // [ 0000 -> 0, 5772 // 0001 -> 1, 5773 // 0010 -> 1, 5774 // 0011 -> 2, 5775 // 0100 -> 1, 5776 // 0101 -> 2, 5777 // 0110 -> 2, 5778 // 0111 -> 3, 5779 // 1000 -> 1, 5780 // 1001 -> 2, 5781 // 1010 -> 2, 5782 // 1011 -> 3, 5783 // 1100 -> 2, 5784 // 1101 -> 3, // 1110 -> 3, 5785 // 1111 -> 4 ] 5786 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5787 // shuffle indices for lookup table access. 5788 // b. Right shift each byte of vector lane by 4 positions. 5789 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5790 // shuffle indices for lookup table access. 5791 // d. Add the bitset count of upper and lower 4 bits of each byte. 5792 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5793 // count of all the bytes of a quadword. 5794 // f. Perform step e. for upper 128bit vector lane. 5795 // g. Pack the bitset count of quadwords back to double word. 5796 // h. Unpacking and packing operations are not needed for 64bit vector lane.
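// A minimal scalar sketch of steps a-d above, for illustration only (not part of
// the emitted code); the vector routines below apply the same lookup to every
// byte at once via vpshufb:
//   static inline uint8_t popcount_byte_sketch(uint8_t b) {
//     static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//     return lut[b & 0x0F] + lut[b >> 4];   // low-nibble count + high-nibble count
//   }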
5797 5798 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5799 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5800 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5801 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5802 vpsrlw(dst, src, 4, vec_enc); 5803 vpand(dst, dst, xtmp1, vec_enc); 5804 vpand(xtmp1, src, xtmp1, vec_enc); 5805 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5806 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5807 vpshufb(dst, xtmp2, dst, vec_enc); 5808 vpaddb(dst, dst, xtmp1, vec_enc); 5809 } 5810 5811 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5812 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5813 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5814 // Following code is as per steps e,f,g and h of above algorithm. 5815 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5816 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5817 vpsadbw(dst, dst, xtmp2, vec_enc); 5818 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5819 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5820 vpackuswb(dst, xtmp1, dst, vec_enc); 5821 } 5822 5823 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5824 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5825 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5826 // Add the popcount of upper and lower bytes of word. 5827 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5828 vpsrlw(dst, xtmp1, 8, vec_enc); 5829 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5830 vpaddw(dst, dst, xtmp1, vec_enc); 5831 } 5832 5833 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5834 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5835 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5836 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5837 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5838 } 5839 5840 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5841 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5842 switch(bt) { 5843 case T_LONG: 5844 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5845 break; 5846 case T_INT: 5847 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5848 break; 5849 case T_CHAR: 5850 case T_SHORT: 5851 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5852 break; 5853 case T_BYTE: 5854 case T_BOOLEAN: 5855 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5856 break; 5857 default: 5858 fatal("Unsupported type %s", type2name(bt)); 5859 break; 5860 } 5861 } 5862 5863 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5864 KRegister mask, bool merge, int vec_enc) { 5865 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5866 switch(bt) { 5867 case T_LONG: 5868 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5869 evpopcntq(dst, mask, src, merge, vec_enc); 5870 break; 5871 case T_INT: 5872 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5873 evpopcntd(dst, mask, src, merge, vec_enc); 5874 break; 5875 case T_CHAR: 5876 case T_SHORT: 5877 assert(VM_Version::supports_avx512_bitalg(), ""); 5878 evpopcntw(dst, mask, src, merge, vec_enc); 5879 break; 5880 case T_BYTE: 5881 case T_BOOLEAN: 5882 assert(VM_Version::supports_avx512_bitalg(), ""); 5883 evpopcntb(dst, mask, 
src, merge, vec_enc); 5884 break; 5885 default: 5886 fatal("Unsupported type %s", type2name(bt)); 5887 break; 5888 } 5889 } 5890 5891 // Bit reversal algorithm first reverses the bits of each byte followed by 5892 // a byte level reversal for multi-byte primitive types (short/int/long). 5893 // Algorithm performs a lookup table access to get reverse bit sequence 5894 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5895 // is obtained by swapping the reverse bit sequences of upper and lower 5896 // nibble of a byte. 5897 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5898 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5899 if (VM_Version::supports_avx512vlbw()) { 5900 5901 // Get the reverse bit sequence of lower nibble of each byte. 5902 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5903 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5904 evpandq(dst, xtmp2, src, vec_enc); 5905 vpshufb(dst, xtmp1, dst, vec_enc); 5906 vpsllq(dst, dst, 4, vec_enc); 5907 5908 // Get the reverse bit sequence of upper nibble of each byte. 5909 vpandn(xtmp2, xtmp2, src, vec_enc); 5910 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5911 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5912 5913 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5914 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5915 evporq(xtmp2, dst, xtmp2, vec_enc); 5916 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5917 5918 } else if(vec_enc == Assembler::AVX_512bit) { 5919 // Shift based bit reversal. 5920 assert(bt == T_LONG || bt == T_INT, ""); 5921 5922 // Swap lower and upper nibble of each byte. 5923 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5924 5925 // Swap two least and most significant bits of each nibble. 5926 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5927 5928 // Swap adjacent pair of bits. 5929 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5930 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5931 5932 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5933 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5934 } else { 5935 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5936 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5937 5938 // Get the reverse bit sequence of lower nibble of each byte. 5939 vpand(dst, xtmp2, src, vec_enc); 5940 vpshufb(dst, xtmp1, dst, vec_enc); 5941 vpsllq(dst, dst, 4, vec_enc); 5942 5943 // Get the reverse bit sequence of upper nibble of each byte. 5944 vpandn(xtmp2, xtmp2, src, vec_enc); 5945 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5946 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5947 5948 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5949 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5950 vpor(xtmp2, dst, xtmp2, vec_enc); 5951 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5952 } 5953 } 5954 5955 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5956 XMMRegister xtmp, Register rscratch) { 5957 assert(VM_Version::supports_gfni(), ""); 5958 assert(rscratch != noreg || always_reachable(mask), "missing"); 5959 5960 // Galois field instruction based bit reversal based on following algorithm. 
5961 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5962 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5963 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5964 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5965 } 5966 5967 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5968 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5969 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5970 evpandq(dst, xtmp1, src, vec_enc); 5971 vpsllq(dst, dst, nbits, vec_enc); 5972 vpandn(xtmp1, xtmp1, src, vec_enc); 5973 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5974 evporq(dst, dst, xtmp1, vec_enc); 5975 } 5976 5977 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5978 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5979 // Shift based bit reversal. 5980 assert(VM_Version::supports_evex(), ""); 5981 switch(bt) { 5982 case T_LONG: 5983 // Swap upper and lower double word of each quad word. 5984 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5985 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5986 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5987 break; 5988 case T_INT: 5989 // Swap upper and lower word of each double word. 5990 evprord(xtmp1, k0, src, 16, true, vec_enc); 5991 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5992 break; 5993 case T_CHAR: 5994 case T_SHORT: 5995 // Swap upper and lower byte of each word. 5996 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5997 break; 5998 case T_BYTE: 5999 evmovdquq(dst, k0, src, true, vec_enc); 6000 break; 6001 default: 6002 fatal("Unsupported type %s", type2name(bt)); 6003 break; 6004 } 6005 } 6006 6007 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6008 if (bt == T_BYTE) { 6009 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6010 evmovdquq(dst, k0, src, true, vec_enc); 6011 } else { 6012 vmovdqu(dst, src); 6013 } 6014 return; 6015 } 6016 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6017 // pre-computed shuffle indices. 
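// For illustration only (the actual tables live in StubRoutines::x86): for T_INT the
// per-128-bit-lane shuffle indices would look like
//   { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }
// so that the vpshufb below reverses the byte order within every 4-byte element.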
6018 switch(bt) { 6019 case T_LONG: 6020 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6021 break; 6022 case T_INT: 6023 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6024 break; 6025 case T_CHAR: 6026 case T_SHORT: 6027 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6028 break; 6029 default: 6030 fatal("Unsupported type %s", type2name(bt)); 6031 break; 6032 } 6033 vpshufb(dst, src, dst, vec_enc); 6034 } 6035 6036 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6037 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6038 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6039 assert(is_integral_type(bt), ""); 6040 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6041 assert(VM_Version::supports_avx512cd(), ""); 6042 switch(bt) { 6043 case T_LONG: 6044 evplzcntq(dst, ktmp, src, merge, vec_enc); 6045 break; 6046 case T_INT: 6047 evplzcntd(dst, ktmp, src, merge, vec_enc); 6048 break; 6049 case T_SHORT: 6050 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6051 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6052 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6053 vpunpckhwd(dst, xtmp1, src, vec_enc); 6054 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6055 vpackusdw(dst, xtmp2, dst, vec_enc); 6056 break; 6057 case T_BYTE: 6058 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6059 // accessing the lookup table. 6060 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6061 // accessing the lookup table. 6062 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6063 assert(VM_Version::supports_avx512bw(), ""); 6064 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6065 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6066 vpand(xtmp2, dst, src, vec_enc); 6067 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6068 vpsrlw(xtmp3, src, 4, vec_enc); 6069 vpand(xtmp3, dst, xtmp3, vec_enc); 6070 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6071 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6072 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6073 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6074 break; 6075 default: 6076 fatal("Unsupported type %s", type2name(bt)); 6077 break; 6078 } 6079 } 6080 6081 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6082 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6083 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6084 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6085 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6086 // accessing the lookup table. 6087 vpand(dst, xtmp2, src, vec_enc); 6088 vpshufb(dst, xtmp1, dst, vec_enc); 6089 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6090 // accessing the lookup table. 6091 vpsrlw(xtmp3, src, 4, vec_enc); 6092 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6093 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6094 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
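// A scalar sketch of the same per-byte CLZ computation, for illustration only
// (lut[n] holds the number of leading zeros of n within a 4-bit field):
//   static inline uint8_t clz_byte_sketch(uint8_t b) {
//     static const uint8_t lut[16] = { 4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0 };
//     uint8_t hi = b >> 4;
//     return (hi == 0) ? 4 + lut[b & 0x0F]   // T2 + T1 when the 4 MSB bits are all zero
//                      : lut[hi];            // otherwise the high nibble alone decides
//   }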
6095 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6096 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6097 vpaddb(dst, dst, xtmp2, vec_enc); 6098 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6099 } 6100 6101 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6102 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6103 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6104 // Add zero counts of lower byte and upper byte of a word if 6105 // upper byte holds a zero value. 6106 vpsrlw(xtmp3, src, 8, vec_enc); 6107 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6108 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6109 vpsllw(xtmp2, dst, 8, vec_enc); 6110 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6111 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6112 vpsrlw(dst, dst, 8, vec_enc); 6113 } 6114 6115 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6116 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6117 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6118 // hence biased exponent can be used to compute leading zero count as per 6119 // following formula:- 6120 // LZCNT = 31 - (biased_exp - 127) 6121 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6122 6123 // Broadcast 0xFF 6124 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6125 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6126 6127 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6128 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6129 // contributes to the leading number of zeros. 6130 vpsrld(xtmp2, src, 1, vec_enc); 6131 vpandn(xtmp3, xtmp2, src, vec_enc); 6132 6133 // Extract biased exponent. 6134 vcvtdq2ps(dst, xtmp3, vec_enc); 6135 vpsrld(dst, dst, 23, vec_enc); 6136 vpand(dst, dst, xtmp1, vec_enc); 6137 6138 // Broadcast 127. 6139 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6140 // Exponent = biased_exp - 127 6141 vpsubd(dst, dst, xtmp1, vec_enc); 6142 6143 // Exponent_plus_one = Exponent + 1 6144 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6145 vpaddd(dst, dst, xtmp3, vec_enc); 6146 6147 // Replace -ve exponent with zero, exponent is -ve when src 6148 // lane contains a zero value. 6149 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6150 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6151 6152 // Rematerialize broadcast 32. 6153 vpslld(xtmp1, xtmp3, 5, vec_enc); 6154 // Exponent is 32 if corresponding source lane contains max_int value. 6155 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6156 // LZCNT = 32 - exponent_plus_one 6157 vpsubd(dst, xtmp1, dst, vec_enc); 6158 6159 // Replace LZCNT with a value 1 if corresponding source lane 6160 // contains max_int value. 6161 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6162 6163 // Replace biased_exp with 0 if source lane value is less than zero. 6164 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6165 vblendvps(dst, dst, xtmp2, src, vec_enc); 6166 } 6167 6168 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6169 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6170 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6171 // Add zero counts of lower word and upper word of a double word if 6172 // upper word holds a zero value. 
6173 vpsrld(xtmp3, src, 16, vec_enc); 6174 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6175 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6176 vpslld(xtmp2, dst, 16, vec_enc); 6177 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6178 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6179 vpsrld(dst, dst, 16, vec_enc); 6180 // Add zero counts of lower doubleword and upper doubleword of a 6181 // quadword if upper doubleword holds a zero value. 6182 vpsrlq(xtmp3, src, 32, vec_enc); 6183 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6184 vpsllq(xtmp2, dst, 32, vec_enc); 6185 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6186 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6187 vpsrlq(dst, dst, 32, vec_enc); 6188 } 6189 6190 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6191 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6192 Register rtmp, int vec_enc) { 6193 assert(is_integral_type(bt), "unexpected type"); 6194 assert(vec_enc < Assembler::AVX_512bit, ""); 6195 switch(bt) { 6196 case T_LONG: 6197 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6198 break; 6199 case T_INT: 6200 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6201 break; 6202 case T_SHORT: 6203 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6204 break; 6205 case T_BYTE: 6206 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6207 break; 6208 default: 6209 fatal("Unsupported type %s", type2name(bt)); 6210 break; 6211 } 6212 } 6213 6214 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6215 switch(bt) { 6216 case T_BYTE: 6217 vpsubb(dst, src1, src2, vec_enc); 6218 break; 6219 case T_SHORT: 6220 vpsubw(dst, src1, src2, vec_enc); 6221 break; 6222 case T_INT: 6223 vpsubd(dst, src1, src2, vec_enc); 6224 break; 6225 case T_LONG: 6226 vpsubq(dst, src1, src2, vec_enc); 6227 break; 6228 default: 6229 fatal("Unsupported type %s", type2name(bt)); 6230 break; 6231 } 6232 } 6233 6234 // Trailing zero count computation is based on leading zero count operation as per 6235 // following equation. All AVX3 targets support AVX512CD feature which offers 6236 // direct vector instruction to compute leading zero count. 
6237 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6238 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6239 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6240 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6241 assert(is_integral_type(bt), ""); 6242 // xtmp = -1 6243 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6244 // xtmp = xtmp + src 6245 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6246 // xtmp = xtmp & ~src 6247 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6248 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6249 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6250 vpsub(bt, dst, xtmp4, dst, vec_enc); 6251 } 6252 6253 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6254 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6255 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6256 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6257 assert(is_integral_type(bt), ""); 6258 // xtmp = 0 6259 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6260 // xtmp = 0 - src 6261 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6262 // xtmp = xtmp | src 6263 vpor(xtmp3, xtmp3, src, vec_enc); 6264 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6265 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6266 vpsub(bt, dst, xtmp1, dst, vec_enc); 6267 } 6268 6269 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6270 Label done; 6271 Label neg_divisor_fastpath; 6272 cmpl(divisor, 0); 6273 jccb(Assembler::less, neg_divisor_fastpath); 6274 xorl(rdx, rdx); 6275 divl(divisor); 6276 jmpb(done); 6277 bind(neg_divisor_fastpath); 6278 // Fastpath for divisor < 0: 6279 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6280 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6281 movl(rdx, rax); 6282 subl(rdx, divisor); 6283 if (VM_Version::supports_bmi1()) { 6284 andnl(rax, rdx, rax); 6285 } else { 6286 notl(rdx); 6287 andl(rax, rdx); 6288 } 6289 shrl(rax, 31); 6290 bind(done); 6291 } 6292 6293 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6294 Label done; 6295 Label neg_divisor_fastpath; 6296 cmpl(divisor, 0); 6297 jccb(Assembler::less, neg_divisor_fastpath); 6298 xorl(rdx, rdx); 6299 divl(divisor); 6300 jmpb(done); 6301 bind(neg_divisor_fastpath); 6302 // Fastpath when divisor < 0: 6303 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6304 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6305 movl(rdx, rax); 6306 subl(rax, divisor); 6307 if (VM_Version::supports_bmi1()) { 6308 andnl(rax, rax, rdx); 6309 } else { 6310 notl(rax); 6311 andl(rax, rdx); 6312 } 6313 sarl(rax, 31); 6314 andl(rax, divisor); 6315 subl(rdx, rax); 6316 bind(done); 6317 } 6318 6319 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6320 Label done; 6321 Label neg_divisor_fastpath; 6322 6323 cmpl(divisor, 0); 6324 jccb(Assembler::less, neg_divisor_fastpath); 6325 xorl(rdx, rdx); 6326 divl(divisor); 6327 jmpb(done); 6328 bind(neg_divisor_fastpath); 6329 // Fastpath for divisor < 0: 6330 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6331 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6332 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6333 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6334 movl(rdx, rax); 6335 subl(rax, divisor); 6336 if (VM_Version::supports_bmi1()) { 6337 andnl(rax, rax, rdx); 6338 } else { 6339 notl(rax); 6340 andl(rax, rdx); 6341 } 6342 movl(tmp, rax); 6343 shrl(rax, 31); // quotient 6344 sarl(tmp, 31); 6345 andl(tmp, divisor); 6346 subl(rdx, tmp); // remainder 6347 bind(done); 6348 } 6349 6350 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6351 XMMRegister xtmp2, Register rtmp) { 6352 if(VM_Version::supports_gfni()) { 6353 // Galois field instruction based bit reversal based on following algorithm. 6354 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6355 mov64(rtmp, 0x8040201008040201L); 6356 movq(xtmp1, src); 6357 movq(xtmp2, rtmp); 6358 gf2p8affineqb(xtmp1, xtmp2, 0); 6359 movq(dst, xtmp1); 6360 } else { 6361 // Swap even and odd numbered bits. 6362 movl(rtmp, src); 6363 andl(rtmp, 0x55555555); 6364 shll(rtmp, 1); 6365 movl(dst, src); 6366 andl(dst, 0xAAAAAAAA); 6367 shrl(dst, 1); 6368 orl(dst, rtmp); 6369 6370 // Swap LSB and MSB 2 bits of each nibble. 6371 movl(rtmp, dst); 6372 andl(rtmp, 0x33333333); 6373 shll(rtmp, 2); 6374 andl(dst, 0xCCCCCCCC); 6375 shrl(dst, 2); 6376 orl(dst, rtmp); 6377 6378 // Swap LSB and MSB 4 bits of each byte. 6379 movl(rtmp, dst); 6380 andl(rtmp, 0x0F0F0F0F); 6381 shll(rtmp, 4); 6382 andl(dst, 0xF0F0F0F0); 6383 shrl(dst, 4); 6384 orl(dst, rtmp); 6385 } 6386 bswapl(dst); 6387 } 6388 6389 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6390 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6391 if(VM_Version::supports_gfni()) { 6392 // Galois field instruction based bit reversal based on following algorithm. 6393 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6394 mov64(rtmp1, 0x8040201008040201L); 6395 movq(xtmp1, src); 6396 movq(xtmp2, rtmp1); 6397 gf2p8affineqb(xtmp1, xtmp2, 0); 6398 movq(dst, xtmp1); 6399 } else { 6400 // Swap even and odd numbered bits. 6401 movq(rtmp1, src); 6402 mov64(rtmp2, 0x5555555555555555L); 6403 andq(rtmp1, rtmp2); 6404 shlq(rtmp1, 1); 6405 movq(dst, src); 6406 notq(rtmp2); 6407 andq(dst, rtmp2); 6408 shrq(dst, 1); 6409 orq(dst, rtmp1); 6410 6411 // Swap LSB and MSB 2 bits of each nibble. 6412 movq(rtmp1, dst); 6413 mov64(rtmp2, 0x3333333333333333L); 6414 andq(rtmp1, rtmp2); 6415 shlq(rtmp1, 2); 6416 notq(rtmp2); 6417 andq(dst, rtmp2); 6418 shrq(dst, 2); 6419 orq(dst, rtmp1); 6420 6421 // Swap LSB and MSB 4 bits of each byte. 
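// Together with the two stages above and the final bswapq, this last stage completes the
// classic divide-and-conquer bit reversal; an illustrative scalar form of the 64-bit path
// (a sketch only, not emitted code):
//   x = ((x & 0x5555555555555555ULL) << 1) | ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);
//   x = ((x & 0x3333333333333333ULL) << 2) | ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2);
//   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4);
//   x = bswap64(x);   // shorthand for the byte reversal performed by bswapq below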
6422 movq(rtmp1, dst); 6423 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6424 andq(rtmp1, rtmp2); 6425 shlq(rtmp1, 4); 6426 notq(rtmp2); 6427 andq(dst, rtmp2); 6428 shrq(dst, 4); 6429 orq(dst, rtmp1); 6430 } 6431 bswapq(dst); 6432 } 6433 6434 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6435 Label done; 6436 Label neg_divisor_fastpath; 6437 cmpq(divisor, 0); 6438 jccb(Assembler::less, neg_divisor_fastpath); 6439 xorl(rdx, rdx); 6440 divq(divisor); 6441 jmpb(done); 6442 bind(neg_divisor_fastpath); 6443 // Fastpath for divisor < 0: 6444 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6445 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6446 movq(rdx, rax); 6447 subq(rdx, divisor); 6448 if (VM_Version::supports_bmi1()) { 6449 andnq(rax, rdx, rax); 6450 } else { 6451 notq(rdx); 6452 andq(rax, rdx); 6453 } 6454 shrq(rax, 63); 6455 bind(done); 6456 } 6457 6458 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6459 Label done; 6460 Label neg_divisor_fastpath; 6461 cmpq(divisor, 0); 6462 jccb(Assembler::less, neg_divisor_fastpath); 6463 xorq(rdx, rdx); 6464 divq(divisor); 6465 jmp(done); 6466 bind(neg_divisor_fastpath); 6467 // Fastpath when divisor < 0: 6468 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6469 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6470 movq(rdx, rax); 6471 subq(rax, divisor); 6472 if (VM_Version::supports_bmi1()) { 6473 andnq(rax, rax, rdx); 6474 } else { 6475 notq(rax); 6476 andq(rax, rdx); 6477 } 6478 sarq(rax, 63); 6479 andq(rax, divisor); 6480 subq(rdx, rax); 6481 bind(done); 6482 } 6483 6484 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6485 Label done; 6486 Label neg_divisor_fastpath; 6487 cmpq(divisor, 0); 6488 jccb(Assembler::less, neg_divisor_fastpath); 6489 xorq(rdx, rdx); 6490 divq(divisor); 6491 jmp(done); 6492 bind(neg_divisor_fastpath); 6493 // Fastpath for divisor < 0: 6494 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6495 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6496 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6497 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6498 movq(rdx, rax); 6499 subq(rax, divisor); 6500 if (VM_Version::supports_bmi1()) { 6501 andnq(rax, rax, rdx); 6502 } else { 6503 notq(rax); 6504 andq(rax, rdx); 6505 } 6506 movq(tmp, rax); 6507 shrq(rax, 63); // quotient 6508 sarq(tmp, 63); 6509 andq(tmp, divisor); 6510 subq(rdx, tmp); // remainder 6511 bind(done); 6512 } 6513 6514 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6515 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6516 int vlen_enc) { 6517 assert(VM_Version::supports_avx512bw(), ""); 6518 // Byte shuffles are inlane operations and indices are determined using 6519 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6520 // normalized to index range 0-15. This makes sure that all the multiples 6521 // of an index value are placed at same relative position in 128 bit 6522 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6523 // will be 16th element in their respective 128 bit lanes. 
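// An illustrative scalar model of the selection performed below (a sketch, not emitted code):
//   dst[i] = lane(src, shuffle[i] >> 4)[shuffle[i] & 0x0F]
// where lane(src, k) denotes the k-th 128-bit lane of src. The four rounds of
// evpcmpb/evshufi64x2/evpshufb below handle the index ranges for k = 0, 1, 2 and 3 in turn.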
6524 movl(rtmp, 16); 6525 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6526 6527 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6528 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6529 // original shuffle indices and move the shuffled lanes corresponding to true 6530 // mask to destination vector. 6531 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6532 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6533 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6534 6535 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6536 // and broadcasting second 128 bit lane. 6537 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6538 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6539 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6540 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6541 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6542 6543 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6544 // and broadcasting third 128 bit lane. 6545 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6546 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6547 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6548 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6549 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6550 6551 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6552 // and broadcasting third 128 bit lane. 6553 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6554 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6555 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6556 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6557 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6558 } 6559 6560 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6561 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6562 if (vlen_enc == AVX_128bit) { 6563 vpermilps(dst, src, shuffle, vlen_enc); 6564 } else if (bt == T_INT) { 6565 vpermd(dst, shuffle, src, vlen_enc); 6566 } else { 6567 assert(bt == T_FLOAT, ""); 6568 vpermps(dst, shuffle, src, vlen_enc); 6569 } 6570 } 6571 6572 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6573 switch(opcode) { 6574 case Op_AddHF: vaddsh(dst, src1, src2); break; 6575 case Op_SubHF: vsubsh(dst, src1, src2); break; 6576 case Op_MulHF: vmulsh(dst, src1, src2); break; 6577 case Op_DivHF: vdivsh(dst, src1, src2); break; 6578 default: assert(false, "%s", NodeClassNames[opcode]); break; 6579 } 6580 } 6581 6582 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6583 switch(elem_bt) { 6584 case T_BYTE: 6585 if (ideal_opc == Op_SaturatingAddV) { 6586 vpaddsb(dst, src1, src2, vlen_enc); 6587 } else { 6588 assert(ideal_opc == Op_SaturatingSubV, ""); 6589 vpsubsb(dst, src1, src2, vlen_enc); 6590 } 6591 break; 6592 case T_SHORT: 6593 if (ideal_opc == Op_SaturatingAddV) { 6594 vpaddsw(dst, src1, src2, vlen_enc); 6595 } else { 6596 assert(ideal_opc == Op_SaturatingSubV, ""); 6597 vpsubsw(dst, src1, src2, vlen_enc); 6598 } 6599 break; 6600 default: 6601 fatal("Unsupported type %s", type2name(elem_bt)); 6602 break; 6603 } 6604 } 6605 6606 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 
XMMRegister src2, int vlen_enc) {
6607   switch(elem_bt) {
6608     case T_BYTE:
6609       if (ideal_opc == Op_SaturatingAddV) {
6610         vpaddusb(dst, src1, src2, vlen_enc);
6611       } else {
6612         assert(ideal_opc == Op_SaturatingSubV, "");
6613         vpsubusb(dst, src1, src2, vlen_enc);
6614       }
6615       break;
6616     case T_SHORT:
6617       if (ideal_opc == Op_SaturatingAddV) {
6618         vpaddusw(dst, src1, src2, vlen_enc);
6619       } else {
6620         assert(ideal_opc == Op_SaturatingSubV, "");
6621         vpsubusw(dst, src1, src2, vlen_enc);
6622       }
6623       break;
6624     default:
6625       fatal("Unsupported type %s", type2name(elem_bt));
6626       break;
6627   }
6628 }
6629
6630 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6631                                                                XMMRegister src2, KRegister ktmp, int vlen_enc) {
6632   // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6633   // overflow_mask = Inp1 <u Inp2
6634   evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6635   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6636   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6637 }
6638
6639 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6640                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6641   // Emulate unsigned comparison using signed comparison
6642   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6643   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6644   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6645   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6646
6647   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6648
6649   // Res = INP1 - INP2 (non-commutative and non-associative)
6650   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6651   // Res = Mask ? Zero : Res
6652   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6653   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6654 }
6655
6656 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6657                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6658   // The unsigned value range comprises only non-negative numbers, thus only an upper-bound saturation exists.
6659   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6660   // Res = Signed Add INP1, INP2
6661   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6662   // T1 = SRC1 | SRC2
6663   vpor(xtmp1, src1, src2, vlen_enc);
6664   // Max_Unsigned = -1
6665   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6666   // Unsigned compare: Mask = Res <u T1
6667   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6668   // res = Mask ? Max_Unsigned : Res
6669   evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6670 }
6671
6672 //
6673 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6674 // unsigned addition operation.
6675 //    overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6676 //
6677 // We empirically determined its semantic equivalence to the following reduced expression
6678 //    overflow_mask = (a + b) <u (a | b)
6679 //
6680 // and also verified it through the Alive2 solver.
6681 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6682 // 6683 6684 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6685 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6686 // Res = Signed Add INP1, INP2 6687 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6688 // Compute T1 = INP1 | INP2 6689 vpor(xtmp3, src1, src2, vlen_enc); 6690 // T1 = Minimum signed value. 6691 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6692 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6693 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6694 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6695 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6696 // Compute overflow detection mask = Res<1> <s T1 6697 if (elem_bt == T_INT) { 6698 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6699 } else { 6700 assert(elem_bt == T_LONG, ""); 6701 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6702 } 6703 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6704 } 6705 6706 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6707 int vlen_enc, bool xtmp2_hold_M1) { 6708 if (VM_Version::supports_avx512dq()) { 6709 evpmovq2m(ktmp, src, vlen_enc); 6710 } else { 6711 assert(VM_Version::supports_evex(), ""); 6712 if (!xtmp2_hold_M1) { 6713 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6714 } 6715 evpsraq(xtmp1, src, 63, vlen_enc); 6716 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6717 } 6718 } 6719 6720 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6721 int vlen_enc, bool xtmp2_hold_M1) { 6722 if (VM_Version::supports_avx512dq()) { 6723 evpmovd2m(ktmp, src, vlen_enc); 6724 } else { 6725 assert(VM_Version::supports_evex(), ""); 6726 if (!xtmp2_hold_M1) { 6727 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6728 } 6729 vpsrad(xtmp1, src, 31, vlen_enc); 6730 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6731 } 6732 } 6733 6734 6735 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6736 if (elem_bt == T_LONG) { 6737 if (VM_Version::supports_evex()) { 6738 evpsraq(dst, src, 63, vlen_enc); 6739 } else { 6740 vpsrad(dst, src, 31, vlen_enc); 6741 vpshufd(dst, dst, 0xF5, vlen_enc); 6742 } 6743 } else { 6744 assert(elem_bt == T_INT, ""); 6745 vpsrad(dst, src, 31, vlen_enc); 6746 } 6747 } 6748 6749 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6750 if (compute_allones) { 6751 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6752 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6753 } else { 6754 vpcmpeqq(allones, allones, allones, vlen_enc); 6755 } 6756 } 6757 if (elem_bt == T_LONG) { 6758 vpsrlq(dst, allones, 1, vlen_enc); 6759 } else { 6760 assert(elem_bt == T_INT, ""); 6761 vpsrld(dst, allones, 1, vlen_enc); 6762 } 6763 } 6764 6765 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6766 if (compute_allones) { 6767 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6768 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6769 } else { 6770 vpcmpeqq(allones, allones, allones, vlen_enc); 6771 } 6772 } 6773 if (elem_bt == T_LONG) { 6774 vpsllq(dst, allones, 63, vlen_enc); 6775 } else { 6776 assert(elem_bt == T_INT, ""); 6777 
vpslld(dst, allones, 31, vlen_enc); 6778 } 6779 } 6780 6781 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6782 Assembler::ComparisonPredicate cond, int vlen_enc) { 6783 switch(elem_bt) { 6784 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6785 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6786 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6787 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6788 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6789 } 6790 } 6791 6792 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6793 switch(elem_bt) { 6794 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break; 6795 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break; 6796 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break; 6797 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break; 6798 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6799 } 6800 } 6801 6802 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1, 6803 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) { 6804 if (elem_bt == T_LONG) { 6805 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6806 } else { 6807 assert(elem_bt == T_INT, ""); 6808 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6809 } 6810 } 6811 6812 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6813 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6814 KRegister ktmp1, KRegister ktmp2, int vlen_enc) { 6815 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6816 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness. 6817 // Overflow detection based on Hacker's delight section 2-13. 6818 if (ideal_opc == Op_SaturatingAddV) { 6819 // res = src1 + src2 6820 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6821 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6822 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6823 vpxor(xtmp1, dst, src1, vlen_enc); 6824 vpxor(xtmp2, dst, src2, vlen_enc); 6825 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6826 } else { 6827 assert(ideal_opc == Op_SaturatingSubV, ""); 6828 // res = src1 - src2 6829 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6830 // Overflow occurs when both inputs have opposite polarity and 6831 // result polarity does not comply with first input polarity. 6832 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6833 vpxor(xtmp1, src1, src2, vlen_enc); 6834 vpxor(xtmp2, dst, src1, vlen_enc); 6835 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6836 } 6837 6838 // Compute overflow detection mask. 6839 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc); 6840 // Note: xtmp1 hold -1 in all its lanes after above call. 6841 6842 // Compute mask based on first input polarity. 6843 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true); 6844 6845 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6846 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6847 6848 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to 6849 // set bits in first input polarity mask holds a min value. 
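// An illustrative scalar form of the two blends below (a sketch, not emitted code):
//   sat = (src1 < 0) ? MIN_VALUE : MAX_VALUE;   // saturate toward the sign of src1
//   dst = overflow   ? sat       : res;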
6850   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6851   // Blend destination lanes with saturated values using overflow detection mask.
6852   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6853 }
6854
6855
6856 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6857                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6858                                                          XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6859   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6860   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6861   // Overflow detection is based on Hacker's Delight, section 2-13.
6862   if (ideal_opc == Op_SaturatingAddV) {
6863     // res = src1 + src2
6864     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6865     // Overflow occurs when both inputs have the same sign and the result's sign differs from it.
6866     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6867     vpxor(xtmp1, dst, src1, vlen_enc);
6868     vpxor(xtmp2, dst, src2, vlen_enc);
6869     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6870   } else {
6871     assert(ideal_opc == Op_SaturatingSubV, "");
6872     // res = src1 - src2
6873     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6874     // Overflow occurs when the inputs have opposite signs and the
6875     // result's sign differs from the sign of the first input.
6876     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6877     vpxor(xtmp1, src1, src2, vlen_enc);
6878     vpxor(xtmp2, dst, src1, vlen_enc);
6879     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6880   }
6881
6882   // Sign-extend to compute overflow detection mask.
6883   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6884
6885   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6886   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6887   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6888
6889   // Compose saturating min/max vector using first input polarity mask.
6890   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6891   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6892
6893   // Blend the result with the saturating vector using the overflow detection mask.
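// At this point xtmp1 holds MIN_VALUE in lanes where src1 is negative and MAX_VALUE in the
// remaining lanes, while xtmp3 is all ones exactly in the overflowing lanes, so the blend
// below keeps the computed result except in overflowing lanes, which receive the saturated value.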
6894 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6895 } 6896 6897 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6898 switch(elem_bt) { 6899 case T_BYTE: 6900 if (ideal_opc == Op_SaturatingAddV) { 6901 vpaddsb(dst, src1, src2, vlen_enc); 6902 } else { 6903 assert(ideal_opc == Op_SaturatingSubV, ""); 6904 vpsubsb(dst, src1, src2, vlen_enc); 6905 } 6906 break; 6907 case T_SHORT: 6908 if (ideal_opc == Op_SaturatingAddV) { 6909 vpaddsw(dst, src1, src2, vlen_enc); 6910 } else { 6911 assert(ideal_opc == Op_SaturatingSubV, ""); 6912 vpsubsw(dst, src1, src2, vlen_enc); 6913 } 6914 break; 6915 default: 6916 fatal("Unsupported type %s", type2name(elem_bt)); 6917 break; 6918 } 6919 } 6920 6921 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6922 switch(elem_bt) { 6923 case T_BYTE: 6924 if (ideal_opc == Op_SaturatingAddV) { 6925 vpaddusb(dst, src1, src2, vlen_enc); 6926 } else { 6927 assert(ideal_opc == Op_SaturatingSubV, ""); 6928 vpsubusb(dst, src1, src2, vlen_enc); 6929 } 6930 break; 6931 case T_SHORT: 6932 if (ideal_opc == Op_SaturatingAddV) { 6933 vpaddusw(dst, src1, src2, vlen_enc); 6934 } else { 6935 assert(ideal_opc == Op_SaturatingSubV, ""); 6936 vpsubusw(dst, src1, src2, vlen_enc); 6937 } 6938 break; 6939 default: 6940 fatal("Unsupported type %s", type2name(elem_bt)); 6941 break; 6942 } 6943 } 6944 6945 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6946 XMMRegister src2, int vlen_enc) { 6947 switch(elem_bt) { 6948 case T_BYTE: 6949 evpermi2b(dst, src1, src2, vlen_enc); 6950 break; 6951 case T_SHORT: 6952 evpermi2w(dst, src1, src2, vlen_enc); 6953 break; 6954 case T_INT: 6955 evpermi2d(dst, src1, src2, vlen_enc); 6956 break; 6957 case T_LONG: 6958 evpermi2q(dst, src1, src2, vlen_enc); 6959 break; 6960 case T_FLOAT: 6961 evpermi2ps(dst, src1, src2, vlen_enc); 6962 break; 6963 case T_DOUBLE: 6964 evpermi2pd(dst, src1, src2, vlen_enc); 6965 break; 6966 default: 6967 fatal("Unsupported type %s", type2name(elem_bt)); 6968 break; 6969 } 6970 } 6971 6972 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 6973 if (is_unsigned) { 6974 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6975 } else { 6976 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6977 } 6978 } 6979 6980 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 6981 if (is_unsigned) { 6982 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6983 } else { 6984 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6985 } 6986 } 6987 6988 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6989 switch(opcode) { 6990 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 6991 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 6992 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 6993 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 6994 default: assert(false, "%s", NodeClassNames[opcode]); break; 6995 } 6996 } 6997 6998 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
XMMRegister src1, Address src2, int vlen_enc) {
6999   switch(opcode) {
7000     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7001     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7002     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7003     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7004     default: assert(false, "%s", NodeClassNames[opcode]); break;
7005   }
7006 }
7007
7008 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7009                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7010   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7011 }
7012
7013 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7014                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7015   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7016     // Move sign bits of src2 to mask register.
7017     evpmovw2m(ktmp, src2, vlen_enc);
7018     // xtmp1 = src2 < 0 ? src2 : src1
7019     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7020     // xtmp2 = src2 < 0 ? src1 : src2
7021     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7022     // The idea behind the above swapping is to make the second source operand a +ve value.
7023     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7024     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7025     // the second source operand, either a NaN or a valid floating-point value, is returned.
7026     // dst = max(xtmp1, xtmp2)
7027     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7028     // isNaN = is_unordered_quiet(xtmp1)
7029     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7030     // The final result is the same as the first source if it is a NaN value;
7031     // in case the second operand holds a NaN value then, as per the above semantics,
7032     // the result is the same as the second operand.
7033     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7034   } else {
7035     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7036     // Move sign bits of src1 to mask register.
7037     evpmovw2m(ktmp, src1, vlen_enc);
7038     // xtmp1 = src1 < 0 ? src2 : src1
7039     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7040     // xtmp2 = src1 < 0 ? src1 : src2
7041     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7042     // The idea behind the above swapping is to make the second source operand a -ve value.
7043     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7044     // the second source operand is returned.
7045     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7046     // or a valid floating-point value, is written to the result.
7047     // dst = min(xtmp1, xtmp2)
7048     evminph(dst, xtmp1, xtmp2, vlen_enc);
7049     // isNaN = is_unordered_quiet(xtmp1)
7050     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7051     // The final result is the same as the first source if it is a NaN value;
7052     // in case the second operand holds a NaN value then, as per the above semantics,
7053     // the result is the same as the second operand.
7054     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7055   }
7056 }
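// An illustrative scalar model of the Max path above (the Min path is symmetric); this is a
// sketch only, not emitted code:
//   a = (src2 < 0) ? src2 : src1;   // after the sign-based swap
//   b = (src2 < 0) ? src1 : src2;
//   r = max(a, b);                  // the instruction returns b on +/-0.0 ties and when a is NaN
//   dst = isNaN(a) ? a : r;         // so a NaN in either input propagates to the result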