/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
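    // (Note: once the prolog completes, the frame layout from high to low
    //  addresses is [return address][saved rbp][frame body ...] <- rsp, and
    //  with PreserveFramePointer rbp is left pointing at the saved-rbp slot,
    //  which is the value reconstruct_frame_pointer() recomputes later in
    //  this file.)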
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->_owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
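//
// Illustrative example (added commentary, not emitted code): a provably
// balanced site that C2 compiles with fast_lock/fast_unlock is the plain
// synchronized block
//
//     void A(Object o) {
//       synchronized (o) {   // fast_lock on entry
//         B(o);              // B() may run interpreted; I1/I2 keep things balanced
//       }                    // fast_unlock on exit
//     }
//
// whereas JNI MonitorEnter/MonitorExit pairs fall outside this contract,
// which is why the owner check can be elided here.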

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);  // Examine the displaced header
    jcc   (Assembler::zero, COUNT);         // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value); // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
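  // (Added note: at this point the entry_list is non-empty but no successor
  //  has been designated, so a contending thread may have parked itself after
  //  we cleared _owner above. Recording the monitor here lets the slow path
  //  re-lock and exit it properly so that waiter is not stranded.)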
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);

  orl (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);

  bind (LSuccess);
  testl(boxReg, 0);  // set ICC.ZF=1 to indicate success
  jmpb (DONE_LABEL);

  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);

  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
  }

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}

void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;
  if (hi == max_jint) {
    cmpl(val, lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jint) {
      cmpl(val, lo);
      jccb(Assembler::less, fail);
    }
    cmpl(val, hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (hi == max_jlong) {
    cmp_val(lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jlong) {
      cmp_val(lo);
      jccb(Assembler::less, fail);
    }
    cmp_val(hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
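  // On targets without AVX512VL, the else-branch below emulates an unsigned
  // 64-bit compare with signed instructions: adding 2^63 to both operands
  // flips their sign bits, so vpcmpgtq on the biased values yields the
  // unsigned ordering, which then drives the blend.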
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
vpinsrb(dst, src, val, idx); 1578 break; 1579 case T_SHORT: 1580 vpinsrw(dst, src, val, idx); 1581 break; 1582 case T_INT: 1583 vpinsrd(dst, src, val, idx); 1584 break; 1585 case T_LONG: 1586 vpinsrq(dst, src, val, idx); 1587 break; 1588 default: 1589 assert(false,"Should not reach here."); 1590 break; 1591 } 1592 } 1593 1594 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1595 XMMRegister dst, Register base, 1596 Register idx_base, 1597 Register offset, Register mask, 1598 Register mask_idx, Register rtmp, 1599 int vlen_enc) { 1600 vpxor(dst, dst, dst, vlen_enc); 1601 if (elem_bt == T_SHORT) { 1602 for (int i = 0; i < 4; i++) { 1603 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1604 Label skip_load; 1605 btq(mask, mask_idx); 1606 jccb(Assembler::carryClear, skip_load); 1607 movl(rtmp, Address(idx_base, i * 4)); 1608 if (offset != noreg) { 1609 addl(rtmp, offset); 1610 } 1611 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1612 bind(skip_load); 1613 incq(mask_idx); 1614 } 1615 } else { 1616 assert(elem_bt == T_BYTE, ""); 1617 for (int i = 0; i < 8; i++) { 1618 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1619 Label skip_load; 1620 btq(mask, mask_idx); 1621 jccb(Assembler::carryClear, skip_load); 1622 movl(rtmp, Address(idx_base, i * 4)); 1623 if (offset != noreg) { 1624 addl(rtmp, offset); 1625 } 1626 pinsrb(dst, Address(base, rtmp), i); 1627 bind(skip_load); 1628 incq(mask_idx); 1629 } 1630 } 1631 } 1632 1633 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1634 Register base, Register idx_base, 1635 Register offset, Register rtmp, 1636 int vlen_enc) { 1637 vpxor(dst, dst, dst, vlen_enc); 1638 if (elem_bt == T_SHORT) { 1639 for (int i = 0; i < 4; i++) { 1640 // dst[i] = src[offset + idx_base[i]] 1641 movl(rtmp, Address(idx_base, i * 4)); 1642 if (offset != noreg) { 1643 addl(rtmp, offset); 1644 } 1645 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1646 } 1647 } else { 1648 assert(elem_bt == T_BYTE, ""); 1649 for (int i = 0; i < 8; i++) { 1650 // dst[i] = src[offset + idx_base[i]] 1651 movl(rtmp, Address(idx_base, i * 4)); 1652 if (offset != noreg) { 1653 addl(rtmp, offset); 1654 } 1655 pinsrb(dst, Address(base, rtmp), i); 1656 } 1657 } 1658 } 1659 1660 /* 1661 * Gather using hybrid algorithm, first partially unroll scalar loop 1662 * to accumulate values from gather indices into a quad-word(64bit) slice. 1663 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1664 * permutation to place the slice into appropriate vector lane 1665 * locations in destination vector. Following pseudo code describes the 1666 * algorithm in detail: 1667 * 1668 * DST_VEC = ZERO_VEC 1669 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1670 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1671 * FOREACH_ITER: 1672 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1673 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1674 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1675 * PERM_INDEX = PERM_INDEX - TWO_VEC 1676 * 1677 * With each iteration, doubleword permute indices (0,1) corresponding 1678 * to gathered quadword gets right shifted by two lane positions. 
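 *
 * For example (illustrative): a 256-bit gather of T_SHORT runs the loop
 * with vector_len == 16, consuming 4 indices per pass, so each of the 4
 * passes gathers one 8-byte slice of 4 shorts; the decremented PERM_INDEX
 * steers every new slice two doubleword lanes further up DST_VEC before
 * it is ORed in, filling lanes 0..3, then 4..7, 8..11 and 12..15.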
1679 * 1680 */ 1681 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1682 Register base, Register idx_base, 1683 Register offset, Register mask, 1684 XMMRegister xtmp1, XMMRegister xtmp2, 1685 XMMRegister temp_dst, Register rtmp, 1686 Register mask_idx, Register length, 1687 int vector_len, int vlen_enc) { 1688 Label GATHER8_LOOP; 1689 assert(is_subword_type(elem_ty), ""); 1690 movl(length, vector_len); 1691 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1692 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1693 vallones(xtmp2, vlen_enc); 1694 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1695 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1696 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1697 1698 bind(GATHER8_LOOP); 1699 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1700 if (mask == noreg) { 1701 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1702 } else { 1703 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1704 } 1705 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1706 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1707 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1708 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1709 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1710 vpor(dst, dst, temp_dst, vlen_enc); 1711 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1712 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1713 jcc(Assembler::notEqual, GATHER8_LOOP); 1714 } 1715 1716 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1717 switch(typ) { 1718 case T_INT: 1719 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1720 break; 1721 case T_FLOAT: 1722 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1723 break; 1724 case T_LONG: 1725 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1726 break; 1727 case T_DOUBLE: 1728 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1729 break; 1730 default: 1731 assert(false,"Should not reach here."); 1732 break; 1733 } 1734 } 1735 1736 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1737 switch(typ) { 1738 case T_INT: 1739 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1740 break; 1741 case T_FLOAT: 1742 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1743 break; 1744 case T_LONG: 1745 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1746 break; 1747 case T_DOUBLE: 1748 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1749 break; 1750 default: 1751 assert(false,"Should not reach here."); 1752 break; 1753 } 1754 } 1755 1756 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1757 switch(typ) { 1758 case T_INT: 1759 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1760 break; 1761 case T_FLOAT: 1762 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1763 break; 1764 case T_LONG: 1765 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1766 break; 1767 case T_DOUBLE: 1768 
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1769 break; 1770 default: 1771 assert(false,"Should not reach here."); 1772 break; 1773 } 1774 } 1775 1776 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1777 if (vlen_in_bytes <= 16) { 1778 pxor (dst, dst); 1779 psubb(dst, src); 1780 switch (elem_bt) { 1781 case T_BYTE: /* nothing to do */ break; 1782 case T_SHORT: pmovsxbw(dst, dst); break; 1783 case T_INT: pmovsxbd(dst, dst); break; 1784 case T_FLOAT: pmovsxbd(dst, dst); break; 1785 case T_LONG: pmovsxbq(dst, dst); break; 1786 case T_DOUBLE: pmovsxbq(dst, dst); break; 1787 1788 default: assert(false, "%s", type2name(elem_bt)); 1789 } 1790 } else { 1791 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1792 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1793 1794 vpxor (dst, dst, dst, vlen_enc); 1795 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1796 1797 switch (elem_bt) { 1798 case T_BYTE: /* nothing to do */ break; 1799 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1800 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1801 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1802 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1803 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1804 1805 default: assert(false, "%s", type2name(elem_bt)); 1806 } 1807 } 1808 } 1809 1810 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1811 if (novlbwdq) { 1812 vpmovsxbd(xtmp, src, vlen_enc); 1813 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1814 Assembler::eq, true, vlen_enc, noreg); 1815 } else { 1816 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1817 vpsubb(xtmp, xtmp, src, vlen_enc); 1818 evpmovb2m(dst, xtmp, vlen_enc); 1819 } 1820 } 1821 1822 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1823 if (is_integral_type(bt)) { 1824 switch (vlen_in_bytes) { 1825 case 4: movdl(dst, src); break; 1826 case 8: movq(dst, src); break; 1827 case 16: movdqu(dst, src); break; 1828 case 32: vmovdqu(dst, src); break; 1829 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1830 default: ShouldNotReachHere(); 1831 } 1832 } else { 1833 switch (vlen_in_bytes) { 1834 case 4: movflt(dst, src); break; 1835 case 8: movdbl(dst, src); break; 1836 case 16: movups(dst, src); break; 1837 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1838 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1839 default: ShouldNotReachHere(); 1840 } 1841 } 1842 } 1843 1844 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1845 assert(rscratch != noreg || always_reachable(src), "missing"); 1846 1847 if (reachable(src)) { 1848 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1849 } else { 1850 lea(rscratch, src); 1851 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1852 } 1853 } 1854 1855 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1856 int vlen_enc = vector_length_encoding(vlen); 1857 if (VM_Version::supports_avx()) { 1858 if (bt == T_LONG) { 1859 if (VM_Version::supports_avx2()) { 1860 vpbroadcastq(dst, src, vlen_enc); 1861 } else { 1862 vmovddup(dst, src, vlen_enc); 1863 } 1864 } else if (bt == T_DOUBLE) { 1865 if (vlen_enc != 
Assembler::AVX_128bit) { 1866 vbroadcastsd(dst, src, vlen_enc, noreg); 1867 } else { 1868 vmovddup(dst, src, vlen_enc); 1869 } 1870 } else { 1871 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1872 vpbroadcastd(dst, src, vlen_enc); 1873 } else { 1874 vbroadcastss(dst, src, vlen_enc); 1875 } 1876 } 1877 } else if (VM_Version::supports_sse3()) { 1878 movddup(dst, src); 1879 } else { 1880 load_vector(bt, dst, src, vlen); 1881 } 1882 } 1883 1884 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1885 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1886 int offset = exact_log2(type2aelembytes(bt)) << 6; 1887 if (is_floating_point_type(bt)) { 1888 offset += 128; 1889 } 1890 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1891 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1892 } 1893 1894 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1895 1896 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1897 int vector_len = Assembler::AVX_128bit; 1898 1899 switch (opcode) { 1900 case Op_AndReductionV: pand(dst, src); break; 1901 case Op_OrReductionV: por (dst, src); break; 1902 case Op_XorReductionV: pxor(dst, src); break; 1903 case Op_MinReductionV: 1904 switch (typ) { 1905 case T_BYTE: pminsb(dst, src); break; 1906 case T_SHORT: pminsw(dst, src); break; 1907 case T_INT: pminsd(dst, src); break; 1908 case T_LONG: assert(UseAVX > 2, "required"); 1909 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1910 default: assert(false, "wrong type"); 1911 } 1912 break; 1913 case Op_MaxReductionV: 1914 switch (typ) { 1915 case T_BYTE: pmaxsb(dst, src); break; 1916 case T_SHORT: pmaxsw(dst, src); break; 1917 case T_INT: pmaxsd(dst, src); break; 1918 case T_LONG: assert(UseAVX > 2, "required"); 1919 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1920 default: assert(false, "wrong type"); 1921 } 1922 break; 1923 case Op_AddReductionVF: addss(dst, src); break; 1924 case Op_AddReductionVD: addsd(dst, src); break; 1925 case Op_AddReductionVI: 1926 switch (typ) { 1927 case T_BYTE: paddb(dst, src); break; 1928 case T_SHORT: paddw(dst, src); break; 1929 case T_INT: paddd(dst, src); break; 1930 default: assert(false, "wrong type"); 1931 } 1932 break; 1933 case Op_AddReductionVL: paddq(dst, src); break; 1934 case Op_MulReductionVF: mulss(dst, src); break; 1935 case Op_MulReductionVD: mulsd(dst, src); break; 1936 case Op_MulReductionVI: 1937 switch (typ) { 1938 case T_SHORT: pmullw(dst, src); break; 1939 case T_INT: pmulld(dst, src); break; 1940 default: assert(false, "wrong type"); 1941 } 1942 break; 1943 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1944 evpmullq(dst, dst, src, vector_len); break; 1945 default: assert(false, "wrong opcode"); 1946 } 1947 } 1948 1949 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1950 switch (opcode) { 1951 case Op_AddReductionVF: addps(dst, src); break; 1952 case Op_AddReductionVD: addpd(dst, src); break; 1953 case Op_MulReductionVF: mulps(dst, src); break; 1954 case Op_MulReductionVD: mulpd(dst, src); break; 1955 default: assert(false, "%s", NodeClassNames[opcode]); 1956 } 1957 } 1958 1959 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1960 int vector_len = Assembler::AVX_256bit; 1961 1962 switch (opcode) { 1963 
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1964 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1965 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1966 case Op_MinReductionV: 1967 switch (typ) { 1968 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1969 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1970 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1971 case T_LONG: assert(UseAVX > 2, "required"); 1972 vpminsq(dst, src1, src2, vector_len); break; 1973 default: assert(false, "wrong type"); 1974 } 1975 break; 1976 case Op_MaxReductionV: 1977 switch (typ) { 1978 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1979 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1980 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1981 case T_LONG: assert(UseAVX > 2, "required"); 1982 vpmaxsq(dst, src1, src2, vector_len); break; 1983 default: assert(false, "wrong type"); 1984 } 1985 break; 1986 case Op_AddReductionVI: 1987 switch (typ) { 1988 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1989 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1990 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1991 default: assert(false, "wrong type"); 1992 } 1993 break; 1994 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1995 case Op_MulReductionVI: 1996 switch (typ) { 1997 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1998 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1999 default: assert(false, "wrong type"); 2000 } 2001 break; 2002 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2003 default: assert(false, "wrong opcode"); 2004 } 2005 } 2006 2007 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2008 int vector_len = Assembler::AVX_256bit; 2009 2010 switch (opcode) { 2011 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 2012 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 2013 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 2014 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 2015 default: assert(false, "%s", NodeClassNames[opcode]); 2016 } 2017 } 2018 2019 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2020 XMMRegister dst, XMMRegister src, 2021 XMMRegister vtmp1, XMMRegister vtmp2) { 2022 switch (opcode) { 2023 case Op_AddReductionVF: 2024 case Op_MulReductionVF: 2025 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2026 break; 2027 2028 case Op_AddReductionVD: 2029 case Op_MulReductionVD: 2030 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2031 break; 2032 2033 default: assert(false, "wrong opcode"); 2034 } 2035 } 2036 2037 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2038 XMMRegister dst, XMMRegister src, 2039 XMMRegister vtmp1, XMMRegister vtmp2) { 2040 switch (opcode) { 2041 case Op_AddReductionVF: 2042 case Op_MulReductionVF: 2043 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2044 break; 2045 2046 case Op_AddReductionVD: 2047 case Op_MulReductionVD: 2048 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2049 break; 2050 2051 default: assert(false, "%s", NodeClassNames[opcode]); 2052 } 2053 } 2054 2055 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2056 Register dst, Register src1, XMMRegister src2, 2057 XMMRegister vtmp1, XMMRegister vtmp2) { 2058 switch (vlen) { 2059 case 8: reduce8B 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2060 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2061 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2062 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2063 2064 default: assert(false, "wrong vector length"); 2065 } 2066 } 2067 2068 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2069 Register dst, Register src1, XMMRegister src2, 2070 XMMRegister vtmp1, XMMRegister vtmp2) { 2071 switch (vlen) { 2072 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2073 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2074 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2075 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2076 2077 default: assert(false, "wrong vector length"); 2078 } 2079 } 2080 2081 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2082 Register dst, Register src1, XMMRegister src2, 2083 XMMRegister vtmp1, XMMRegister vtmp2) { 2084 switch (vlen) { 2085 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2086 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2087 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2088 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2089 2090 default: assert(false, "wrong vector length"); 2091 } 2092 } 2093 2094 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2095 Register dst, Register src1, XMMRegister src2, 2096 XMMRegister vtmp1, XMMRegister vtmp2) { 2097 switch (vlen) { 2098 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2099 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2100 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2101 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2102 2103 default: assert(false, "wrong vector length"); 2104 } 2105 } 2106 2107 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2108 Register dst, Register src1, XMMRegister src2, 2109 XMMRegister vtmp1, XMMRegister vtmp2) { 2110 switch (vlen) { 2111 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2112 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2113 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2114 2115 default: assert(false, "wrong vector length"); 2116 } 2117 } 2118 2119 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2120 switch (vlen) { 2121 case 2: 2122 assert(vtmp2 == xnoreg, ""); 2123 reduce2F(opcode, dst, src, vtmp1); 2124 break; 2125 case 4: 2126 assert(vtmp2 == xnoreg, ""); 2127 reduce4F(opcode, dst, src, vtmp1); 2128 break; 2129 case 8: 2130 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2131 break; 2132 case 16: 2133 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2134 break; 2135 default: assert(false, "wrong vector length"); 2136 } 2137 } 2138 2139 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 switch (vlen) { 2141 case 2: 2142 assert(vtmp2 == xnoreg, ""); 2143 reduce2D(opcode, dst, src, vtmp1); 2144 break; 2145 case 4: 2146 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2147 break; 2148 case 8: 2149 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2150 break; 2151 default: assert(false, "wrong vector length"); 2152 } 2153 } 2154 2155 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2156 switch (vlen) { 2157 case 2: 2158 assert(vtmp1 == xnoreg, ""); 2159 assert(vtmp2 == xnoreg, ""); 2160 unorderedReduce2F(opcode, dst, src); 2161 break; 2162 case 4: 2163 assert(vtmp2 == xnoreg, ""); 2164 unorderedReduce4F(opcode, dst, src, vtmp1); 2165 break; 2166 case 8: 2167 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2168 break; 2169 case 16: 2170 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2171 break; 2172 default: assert(false, "wrong vector length"); 2173 } 2174 } 2175 2176 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2177 switch (vlen) { 2178 case 2: 2179 assert(vtmp1 == xnoreg, ""); 2180 assert(vtmp2 == xnoreg, ""); 2181 unorderedReduce2D(opcode, dst, src); 2182 break; 2183 case 4: 2184 assert(vtmp2 == xnoreg, ""); 2185 unorderedReduce4D(opcode, dst, src, vtmp1); 2186 break; 2187 case 8: 2188 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2189 break; 2190 default: assert(false, "wrong vector length"); 2191 } 2192 } 2193 2194 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2195 if (opcode == Op_AddReductionVI) { 2196 if (vtmp1 != src2) { 2197 movdqu(vtmp1, src2); 2198 } 2199 phaddd(vtmp1, vtmp1); 2200 } else { 2201 pshufd(vtmp1, src2, 0x1); 2202 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2203 } 2204 movdl(vtmp2, src1); 2205 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2206 movdl(dst, vtmp1); 2207 } 2208 2209 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2210 if (opcode == Op_AddReductionVI) { 2211 if (vtmp1 != src2) { 2212 movdqu(vtmp1, src2); 2213 } 2214 phaddd(vtmp1, src2); 2215 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2216 } else { 2217 pshufd(vtmp2, src2, 0xE); 2218 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2219 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2220 } 2221 } 2222 2223 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2224 if (opcode == Op_AddReductionVI) { 2225 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2226 vextracti128_high(vtmp2, vtmp1); 2227 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2228 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2229 } else { 2230 vextracti128_high(vtmp1, src2); 2231 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2232 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2233 } 2234 } 2235 2236 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2237 vextracti64x4_high(vtmp2, src2); 2238 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2239 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2240 } 2241 2242 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2243 pshufd(vtmp2, src2, 0x1); 2244 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2245 movdqu(vtmp1, vtmp2); 2246 psrldq(vtmp1, 2); 2247 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2248 movdqu(vtmp2, vtmp1); 2249 psrldq(vtmp2, 1); 2250 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2251 movdl(vtmp2, src1); 2252 pmovsxbd(vtmp1, vtmp1); 2253 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2254 pextrb(dst, vtmp1, 0x0); 2255 movsbl(dst, dst); 2256 
} 2257 2258 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2259 pshufd(vtmp1, src2, 0xE); 2260 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2261 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2262 } 2263 2264 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2265 vextracti128_high(vtmp2, src2); 2266 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2267 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2268 } 2269 2270 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2271 vextracti64x4_high(vtmp1, src2); 2272 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2273 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2274 } 2275 2276 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2277 pmovsxbw(vtmp2, src2); 2278 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2279 } 2280 2281 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2282 if (UseAVX > 1) { 2283 int vector_len = Assembler::AVX_256bit; 2284 vpmovsxbw(vtmp1, src2, vector_len); 2285 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2286 } else { 2287 pmovsxbw(vtmp2, src2); 2288 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2289 pshufd(vtmp2, src2, 0x1); 2290 pmovsxbw(vtmp2, src2); 2291 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2292 } 2293 } 2294 2295 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2296 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2297 int vector_len = Assembler::AVX_512bit; 2298 vpmovsxbw(vtmp1, src2, vector_len); 2299 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2300 } else { 2301 assert(UseAVX >= 2,"Should not reach here."); 2302 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2303 vextracti128_high(vtmp2, src2); 2304 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2305 } 2306 } 2307 2308 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2309 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2310 vextracti64x4_high(vtmp2, src2); 2311 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2312 } 2313 2314 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2315 if (opcode == Op_AddReductionVI) { 2316 if (vtmp1 != src2) { 2317 movdqu(vtmp1, src2); 2318 } 2319 phaddw(vtmp1, vtmp1); 2320 phaddw(vtmp1, vtmp1); 2321 } else { 2322 pshufd(vtmp2, src2, 0x1); 2323 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2324 movdqu(vtmp1, vtmp2); 2325 psrldq(vtmp1, 2); 2326 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2327 } 2328 movdl(vtmp2, src1); 2329 pmovsxwd(vtmp1, vtmp1); 2330 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2331 pextrw(dst, vtmp1, 0x0); 2332 movswl(dst, dst); 2333 } 2334 2335 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2336 if (opcode == Op_AddReductionVI) { 2337 if (vtmp1 != src2) { 2338 movdqu(vtmp1, src2); 2339 } 2340 phaddw(vtmp1, src2); 2341 } else { 2342 pshufd(vtmp1, src2, 0xE); 2343 
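// Note: pshufd with 0xE copies the upper 64 bits of src2 into the low half of vtmp1,
// so the 128-bit reduction below combines elements 4..7 with elements 0..3 before the
// final reduce4S step.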
reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2344 } 2345 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2346 } 2347 2348 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2349 if (opcode == Op_AddReductionVI) { 2350 int vector_len = Assembler::AVX_256bit; 2351 vphaddw(vtmp2, src2, src2, vector_len); 2352 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2353 } else { 2354 vextracti128_high(vtmp2, src2); 2355 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2356 } 2357 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2358 } 2359 2360 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2361 int vector_len = Assembler::AVX_256bit; 2362 vextracti64x4_high(vtmp1, src2); 2363 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2364 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2365 } 2366 2367 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2368 pshufd(vtmp2, src2, 0xE); 2369 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2370 movdq(vtmp1, src1); 2371 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2372 movdq(dst, vtmp1); 2373 } 2374 2375 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2376 vextracti128_high(vtmp1, src2); 2377 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2378 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2379 } 2380 2381 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2382 vextracti64x4_high(vtmp2, src2); 2383 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2384 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2385 } 2386 2387 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2388 mov64(temp, -1L); 2389 bzhiq(temp, temp, len); 2390 kmovql(dst, temp); 2391 } 2392 2393 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2394 reduce_operation_128(T_FLOAT, opcode, dst, src); 2395 pshufd(vtmp, src, 0x1); 2396 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2397 } 2398 2399 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2400 reduce2F(opcode, dst, src, vtmp); 2401 pshufd(vtmp, src, 0x2); 2402 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2403 pshufd(vtmp, src, 0x3); 2404 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2405 } 2406 2407 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2408 reduce4F(opcode, dst, src, vtmp2); 2409 vextractf128_high(vtmp2, src); 2410 reduce4F(opcode, dst, vtmp2, vtmp1); 2411 } 2412 2413 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2414 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2415 vextracti64x4_high(vtmp1, src); 2416 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2417 } 2418 2419 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2420 pshufd(dst, src, 0x1); 2421 reduce_operation_128(T_FLOAT, opcode, dst, src); 2422 } 2423 2424 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2425 pshufd(vtmp, src, 0xE); 2426 
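// Note: unlike the ordered reduceF/reduceD paths above, which use scalar addss/mulss
// (addsd/mulsd), the unordered variants fold lanes pairwise with packed addps/mulps
// (addpd/mulpd); here the shuffle has brought elements 2..3 down so they are combined
// with elements 0..1.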
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2427 unorderedReduce2F(opcode, dst, vtmp); 2428 } 2429 2430 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2431 vextractf128_high(vtmp1, src); 2432 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2433 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2434 } 2435 2436 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2437 vextractf64x4_high(vtmp2, src); 2438 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2439 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2440 } 2441 2442 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2443 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2444 pshufd(vtmp, src, 0xE); 2445 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2446 } 2447 2448 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2449 reduce2D(opcode, dst, src, vtmp2); 2450 vextractf128_high(vtmp2, src); 2451 reduce2D(opcode, dst, vtmp2, vtmp1); 2452 } 2453 2454 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2455 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2456 vextracti64x4_high(vtmp1, src); 2457 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2458 } 2459 2460 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2461 pshufd(dst, src, 0xE); 2462 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2463 } 2464 2465 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2466 vextractf128_high(vtmp, src); 2467 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2468 unorderedReduce2D(opcode, dst, vtmp); 2469 } 2470 2471 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2472 vextractf64x4_high(vtmp2, src); 2473 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2474 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2475 } 2476 2477 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2478 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2479 } 2480 2481 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2482 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2483 } 2484 2485 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2486 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2487 } 2488 2489 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2490 int vec_enc) { 2491 switch(elem_bt) { 2492 case T_INT: 2493 case T_FLOAT: 2494 vmaskmovps(dst, src, mask, vec_enc); 2495 break; 2496 case T_LONG: 2497 case T_DOUBLE: 2498 vmaskmovpd(dst, src, mask, vec_enc); 2499 break; 2500 default: 2501 fatal("Unsupported type %s", type2name(elem_bt)); 2502 break; 2503 } 2504 } 2505 2506 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2507 int vec_enc) { 2508 switch(elem_bt) { 2509 case T_INT: 2510 case T_FLOAT: 2511 
vmaskmovps(dst, src, mask, vec_enc); 2512 break; 2513 case T_LONG: 2514 case T_DOUBLE: 2515 vmaskmovpd(dst, src, mask, vec_enc); 2516 break; 2517 default: 2518 fatal("Unsupported type %s", type2name(elem_bt)); 2519 break; 2520 } 2521 } 2522 2523 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2524 XMMRegister dst, XMMRegister src, 2525 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2526 XMMRegister xmm_0, XMMRegister xmm_1) { 2527 const int permconst[] = {1, 14}; 2528 XMMRegister wsrc = src; 2529 XMMRegister wdst = xmm_0; 2530 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2531 2532 int vlen_enc = Assembler::AVX_128bit; 2533 if (vlen == 16) { 2534 vlen_enc = Assembler::AVX_256bit; 2535 } 2536 2537 for (int i = log2(vlen) - 1; i >=0; i--) { 2538 if (i == 0 && !is_dst_valid) { 2539 wdst = dst; 2540 } 2541 if (i == 3) { 2542 vextracti64x4_high(wtmp, wsrc); 2543 } else if (i == 2) { 2544 vextracti128_high(wtmp, wsrc); 2545 } else { // i = [0,1] 2546 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2547 } 2548 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2549 wsrc = wdst; 2550 vlen_enc = Assembler::AVX_128bit; 2551 } 2552 if (is_dst_valid) { 2553 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2554 } 2555 } 2556 2557 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2558 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2559 XMMRegister xmm_0, XMMRegister xmm_1) { 2560 XMMRegister wsrc = src; 2561 XMMRegister wdst = xmm_0; 2562 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2563 int vlen_enc = Assembler::AVX_128bit; 2564 if (vlen == 8) { 2565 vlen_enc = Assembler::AVX_256bit; 2566 } 2567 for (int i = log2(vlen) - 1; i >=0; i--) { 2568 if (i == 0 && !is_dst_valid) { 2569 wdst = dst; 2570 } 2571 if (i == 1) { 2572 vextracti128_high(wtmp, wsrc); 2573 } else if (i == 2) { 2574 vextracti64x4_high(wtmp, wsrc); 2575 } else { 2576 assert(i == 0, "%d", i); 2577 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2578 } 2579 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2580 wsrc = wdst; 2581 vlen_enc = Assembler::AVX_128bit; 2582 } 2583 if (is_dst_valid) { 2584 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2585 } 2586 } 2587 2588 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2589 switch (bt) { 2590 case T_BYTE: pextrb(dst, src, idx); break; 2591 case T_SHORT: pextrw(dst, src, idx); break; 2592 case T_INT: pextrd(dst, src, idx); break; 2593 case T_LONG: pextrq(dst, src, idx); break; 2594 2595 default: 2596 assert(false,"Should not reach here."); 2597 break; 2598 } 2599 } 2600 2601 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2602 int esize = type2aelembytes(typ); 2603 int elem_per_lane = 16/esize; 2604 int lane = elemindex / elem_per_lane; 2605 int eindex = elemindex % elem_per_lane; 2606 2607 if (lane >= 2) { 2608 assert(UseAVX > 2, "required"); 2609 vextractf32x4(dst, src, lane & 3); 2610 return dst; 2611 } else if (lane > 0) { 2612 assert(UseAVX > 0, "required"); 2613 vextractf128(dst, src, lane); 2614 return dst; 2615 } else { 2616 return src; 2617 } 2618 } 2619 2620 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2621 if (typ == T_BYTE) { 2622 movsbl(dst, dst); 2623 } else if (typ == T_SHORT) { 2624 movswl(dst, dst); 2625 } 2626 } 2627 2628 void 
C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2629 int esize = type2aelembytes(typ); 2630 int elem_per_lane = 16/esize; 2631 int eindex = elemindex % elem_per_lane; 2632 assert(is_integral_type(typ),"required"); 2633 2634 if (eindex == 0) { 2635 if (typ == T_LONG) { 2636 movq(dst, src); 2637 } else { 2638 movdl(dst, src); 2639 movsxl(typ, dst); 2640 } 2641 } else { 2642 extract(typ, dst, src, eindex); 2643 movsxl(typ, dst); 2644 } 2645 } 2646 2647 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2648 int esize = type2aelembytes(typ); 2649 int elem_per_lane = 16/esize; 2650 int eindex = elemindex % elem_per_lane; 2651 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2652 2653 if (eindex == 0) { 2654 movq(dst, src); 2655 } else { 2656 if (typ == T_FLOAT) { 2657 if (UseAVX == 0) { 2658 movdqu(dst, src); 2659 shufps(dst, dst, eindex); 2660 } else { 2661 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2662 } 2663 } else { 2664 if (UseAVX == 0) { 2665 movdqu(dst, src); 2666 psrldq(dst, eindex*esize); 2667 } else { 2668 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2669 } 2670 movq(dst, dst); 2671 } 2672 } 2673 // Zero upper bits 2674 if (typ == T_FLOAT) { 2675 if (UseAVX == 0) { 2676 assert(vtmp != xnoreg, "required."); 2677 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2678 pand(dst, vtmp); 2679 } else { 2680 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2681 } 2682 } 2683 } 2684 2685 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2686 switch(typ) { 2687 case T_BYTE: 2688 case T_BOOLEAN: 2689 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2690 break; 2691 case T_SHORT: 2692 case T_CHAR: 2693 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2694 break; 2695 case T_INT: 2696 case T_FLOAT: 2697 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2698 break; 2699 case T_LONG: 2700 case T_DOUBLE: 2701 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2702 break; 2703 default: 2704 assert(false,"Should not reach here."); 2705 break; 2706 } 2707 } 2708 2709 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2710 assert(rscratch != noreg || always_reachable(src2), "missing"); 2711 2712 switch(typ) { 2713 case T_BOOLEAN: 2714 case T_BYTE: 2715 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2716 break; 2717 case T_CHAR: 2718 case T_SHORT: 2719 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2720 break; 2721 case T_INT: 2722 case T_FLOAT: 2723 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2724 break; 2725 case T_LONG: 2726 case T_DOUBLE: 2727 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2728 break; 2729 default: 2730 assert(false,"Should not reach here."); 2731 break; 2732 } 2733 } 2734 2735 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2736 switch(typ) { 2737 case T_BYTE: 2738 
evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2739 break; 2740 case T_SHORT: 2741 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2742 break; 2743 case T_INT: 2744 case T_FLOAT: 2745 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2746 break; 2747 case T_LONG: 2748 case T_DOUBLE: 2749 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2750 break; 2751 default: 2752 assert(false,"Should not reach here."); 2753 break; 2754 } 2755 } 2756 2757 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2758 assert(vlen_in_bytes <= 32, ""); 2759 int esize = type2aelembytes(bt); 2760 if (vlen_in_bytes == 32) { 2761 assert(vtmp == xnoreg, "required."); 2762 if (esize >= 4) { 2763 vtestps(src1, src2, AVX_256bit); 2764 } else { 2765 vptest(src1, src2, AVX_256bit); 2766 } 2767 return; 2768 } 2769 if (vlen_in_bytes < 16) { 2770 // Duplicate the lower part to fill the whole register, 2771 // Don't need to do so for src2 2772 assert(vtmp != xnoreg, "required"); 2773 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2774 pshufd(vtmp, src1, shuffle_imm); 2775 } else { 2776 assert(vtmp == xnoreg, "required"); 2777 vtmp = src1; 2778 } 2779 if (esize >= 4 && VM_Version::supports_avx()) { 2780 vtestps(vtmp, src2, AVX_128bit); 2781 } else { 2782 ptest(vtmp, src2); 2783 } 2784 } 2785 2786 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2787 #ifdef ASSERT 2788 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2789 bool is_bw_supported = VM_Version::supports_avx512bw(); 2790 if (is_bw && !is_bw_supported) { 2791 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2792 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2793 "XMM register should be 0-15"); 2794 } 2795 #endif // ASSERT 2796 switch (elem_bt) { 2797 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2798 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2799 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2800 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2801 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2802 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2803 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2804 } 2805 } 2806 2807 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2808 assert(UseAVX >= 2, "required"); 2809 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2810 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2811 if ((UseAVX > 2) && 2812 (!is_bw || VM_Version::supports_avx512bw()) && 2813 (!is_vl || VM_Version::supports_avx512vl())) { 2814 switch (elem_bt) { 2815 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2816 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2817 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2818 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2819 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2820 } 2821 } else { 2822 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2823 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2824 switch (elem_bt) { 2825 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2826 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2827 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2828 
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2829 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2830 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2831 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2832 } 2833 } 2834 } 2835 2836 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2837 switch (to_elem_bt) { 2838 case T_SHORT: 2839 vpmovsxbw(dst, src, vlen_enc); 2840 break; 2841 case T_INT: 2842 vpmovsxbd(dst, src, vlen_enc); 2843 break; 2844 case T_FLOAT: 2845 vpmovsxbd(dst, src, vlen_enc); 2846 vcvtdq2ps(dst, dst, vlen_enc); 2847 break; 2848 case T_LONG: 2849 vpmovsxbq(dst, src, vlen_enc); 2850 break; 2851 case T_DOUBLE: { 2852 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2853 vpmovsxbd(dst, src, mid_vlen_enc); 2854 vcvtdq2pd(dst, dst, vlen_enc); 2855 break; 2856 } 2857 default: 2858 fatal("Unsupported type %s", type2name(to_elem_bt)); 2859 break; 2860 } 2861 } 2862 2863 //------------------------------------------------------------------------------------------- 2864 2865 // IndexOf for constant substrings with size >= 8 chars 2866 // which don't need to be loaded through stack. 2867 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2868 Register cnt1, Register cnt2, 2869 int int_cnt2, Register result, 2870 XMMRegister vec, Register tmp, 2871 int ae) { 2872 ShortBranchVerifier sbv(this); 2873 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2874 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2875 2876 // This method uses the pcmpestri instruction with bound registers 2877 // inputs: 2878 // xmm - substring 2879 // rax - substring length (elements count) 2880 // mem - scanned string 2881 // rdx - string length (elements count) 2882 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2883 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2884 // outputs: 2885 // rcx - matched index in string 2886 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2887 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2888 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2889 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2890 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2891 2892 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2893 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2894 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2895 2896 // Note, inline_string_indexOf() generates checks: 2897 // if (substr.count > string.count) return -1; 2898 // if (substr.count == 0) return 0; 2899 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2900 2901 // Load substring. 
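// Note: for ae == StrIntrinsicNode::UL the scanned string is UTF-16 while the substring
// is Latin-1, so the substring bytes are zero-extended to 16-bit chars (pmovzxbw) before
// the pcmpestri compares; for LL and UU both sides share an encoding and are loaded as-is.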
2902 if (ae == StrIntrinsicNode::UL) { 2903 pmovzxbw(vec, Address(str2, 0)); 2904 } else { 2905 movdqu(vec, Address(str2, 0)); 2906 } 2907 movl(cnt2, int_cnt2); 2908 movptr(result, str1); // string addr 2909 2910 if (int_cnt2 > stride) { 2911 jmpb(SCAN_TO_SUBSTR); 2912 2913 // Reload substr for rescan; this code 2914 // is executed only for large substrings (> 8 chars) 2915 bind(RELOAD_SUBSTR); 2916 if (ae == StrIntrinsicNode::UL) { 2917 pmovzxbw(vec, Address(str2, 0)); 2918 } else { 2919 movdqu(vec, Address(str2, 0)); 2920 } 2921 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2922 2923 bind(RELOAD_STR); 2924 // We came here after the beginning of the substring was 2925 // matched but the rest of it was not, so we need to search 2926 // again. Start from the next element after the previous match. 2927 2928 // cnt2 is number of substring remaining elements and 2929 // cnt1 is number of string remaining elements when cmp failed. 2930 // Restore cnt1 = cnt1 - cnt2 + int_cnt2 2931 subl(cnt1, cnt2); 2932 addl(cnt1, int_cnt2); 2933 movl(cnt2, int_cnt2); // Now restore cnt2 2934 2935 decrementl(cnt1); // Shift to next element 2936 cmpl(cnt1, cnt2); 2937 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2938 2939 addptr(result, (1<<scale1)); 2940 2941 } // (int_cnt2 > 8) 2942 2943 // Scan string for start of substr in 16-byte vectors 2944 bind(SCAN_TO_SUBSTR); 2945 pcmpestri(vec, Address(result, 0), mode); 2946 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2947 subl(cnt1, stride); 2948 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2949 cmpl(cnt1, cnt2); 2950 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2951 addptr(result, 16); 2952 jmpb(SCAN_TO_SUBSTR); 2953 2954 // Found a potential substr 2955 bind(FOUND_CANDIDATE); 2956 // Matched whole vector if first element matched (tmp(rcx) == 0). 2957 if (int_cnt2 == stride) { 2958 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2959 } else { // int_cnt2 > 8 2960 jccb(Assembler::overflow, FOUND_SUBSTR); 2961 } 2962 // After pcmpestri tmp(rcx) contains matched element index 2963 // Compute start addr of substr 2964 lea(result, Address(result, tmp, scale1)); 2965 2966 // Make sure string is still long enough 2967 subl(cnt1, tmp); 2968 cmpl(cnt1, cnt2); 2969 if (int_cnt2 == stride) { 2970 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2971 } else { // int_cnt2 > 8 2972 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2973 } 2974 // Left less than substring. 2975 2976 bind(RET_NOT_FOUND); 2977 movl(result, -1); 2978 jmp(EXIT); 2979 2980 if (int_cnt2 > stride) { 2981 // This code is optimized for the case when whole substring 2982 // is matched if its head is matched. 2983 bind(MATCH_SUBSTR_HEAD); 2984 pcmpestri(vec, Address(result, 0), mode); 2985 // Reload only the string if it does not match 2986 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2987 2988 Label CONT_SCAN_SUBSTR; 2989 // Compare the rest of substring (> 8 chars). 2990 bind(FOUND_SUBSTR); 2991 // First 8 chars are already matched.
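// Note: cnt2 is negated below so it can act as a negative offset counting up from the
// end of the substring; the SCAN_SUBSTR loop advances it by 'stride' per 16-byte compare
// and keeps iterating while it is still negative, i.e. while part of the substring
// remains unchecked.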
2992 negptr(cnt2); 2993 addptr(cnt2, stride); 2994 2995 bind(SCAN_SUBSTR); 2996 subl(cnt1, stride); 2997 cmpl(cnt2, -stride); // Do not read beyond substring 2998 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2999 // Back-up strings to avoid reading beyond substring: 3000 // cnt1 = cnt1 - cnt2 + 8 3001 addl(cnt1, cnt2); // cnt2 is negative 3002 addl(cnt1, stride); 3003 movl(cnt2, stride); negptr(cnt2); 3004 bind(CONT_SCAN_SUBSTR); 3005 if (int_cnt2 < (int)G) { 3006 int tail_off1 = int_cnt2<<scale1; 3007 int tail_off2 = int_cnt2<<scale2; 3008 if (ae == StrIntrinsicNode::UL) { 3009 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 3010 } else { 3011 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 3012 } 3013 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 3014 } else { 3015 // calculate index in register to avoid integer overflow (int_cnt2*2) 3016 movl(tmp, int_cnt2); 3017 addptr(tmp, cnt2); 3018 if (ae == StrIntrinsicNode::UL) { 3019 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3020 } else { 3021 movdqu(vec, Address(str2, tmp, scale2, 0)); 3022 } 3023 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3024 } 3025 // Need to reload strings pointers if not matched whole vector 3026 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3027 addptr(cnt2, stride); 3028 jcc(Assembler::negative, SCAN_SUBSTR); 3029 // Fall through if found full substring 3030 3031 } // (int_cnt2 > 8) 3032 3033 bind(RET_FOUND); 3034 // Found result if we matched full small substring. 3035 // Compute substr offset 3036 subptr(result, str1); 3037 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3038 shrl(result, 1); // index 3039 } 3040 bind(EXIT); 3041 3042 } // string_indexofC8 3043 3044 // Small strings are loaded through stack if they cross page boundary. 3045 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3046 Register cnt1, Register cnt2, 3047 int int_cnt2, Register result, 3048 XMMRegister vec, Register tmp, 3049 int ae) { 3050 ShortBranchVerifier sbv(this); 3051 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3052 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3053 3054 // 3055 // int_cnt2 is length of small (< 8 chars) constant substring 3056 // or (-1) for non constant substring in which case its length 3057 // is in cnt2 register. 3058 // 3059 // Note, inline_string_indexOf() generates checks: 3060 // if (substr.count > string.count) return -1; 3061 // if (substr.count == 0) return 0; 3062 // 3063 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3064 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3065 // This method uses the pcmpestri instruction with bound registers 3066 // inputs: 3067 // xmm - substring 3068 // rax - substring length (elements count) 3069 // mem - scanned string 3070 // rdx - string length (elements count) 3071 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3072 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3073 // outputs: 3074 // rcx - matched index in string 3075 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3076 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3077 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3078 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3079 3080 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3081 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3082 FOUND_CANDIDATE; 3083 3084 { //======================================================== 3085 // We don't know where these strings are located 3086 // and we can't read beyond them. Load them through stack. 3087 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3088 3089 movptr(tmp, rsp); // save old SP 3090 3091 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3092 if (int_cnt2 == (1>>scale2)) { // One byte 3093 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3094 load_unsigned_byte(result, Address(str2, 0)); 3095 movdl(vec, result); // move 32 bits 3096 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3097 // Not enough header space in 32-bit VM: 12+3 = 15. 3098 movl(result, Address(str2, -1)); 3099 shrl(result, 8); 3100 movdl(vec, result); // move 32 bits 3101 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3102 load_unsigned_short(result, Address(str2, 0)); 3103 movdl(vec, result); // move 32 bits 3104 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3105 movdl(vec, Address(str2, 0)); // move 32 bits 3106 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3107 movq(vec, Address(str2, 0)); // move 64 bits 3108 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3109 // Array header size is 12 bytes in 32-bit VM 3110 // + 6 bytes for 3 chars == 18 bytes, 3111 // enough space to load vec and shift. 3112 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3113 if (ae == StrIntrinsicNode::UL) { 3114 int tail_off = int_cnt2-8; 3115 pmovzxbw(vec, Address(str2, tail_off)); 3116 psrldq(vec, -2*tail_off); 3117 } 3118 else { 3119 int tail_off = int_cnt2*(1<<scale2); 3120 movdqu(vec, Address(str2, tail_off-16)); 3121 psrldq(vec, 16-tail_off); 3122 } 3123 } 3124 } else { // not constant substring 3125 cmpl(cnt2, stride); 3126 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3127 3128 // We can read beyond the string if str+16 does not cross a page boundary 3129 // since heaps are aligned and mapped by pages. 3130 assert(os::vm_page_size() < (int)G, "default page should be small"); 3131 movl(result, str2); // We need only low 32 bits 3132 andl(result, ((int)os::vm_page_size()-1)); 3133 cmpl(result, ((int)os::vm_page_size()-16)); 3134 jccb(Assembler::belowEqual, CHECK_STR); 3135 3136 // Move small strings to stack to allow loading 16 bytes into vec. 3137 subptr(rsp, 16); 3138 int stk_offset = wordSize-(1<<scale2); 3139 push(cnt2); 3140 3141 bind(COPY_SUBSTR); 3142 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3143 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3144 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3145 } else if (ae == StrIntrinsicNode::UU) { 3146 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3147 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3148 } 3149 decrement(cnt2); 3150 jccb(Assembler::notZero, COPY_SUBSTR); 3151 3152 pop(cnt2); 3153 movptr(str2, rsp); // New substring address 3154 } // non constant 3155 3156 bind(CHECK_STR); 3157 cmpl(cnt1, stride); 3158 jccb(Assembler::aboveEqual, BIG_STRINGS); 3159 3160 // Check whether the load would cross a page boundary.
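// For example (assuming a 4 KiB page): a 16-byte read starting at an in-page offset
// <= 0xff0 cannot spill into the next page, so the string is scanned in place; only
// when the start lies within the last 16 bytes of a page is it copied to the stack below.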
3161 movl(result, str1); // We need only low 32 bits 3162 andl(result, ((int)os::vm_page_size()-1)); 3163 cmpl(result, ((int)os::vm_page_size()-16)); 3164 jccb(Assembler::belowEqual, BIG_STRINGS); 3165 3166 subptr(rsp, 16); 3167 int stk_offset = -(1<<scale1); 3168 if (int_cnt2 < 0) { // not constant 3169 push(cnt2); 3170 stk_offset += wordSize; 3171 } 3172 movl(cnt2, cnt1); 3173 3174 bind(COPY_STR); 3175 if (ae == StrIntrinsicNode::LL) { 3176 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3177 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3178 } else { 3179 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3180 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3181 } 3182 decrement(cnt2); 3183 jccb(Assembler::notZero, COPY_STR); 3184 3185 if (int_cnt2 < 0) { // not constant 3186 pop(cnt2); 3187 } 3188 movptr(str1, rsp); // New string address 3189 3190 bind(BIG_STRINGS); 3191 // Load substring. 3192 if (int_cnt2 < 0) { // -1 3193 if (ae == StrIntrinsicNode::UL) { 3194 pmovzxbw(vec, Address(str2, 0)); 3195 } else { 3196 movdqu(vec, Address(str2, 0)); 3197 } 3198 push(cnt2); // substr count 3199 push(str2); // substr addr 3200 push(str1); // string addr 3201 } else { 3202 // Small (< 8 chars) constant substrings are loaded already. 3203 movl(cnt2, int_cnt2); 3204 } 3205 push(tmp); // original SP 3206 3207 } // Finished loading 3208 3209 //======================================================== 3210 // Start search 3211 // 3212 3213 movptr(result, str1); // string addr 3214 3215 if (int_cnt2 < 0) { // Only for non constant substring 3216 jmpb(SCAN_TO_SUBSTR); 3217 3218 // SP saved at sp+0 3219 // String saved at sp+1*wordSize 3220 // Substr saved at sp+2*wordSize 3221 // Substr count saved at sp+3*wordSize 3222 3223 // Reload substr for rescan, this code 3224 // is executed only for large substrings (> 8 chars) 3225 bind(RELOAD_SUBSTR); 3226 movptr(str2, Address(rsp, 2*wordSize)); 3227 movl(cnt2, Address(rsp, 3*wordSize)); 3228 if (ae == StrIntrinsicNode::UL) { 3229 pmovzxbw(vec, Address(str2, 0)); 3230 } else { 3231 movdqu(vec, Address(str2, 0)); 3232 } 3233 // We came here after the beginning of the substring was 3234 // matched but the rest of it was not so we need to search 3235 // again. Start from the next element after the previous match. 3236 subptr(str1, result); // Restore counter 3237 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3238 shrl(str1, 1); 3239 } 3240 addl(cnt1, str1); 3241 decrementl(cnt1); // Shift to next element 3242 cmpl(cnt1, cnt2); 3243 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3244 3245 addptr(result, (1<<scale1)); 3246 } // non constant 3247 3248 // Scan string for start of substr in 16-byte vectors 3249 bind(SCAN_TO_SUBSTR); 3250 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3251 pcmpestri(vec, Address(result, 0), mode); 3252 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3253 subl(cnt1, stride); 3254 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3255 cmpl(cnt1, cnt2); 3256 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3257 addptr(result, 16); 3258 3259 bind(ADJUST_STR); 3260 cmpl(cnt1, stride); // Do not read beyond string 3261 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3262 // Back-up string to avoid reading beyond string. 
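  // Fewer than a full vector of characters is left: re-position 'result' so
  // that the final 16-byte load ends exactly at the end of the string
  // (result += cnt1*element_size - 16), then rescan one full-width vector.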
3263 lea(result, Address(result, cnt1, scale1, -16)); 3264 movl(cnt1, stride); 3265 jmpb(SCAN_TO_SUBSTR); 3266 3267 // Found a potential substr 3268 bind(FOUND_CANDIDATE); 3269 // After pcmpestri tmp(rcx) contains matched element index 3270 3271 // Make sure string is still long enough 3272 subl(cnt1, tmp); 3273 cmpl(cnt1, cnt2); 3274 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3275 // Left less than substring. 3276 3277 bind(RET_NOT_FOUND); 3278 movl(result, -1); 3279 jmp(CLEANUP); 3280 3281 bind(FOUND_SUBSTR); 3282 // Compute start addr of substr 3283 lea(result, Address(result, tmp, scale1)); 3284 if (int_cnt2 > 0) { // Constant substring 3285 // Repeat search for small substring (< 8 chars) 3286 // from new point without reloading substring. 3287 // Have to check that we don't read beyond string. 3288 cmpl(tmp, stride-int_cnt2); 3289 jccb(Assembler::greater, ADJUST_STR); 3290 // Fall through if matched whole substring. 3291 } else { // non constant 3292 assert(int_cnt2 == -1, "should be != 0"); 3293 3294 addl(tmp, cnt2); 3295 // Found result if we matched whole substring. 3296 cmpl(tmp, stride); 3297 jcc(Assembler::lessEqual, RET_FOUND); 3298 3299 // Repeat search for small substring (<= 8 chars) 3300 // from new point 'str1' without reloading substring. 3301 cmpl(cnt2, stride); 3302 // Have to check that we don't read beyond string. 3303 jccb(Assembler::lessEqual, ADJUST_STR); 3304 3305 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3306 // Compare the rest of substring (> 8 chars). 3307 movptr(str1, result); 3308 3309 cmpl(tmp, cnt2); 3310 // First 8 chars are already matched. 3311 jccb(Assembler::equal, CHECK_NEXT); 3312 3313 bind(SCAN_SUBSTR); 3314 pcmpestri(vec, Address(str1, 0), mode); 3315 // Need to reload string pointers if not matched whole vector 3316 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3317 3318 bind(CHECK_NEXT); 3319 subl(cnt2, stride); 3320 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3321 addptr(str1, 16); 3322 if (ae == StrIntrinsicNode::UL) { 3323 addptr(str2, 8); 3324 } else { 3325 addptr(str2, 16); 3326 } 3327 subl(cnt1, stride); 3328 cmpl(cnt2, stride); // Do not read beyond substring 3329 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3330 // Back-up strings to avoid reading beyond substring.
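  // Back str1/str2 up so that the last vector load (8 bytes of Latin-1 for UL,
  // 16 bytes otherwise) ends exactly at the end of the substring, then adjust
  // the counters and rescan one full vector.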
3331 3332 if (ae == StrIntrinsicNode::UL) { 3333 lea(str2, Address(str2, cnt2, scale2, -8)); 3334 lea(str1, Address(str1, cnt2, scale1, -16)); 3335 } else { 3336 lea(str2, Address(str2, cnt2, scale2, -16)); 3337 lea(str1, Address(str1, cnt2, scale1, -16)); 3338 } 3339 subl(cnt1, cnt2); 3340 movl(cnt2, stride); 3341 addl(cnt1, stride); 3342 bind(CONT_SCAN_SUBSTR); 3343 if (ae == StrIntrinsicNode::UL) { 3344 pmovzxbw(vec, Address(str2, 0)); 3345 } else { 3346 movdqu(vec, Address(str2, 0)); 3347 } 3348 jmp(SCAN_SUBSTR); 3349 3350 bind(RET_FOUND_LONG); 3351 movptr(str1, Address(rsp, wordSize)); 3352 } // non constant 3353 3354 bind(RET_FOUND); 3355 // Compute substr offset 3356 subptr(result, str1); 3357 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3358 shrl(result, 1); // index 3359 } 3360 bind(CLEANUP); 3361 pop(rsp); // restore SP 3362 3363 } // string_indexof 3364 3365 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3366 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3367 ShortBranchVerifier sbv(this); 3368 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3369 3370 int stride = 8; 3371 3372 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3373 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3374 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3375 FOUND_SEQ_CHAR, DONE_LABEL; 3376 3377 movptr(result, str1); 3378 if (UseAVX >= 2) { 3379 cmpl(cnt1, stride); 3380 jcc(Assembler::less, SCAN_TO_CHAR); 3381 cmpl(cnt1, 2*stride); 3382 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3383 movdl(vec1, ch); 3384 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3385 vpxor(vec2, vec2); 3386 movl(tmp, cnt1); 3387 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3388 andl(cnt1,0x0000000F); //tail count (in chars) 3389 3390 bind(SCAN_TO_16_CHAR_LOOP); 3391 vmovdqu(vec3, Address(result, 0)); 3392 vpcmpeqw(vec3, vec3, vec1, 1); 3393 vptest(vec2, vec3); 3394 jcc(Assembler::carryClear, FOUND_CHAR); 3395 addptr(result, 32); 3396 subl(tmp, 2*stride); 3397 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3398 jmp(SCAN_TO_8_CHAR); 3399 bind(SCAN_TO_8_CHAR_INIT); 3400 movdl(vec1, ch); 3401 pshuflw(vec1, vec1, 0x00); 3402 pshufd(vec1, vec1, 0); 3403 pxor(vec2, vec2); 3404 } 3405 bind(SCAN_TO_8_CHAR); 3406 cmpl(cnt1, stride); 3407 jcc(Assembler::less, SCAN_TO_CHAR); 3408 if (UseAVX < 2) { 3409 movdl(vec1, ch); 3410 pshuflw(vec1, vec1, 0x00); 3411 pshufd(vec1, vec1, 0); 3412 pxor(vec2, vec2); 3413 } 3414 movl(tmp, cnt1); 3415 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3416 andl(cnt1,0x00000007); //tail count (in chars) 3417 3418 bind(SCAN_TO_8_CHAR_LOOP); 3419 movdqu(vec3, Address(result, 0)); 3420 pcmpeqw(vec3, vec1); 3421 ptest(vec2, vec3); 3422 jcc(Assembler::carryClear, FOUND_CHAR); 3423 addptr(result, 16); 3424 subl(tmp, stride); 3425 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3426 bind(SCAN_TO_CHAR); 3427 testl(cnt1, cnt1); 3428 jcc(Assembler::zero, RET_NOT_FOUND); 3429 bind(SCAN_TO_CHAR_LOOP); 3430 load_unsigned_short(tmp, Address(result, 0)); 3431 cmpl(ch, tmp); 3432 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3433 addptr(result, 2); 3434 subl(cnt1, 1); 3435 jccb(Assembler::zero, RET_NOT_FOUND); 3436 jmp(SCAN_TO_CHAR_LOOP); 3437 3438 bind(RET_NOT_FOUND); 3439 movl(result, -1); 3440 jmpb(DONE_LABEL); 3441 3442 bind(FOUND_CHAR); 3443 if (UseAVX >= 2) { 3444 vpmovmskb(tmp, vec3); 3445 } else { 3446 pmovmskb(tmp, vec3); 3447 } 3448 bsfl(ch, tmp); 3449 addptr(result, ch); 3450 3451 bind(FOUND_SEQ_CHAR); 3452 
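  // 'result' points at the matching char: subtract the base address and halve
  // the byte offset to obtain the UTF-16 element index.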
subptr(result, str1); 3453 shrl(result, 1); 3454 3455 bind(DONE_LABEL); 3456 } // string_indexof_char 3457 3458 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3459 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3460 ShortBranchVerifier sbv(this); 3461 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3462 3463 int stride = 16; 3464 3465 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3466 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3467 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3468 FOUND_SEQ_CHAR, DONE_LABEL; 3469 3470 movptr(result, str1); 3471 if (UseAVX >= 2) { 3472 cmpl(cnt1, stride); 3473 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3474 cmpl(cnt1, stride*2); 3475 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3476 movdl(vec1, ch); 3477 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3478 vpxor(vec2, vec2); 3479 movl(tmp, cnt1); 3480 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3481 andl(cnt1,0x0000001F); //tail count (in chars) 3482 3483 bind(SCAN_TO_32_CHAR_LOOP); 3484 vmovdqu(vec3, Address(result, 0)); 3485 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3486 vptest(vec2, vec3); 3487 jcc(Assembler::carryClear, FOUND_CHAR); 3488 addptr(result, 32); 3489 subl(tmp, stride*2); 3490 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3491 jmp(SCAN_TO_16_CHAR); 3492 3493 bind(SCAN_TO_16_CHAR_INIT); 3494 movdl(vec1, ch); 3495 pxor(vec2, vec2); 3496 pshufb(vec1, vec2); 3497 } 3498 3499 bind(SCAN_TO_16_CHAR); 3500 cmpl(cnt1, stride); 3501 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3502 if (UseAVX < 2) { 3503 movdl(vec1, ch); 3504 pxor(vec2, vec2); 3505 pshufb(vec1, vec2); 3506 } 3507 movl(tmp, cnt1); 3508 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3509 andl(cnt1,0x0000000F); //tail count (in bytes) 3510 3511 bind(SCAN_TO_16_CHAR_LOOP); 3512 movdqu(vec3, Address(result, 0)); 3513 pcmpeqb(vec3, vec1); 3514 ptest(vec2, vec3); 3515 jcc(Assembler::carryClear, FOUND_CHAR); 3516 addptr(result, 16); 3517 subl(tmp, stride); 3518 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
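  // Fewer than 16 bytes remain; finish with a simple byte-at-a-time scan.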
3519 3520 bind(SCAN_TO_CHAR_INIT); 3521 testl(cnt1, cnt1); 3522 jcc(Assembler::zero, RET_NOT_FOUND); 3523 bind(SCAN_TO_CHAR_LOOP); 3524 load_unsigned_byte(tmp, Address(result, 0)); 3525 cmpl(ch, tmp); 3526 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3527 addptr(result, 1); 3528 subl(cnt1, 1); 3529 jccb(Assembler::zero, RET_NOT_FOUND); 3530 jmp(SCAN_TO_CHAR_LOOP); 3531 3532 bind(RET_NOT_FOUND); 3533 movl(result, -1); 3534 jmpb(DONE_LABEL); 3535 3536 bind(FOUND_CHAR); 3537 if (UseAVX >= 2) { 3538 vpmovmskb(tmp, vec3); 3539 } else { 3540 pmovmskb(tmp, vec3); 3541 } 3542 bsfl(ch, tmp); 3543 addptr(result, ch); 3544 3545 bind(FOUND_SEQ_CHAR); 3546 subptr(result, str1); 3547 3548 bind(DONE_LABEL); 3549 } // stringL_indexof_char 3550 3551 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3552 switch (eltype) { 3553 case T_BOOLEAN: return sizeof(jboolean); 3554 case T_BYTE: return sizeof(jbyte); 3555 case T_SHORT: return sizeof(jshort); 3556 case T_CHAR: return sizeof(jchar); 3557 case T_INT: return sizeof(jint); 3558 default: 3559 ShouldNotReachHere(); 3560 return -1; 3561 } 3562 } 3563 3564 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3565 switch (eltype) { 3566 // T_BOOLEAN used as surrogate for unsigned byte 3567 case T_BOOLEAN: movzbl(dst, src); break; 3568 case T_BYTE: movsbl(dst, src); break; 3569 case T_SHORT: movswl(dst, src); break; 3570 case T_CHAR: movzwl(dst, src); break; 3571 case T_INT: movl(dst, src); break; 3572 default: 3573 ShouldNotReachHere(); 3574 } 3575 } 3576 3577 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3578 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3579 } 3580 3581 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3582 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3583 } 3584 3585 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3586 const int vlen = Assembler::AVX_256bit; 3587 switch (eltype) { 3588 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3589 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3590 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3591 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3592 case T_INT: 3593 // do nothing 3594 break; 3595 default: 3596 ShouldNotReachHere(); 3597 } 3598 } 3599 3600 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3601 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3602 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3603 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3604 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3605 BasicType eltype) { 3606 ShortBranchVerifier sbv(this); 3607 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3608 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3609 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3610 3611 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3612 SHORT_UNROLLED_LOOP_EXIT, 3613 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3614 UNROLLED_VECTOR_LOOP_BEGIN, 3615 END; 3616 switch (eltype) { 3617 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3618 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3619 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3620 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3621 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3622 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3623 } 3624 3625 // For "renaming" for readibility of the code 3626 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3627 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3628 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3629 3630 const int elsize = arrays_hashcode_elsize(eltype); 3631 3632 /* 3633 if (cnt1 >= 2) { 3634 if (cnt1 >= 32) { 3635 UNROLLED VECTOR LOOP 3636 } 3637 UNROLLED SCALAR LOOP 3638 } 3639 SINGLE SCALAR 3640 */ 3641 3642 cmpl(cnt1, 32); 3643 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3644 3645 // cnt1 >= 32 && generate_vectorized_loop 3646 xorl(index, index); 3647 3648 // vresult = IntVector.zero(I256); 3649 for (int idx = 0; idx < 4; idx++) { 3650 vpxor(vresult[idx], vresult[idx]); 3651 } 3652 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3653 Register bound = tmp2; 3654 Register next = tmp3; 3655 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3656 movl(next, Address(tmp2, 0)); 3657 movdl(vnext, next); 3658 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3659 3660 // index = 0; 3661 // bound = cnt1 & ~(32 - 1); 3662 movl(bound, cnt1); 3663 andl(bound, ~(32 - 1)); 3664 // for (; index < bound; index += 32) { 3665 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3666 // result *= next; 3667 imull(result, next); 3668 // loop fission to upfront the cost of fetching from memory, OOO execution 3669 // can then hopefully do a better job of prefetching 3670 for (int idx = 0; idx < 4; idx++) { 3671 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3672 } 3673 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3674 for (int idx = 0; idx < 4; idx++) { 3675 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3676 arrays_hashcode_elvcast(vtmp[idx], eltype); 3677 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3678 } 3679 // index += 32; 3680 addl(index, 32); 3681 // index < bound; 3682 cmpl(index, bound); 3683 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3684 // } 3685 3686 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3687 subl(cnt1, bound); 3688 // release bound 3689 3690 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3691 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3692 for (int idx = 0; idx < 4; idx++) { 3693 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT); 3694 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3695 } 3696 // result += vresult.reduceLanes(ADD); 3697 for (int idx = 0; idx < 4; idx++) { 3698 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3699 } 3700 3701 // } else if (cnt1 < 32) { 3702 3703 bind(SHORT_UNROLLED_BEGIN); 3704 // int i = 1; 3705 movl(index, 1); 3706 cmpl(index, cnt1); 3707 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3708 3709 // for (; i < cnt1 ; i += 2) { 3710 bind(SHORT_UNROLLED_LOOP_BEGIN); 3711 movl(tmp3, 961); 3712 imull(result, tmp3); 
3713 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3714 movl(tmp3, tmp2); 3715 shll(tmp3, 5); 3716 subl(tmp3, tmp2); 3717 addl(result, tmp3); 3718 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3719 addl(result, tmp3); 3720 addl(index, 2); 3721 cmpl(index, cnt1); 3722 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3723 3724 // } 3725 // if (i >= cnt1) { 3726 bind(SHORT_UNROLLED_LOOP_EXIT); 3727 jccb(Assembler::greater, END); 3728 movl(tmp2, result); 3729 shll(result, 5); 3730 subl(result, tmp2); 3731 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3732 addl(result, tmp3); 3733 // } 3734 bind(END); 3735 3736 BLOCK_COMMENT("} // arrays_hashcode"); 3737 3738 } // arrays_hashcode 3739 3740 // helper function for string_compare 3741 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3742 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3743 Address::ScaleFactor scale2, Register index, int ae) { 3744 if (ae == StrIntrinsicNode::LL) { 3745 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3746 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3747 } else if (ae == StrIntrinsicNode::UU) { 3748 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3749 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3750 } else { 3751 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3752 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3753 } 3754 } 3755 3756 // Compare strings, used for char[] and byte[]. 3757 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3758 Register cnt1, Register cnt2, Register result, 3759 XMMRegister vec1, int ae, KRegister mask) { 3760 ShortBranchVerifier sbv(this); 3761 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3762 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3763 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3764 int stride2x2 = 0x40; 3765 Address::ScaleFactor scale = Address::no_scale; 3766 Address::ScaleFactor scale1 = Address::no_scale; 3767 Address::ScaleFactor scale2 = Address::no_scale; 3768 3769 if (ae != StrIntrinsicNode::LL) { 3770 stride2x2 = 0x20; 3771 } 3772 3773 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3774 shrl(cnt2, 1); 3775 } 3776 // Compute the minimum of the string lengths and the 3777 // difference of the string lengths (stack). 3778 // Do the conditional move stuff 3779 movl(result, cnt1); 3780 subl(cnt1, cnt2); 3781 push(cnt1); 3782 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3783 3784 // Is the minimum length zero? 
3785 testl(cnt2, cnt2); 3786 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3787 if (ae == StrIntrinsicNode::LL) { 3788 // Load first bytes 3789 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3790 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3791 } else if (ae == StrIntrinsicNode::UU) { 3792 // Load first characters 3793 load_unsigned_short(result, Address(str1, 0)); 3794 load_unsigned_short(cnt1, Address(str2, 0)); 3795 } else { 3796 load_unsigned_byte(result, Address(str1, 0)); 3797 load_unsigned_short(cnt1, Address(str2, 0)); 3798 } 3799 subl(result, cnt1); 3800 jcc(Assembler::notZero, POP_LABEL); 3801 3802 if (ae == StrIntrinsicNode::UU) { 3803 // Divide length by 2 to get number of chars 3804 shrl(cnt2, 1); 3805 } 3806 cmpl(cnt2, 1); 3807 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3808 3809 // Check if the strings start at the same location and setup scale and stride 3810 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3811 cmpptr(str1, str2); 3812 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3813 if (ae == StrIntrinsicNode::LL) { 3814 scale = Address::times_1; 3815 stride = 16; 3816 } else { 3817 scale = Address::times_2; 3818 stride = 8; 3819 } 3820 } else { 3821 scale1 = Address::times_1; 3822 scale2 = Address::times_2; 3823 // scale not used 3824 stride = 8; 3825 } 3826 3827 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3828 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3829 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3830 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3831 Label COMPARE_TAIL_LONG; 3832 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3833 3834 int pcmpmask = 0x19; 3835 if (ae == StrIntrinsicNode::LL) { 3836 pcmpmask &= ~0x01; 3837 } 3838 3839 // Setup to compare 16-chars (32-bytes) vectors, 3840 // start from first character again because it has aligned address. 3841 if (ae == StrIntrinsicNode::LL) { 3842 stride2 = 32; 3843 } else { 3844 stride2 = 16; 3845 } 3846 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3847 adr_stride = stride << scale; 3848 } else { 3849 adr_stride1 = 8; //stride << scale1; 3850 adr_stride2 = 16; //stride << scale2; 3851 } 3852 3853 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3854 // rax and rdx are used by pcmpestri as elements counters 3855 movl(result, cnt2); 3856 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3857 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3858 3859 // fast path : compare first 2 8-char vectors. 
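  // pcmpestri with pcmpmask performs an 'equal each' comparison with negated
  // result, so CF == 1 ('below') means some element pair differs and
  // cnt1 (rcx) receives the index of the first mismatching element.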
3860 bind(COMPARE_16_CHARS); 3861 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3862 movdqu(vec1, Address(str1, 0)); 3863 } else { 3864 pmovzxbw(vec1, Address(str1, 0)); 3865 } 3866 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3867 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3868 3869 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3870 movdqu(vec1, Address(str1, adr_stride)); 3871 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3872 } else { 3873 pmovzxbw(vec1, Address(str1, adr_stride1)); 3874 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3875 } 3876 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3877 addl(cnt1, stride); 3878 3879 // Compare the characters at index in cnt1 3880 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3881 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3882 subl(result, cnt2); 3883 jmp(POP_LABEL); 3884 3885 // Setup the registers to start vector comparison loop 3886 bind(COMPARE_WIDE_VECTORS); 3887 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3888 lea(str1, Address(str1, result, scale)); 3889 lea(str2, Address(str2, result, scale)); 3890 } else { 3891 lea(str1, Address(str1, result, scale1)); 3892 lea(str2, Address(str2, result, scale2)); 3893 } 3894 subl(result, stride2); 3895 subl(cnt2, stride2); 3896 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3897 negptr(result); 3898 3899 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3900 bind(COMPARE_WIDE_VECTORS_LOOP); 3901 3902 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3903 cmpl(cnt2, stride2x2); 3904 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3905 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3906 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3907 3908 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3909 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3910 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3911 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3912 } else { 3913 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3914 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3915 } 3916 kortestql(mask, mask); 3917 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3918 addptr(result, stride2x2); // update since we already compared at this addr 3919 subl(cnt2, stride2x2); // and sub the size too 3920 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3921 3922 vpxor(vec1, vec1); 3923 jmpb(COMPARE_WIDE_TAIL); 3924 }//if (VM_Version::supports_avx512vlbw()) 3925 3926 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3927 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3928 vmovdqu(vec1, Address(str1, result, scale)); 3929 vpxor(vec1, Address(str2, result, scale)); 3930 } else { 3931 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3932 vpxor(vec1, Address(str2, result, scale2)); 3933 } 3934 vptest(vec1, vec1); 3935 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3936 addptr(result, stride2); 3937 subl(cnt2, stride2); 3938 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3939 // clean upper bits of YMM registers 3940 vpxor(vec1, vec1); 3941 3942 // compare 
wide vectors tail 3943 bind(COMPARE_WIDE_TAIL); 3944 testptr(result, result); 3945 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3946 3947 movl(result, stride2); 3948 movl(cnt2, result); 3949 negptr(result); 3950 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3951 3952 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3953 bind(VECTOR_NOT_EQUAL); 3954 // clean upper bits of YMM registers 3955 vpxor(vec1, vec1); 3956 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3957 lea(str1, Address(str1, result, scale)); 3958 lea(str2, Address(str2, result, scale)); 3959 } else { 3960 lea(str1, Address(str1, result, scale1)); 3961 lea(str2, Address(str2, result, scale2)); 3962 } 3963 jmp(COMPARE_16_CHARS); 3964 3965 // Compare tail chars, length between 1 to 15 chars 3966 bind(COMPARE_TAIL_LONG); 3967 movl(cnt2, result); 3968 cmpl(cnt2, stride); 3969 jcc(Assembler::less, COMPARE_SMALL_STR); 3970 3971 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3972 movdqu(vec1, Address(str1, 0)); 3973 } else { 3974 pmovzxbw(vec1, Address(str1, 0)); 3975 } 3976 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3977 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3978 subptr(cnt2, stride); 3979 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3980 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3981 lea(str1, Address(str1, result, scale)); 3982 lea(str2, Address(str2, result, scale)); 3983 } else { 3984 lea(str1, Address(str1, result, scale1)); 3985 lea(str2, Address(str2, result, scale2)); 3986 } 3987 negptr(cnt2); 3988 jmpb(WHILE_HEAD_LABEL); 3989 3990 bind(COMPARE_SMALL_STR); 3991 } else if (UseSSE42Intrinsics) { 3992 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3993 int pcmpmask = 0x19; 3994 // Setup to compare 8-char (16-byte) vectors, 3995 // start from first character again because it has aligned address. 
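  // 'result' keeps the full minimum length while cnt2 is rounded down to a
  // whole number of vector strides; COMPARE_TAIL below falls back to the
  // scalar element-by-element loop when not even one full vector fits.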
3996 movl(result, cnt2); 3997 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3998 if (ae == StrIntrinsicNode::LL) { 3999 pcmpmask &= ~0x01; 4000 } 4001 jcc(Assembler::zero, COMPARE_TAIL); 4002 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4003 lea(str1, Address(str1, result, scale)); 4004 lea(str2, Address(str2, result, scale)); 4005 } else { 4006 lea(str1, Address(str1, result, scale1)); 4007 lea(str2, Address(str2, result, scale2)); 4008 } 4009 negptr(result); 4010 4011 // pcmpestri 4012 // inputs: 4013 // vec1- substring 4014 // rax - negative string length (elements count) 4015 // mem - scanned string 4016 // rdx - string length (elements count) 4017 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4018 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4019 // outputs: 4020 // rcx - first mismatched element index 4021 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4022 4023 bind(COMPARE_WIDE_VECTORS); 4024 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4025 movdqu(vec1, Address(str1, result, scale)); 4026 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4027 } else { 4028 pmovzxbw(vec1, Address(str1, result, scale1)); 4029 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4030 } 4031 // After pcmpestri cnt1(rcx) contains mismatched element index 4032 4033 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4034 addptr(result, stride); 4035 subptr(cnt2, stride); 4036 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4037 4038 // compare wide vectors tail 4039 testptr(result, result); 4040 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4041 4042 movl(cnt2, stride); 4043 movl(result, stride); 4044 negptr(result); 4045 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4046 movdqu(vec1, Address(str1, result, scale)); 4047 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4048 } else { 4049 pmovzxbw(vec1, Address(str1, result, scale1)); 4050 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4051 } 4052 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4053 4054 // Mismatched characters in the vectors 4055 bind(VECTOR_NOT_EQUAL); 4056 addptr(cnt1, result); 4057 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4058 subl(result, cnt2); 4059 jmpb(POP_LABEL); 4060 4061 bind(COMPARE_TAIL); // limit is zero 4062 movl(cnt2, result); 4063 // Fallthru to tail compare 4064 } 4065 // Shift str2 and str1 to the end of the arrays, negate min 4066 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4067 lea(str1, Address(str1, cnt2, scale)); 4068 lea(str2, Address(str2, cnt2, scale)); 4069 } else { 4070 lea(str1, Address(str1, cnt2, scale1)); 4071 lea(str2, Address(str2, cnt2, scale2)); 4072 } 4073 decrementl(cnt2); // first character was compared already 4074 negptr(cnt2); 4075 4076 // Compare the rest of the elements 4077 bind(WHILE_HEAD_LABEL); 4078 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4079 subl(result, cnt1); 4080 jccb(Assembler::notZero, POP_LABEL); 4081 increment(cnt2); 4082 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4083 4084 // Strings are equal up to min length. Return the length difference. 
4085 bind(LENGTH_DIFF_LABEL); 4086 pop(result); 4087 if (ae == StrIntrinsicNode::UU) { 4088 // Divide diff by 2 to get number of chars 4089 sarl(result, 1); 4090 } 4091 jmpb(DONE_LABEL); 4092 4093 if (VM_Version::supports_avx512vlbw()) { 4094 4095 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4096 4097 kmovql(cnt1, mask); 4098 notq(cnt1); 4099 bsfq(cnt2, cnt1); 4100 if (ae != StrIntrinsicNode::LL) { 4101 // Divide diff by 2 to get number of chars 4102 sarl(cnt2, 1); 4103 } 4104 addq(result, cnt2); 4105 if (ae == StrIntrinsicNode::LL) { 4106 load_unsigned_byte(cnt1, Address(str2, result)); 4107 load_unsigned_byte(result, Address(str1, result)); 4108 } else if (ae == StrIntrinsicNode::UU) { 4109 load_unsigned_short(cnt1, Address(str2, result, scale)); 4110 load_unsigned_short(result, Address(str1, result, scale)); 4111 } else { 4112 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4113 load_unsigned_byte(result, Address(str1, result, scale1)); 4114 } 4115 subl(result, cnt1); 4116 jmpb(POP_LABEL); 4117 }//if (VM_Version::supports_avx512vlbw()) 4118 4119 // Discard the stored length difference 4120 bind(POP_LABEL); 4121 pop(cnt1); 4122 4123 // That's it 4124 bind(DONE_LABEL); 4125 if(ae == StrIntrinsicNode::UL) { 4126 negl(result); 4127 } 4128 4129 } 4130 4131 // Search for Non-ASCII character (Negative byte value) in a byte array, 4132 // return the index of the first such character, otherwise the length 4133 // of the array segment searched. 4134 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4135 // @IntrinsicCandidate 4136 // public static int countPositives(byte[] ba, int off, int len) { 4137 // for (int i = off; i < off + len; i++) { 4138 // if (ba[i] < 0) { 4139 // return i - off; 4140 // } 4141 // } 4142 // return len; 4143 // } 4144 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4145 Register result, Register tmp1, 4146 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4147 // rsi: byte array 4148 // rcx: len 4149 // rax: result 4150 ShortBranchVerifier sbv(this); 4151 assert_different_registers(ary1, len, result, tmp1); 4152 assert_different_registers(vec1, vec2); 4153 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4154 4155 movl(result, len); // copy 4156 // len == 0 4157 testl(len, len); 4158 jcc(Assembler::zero, DONE); 4159 4160 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4161 VM_Version::supports_avx512vlbw() && 4162 VM_Version::supports_bmi2()) { 4163 4164 Label test_64_loop, test_tail, BREAK_LOOP; 4165 movl(tmp1, len); 4166 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4167 4168 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4169 andl(len, 0xffffffc0); // vector count (in chars) 4170 jccb(Assembler::zero, test_tail); 4171 4172 lea(ary1, Address(ary1, len, Address::times_1)); 4173 negptr(len); 4174 4175 bind(test_64_loop); 4176 // Check whether our 64 elements of size byte contain negatives 4177 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4178 kortestql(mask1, mask1); 4179 jcc(Assembler::notZero, BREAK_LOOP); 4180 4181 addptr(len, 64); 4182 jccb(Assembler::notZero, test_64_loop); 4183 4184 bind(test_tail); 4185 // bail out when there is nothing to be done 4186 testl(tmp1, -1); 4187 jcc(Assembler::zero, DONE); 4188 4189 4190 // check the tail for absense of negatives 4191 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4192 { 4193 Register tmp3_aliased = len; 4194 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4195 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4196 notq(tmp3_aliased); 4197 kmovql(mask2, tmp3_aliased); 4198 } 4199 4200 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4201 ktestq(mask1, mask2); 4202 jcc(Assembler::zero, DONE); 4203 4204 // do a full check for negative registers in the tail 4205 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4206 // ary1 already pointing to the right place 4207 jmpb(TAIL_START); 4208 4209 bind(BREAK_LOOP); 4210 // At least one byte in the last 64 byte block was negative. 4211 // Set up to look at the last 64 bytes as if they were a tail 4212 lea(ary1, Address(ary1, len, Address::times_1)); 4213 addptr(result, len); 4214 // Ignore the very last byte: if all others are positive, 4215 // it must be negative, so we can skip right to the 2+1 byte 4216 // end comparison at this point 4217 orl(result, 63); 4218 movl(len, 63); 4219 // Fallthru to tail compare 4220 } else { 4221 4222 if (UseAVX >= 2) { 4223 // With AVX2, use 32-byte vector compare 4224 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4225 4226 // Compare 32-byte vectors 4227 testl(len, 0xffffffe0); // vector count (in bytes) 4228 jccb(Assembler::zero, TAIL_START); 4229 4230 andl(len, 0xffffffe0); 4231 lea(ary1, Address(ary1, len, Address::times_1)); 4232 negptr(len); 4233 4234 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4235 movdl(vec2, tmp1); 4236 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4237 4238 bind(COMPARE_WIDE_VECTORS); 4239 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4240 vptest(vec1, vec2); 4241 jccb(Assembler::notZero, BREAK_LOOP); 4242 addptr(len, 32); 4243 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4244 4245 testl(result, 0x0000001f); // any bytes remaining? 4246 jcc(Assembler::zero, DONE); 4247 4248 // Quick test using the already prepared vector mask 4249 movl(len, result); 4250 andl(len, 0x0000001f); 4251 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4252 vptest(vec1, vec2); 4253 jcc(Assembler::zero, DONE); 4254 // There are zeros, jump to the tail to determine exactly where 4255 jmpb(TAIL_START); 4256 4257 bind(BREAK_LOOP); 4258 // At least one byte in the last 32-byte vector is negative. 4259 // Set up to look at the last 32 bytes as if they were a tail 4260 lea(ary1, Address(ary1, len, Address::times_1)); 4261 addptr(result, len); 4262 // Ignore the very last byte: if all others are positive, 4263 // it must be negative, so we can skip right to the 2+1 byte 4264 // end comparison at this point 4265 orl(result, 31); 4266 movl(len, 31); 4267 // Fallthru to tail compare 4268 } else if (UseSSE42Intrinsics) { 4269 // With SSE4.2, use double quad vector compare 4270 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4271 4272 // Compare 16-byte vectors 4273 testl(len, 0xfffffff0); // vector count (in bytes) 4274 jcc(Assembler::zero, TAIL_START); 4275 4276 andl(len, 0xfffffff0); 4277 lea(ary1, Address(ary1, len, Address::times_1)); 4278 negptr(len); 4279 4280 movl(tmp1, 0x80808080); 4281 movdl(vec2, tmp1); 4282 pshufd(vec2, vec2, 0); 4283 4284 bind(COMPARE_WIDE_VECTORS); 4285 movdqu(vec1, Address(ary1, len, Address::times_1)); 4286 ptest(vec1, vec2); 4287 jccb(Assembler::notZero, BREAK_LOOP); 4288 addptr(len, 16); 4289 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4290 4291 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4292 jcc(Assembler::zero, DONE); 4293 4294 // Quick test using the already prepared vector mask 4295 movl(len, result); 4296 andl(len, 0x0000000f); // tail count (in bytes) 4297 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4298 ptest(vec1, vec2); 4299 jcc(Assembler::zero, DONE); 4300 jmpb(TAIL_START); 4301 4302 bind(BREAK_LOOP); 4303 // At least one byte in the last 16-byte vector is negative. 4304 // Set up and look at the last 16 bytes as if they were a tail 4305 lea(ary1, Address(ary1, len, Address::times_1)); 4306 addptr(result, len); 4307 // Ignore the very last byte: if all others are positive, 4308 // it must be negative, so we can skip right to the 2+1 byte 4309 // end comparison at this point 4310 orl(result, 15); 4311 movl(len, 15); 4312 // Fallthru to tail compare 4313 } 4314 } 4315 4316 bind(TAIL_START); 4317 // Compare 4-byte vectors 4318 andl(len, 0xfffffffc); // vector count (in bytes) 4319 jccb(Assembler::zero, COMPARE_CHAR); 4320 4321 lea(ary1, Address(ary1, len, Address::times_1)); 4322 negptr(len); 4323 4324 bind(COMPARE_VECTORS); 4325 movl(tmp1, Address(ary1, len, Address::times_1)); 4326 andl(tmp1, 0x80808080); 4327 jccb(Assembler::notZero, TAIL_ADJUST); 4328 addptr(len, 4); 4329 jccb(Assembler::notZero, COMPARE_VECTORS); 4330 4331 // Compare trailing char (final 2-3 bytes), if any 4332 bind(COMPARE_CHAR); 4333 4334 testl(result, 0x2); // tail char 4335 jccb(Assembler::zero, COMPARE_BYTE); 4336 load_unsigned_short(tmp1, Address(ary1, 0)); 4337 andl(tmp1, 0x00008080); 4338 jccb(Assembler::notZero, CHAR_ADJUST); 4339 lea(ary1, Address(ary1, 2)); 4340 4341 bind(COMPARE_BYTE); 4342 testl(result, 0x1); // tail byte 4343 jccb(Assembler::zero, DONE); 4344 load_unsigned_byte(tmp1, Address(ary1, 0)); 4345 testl(tmp1, 0x00000080); 4346 jccb(Assembler::zero, DONE); 4347 subptr(result, 1); 4348 jmpb(DONE); 4349 4350 bind(TAIL_ADJUST); 4351 // there are negative bits in the last 4 byte block. 4352 // Adjust result and check the next three bytes 4353 addptr(result, len); 4354 orl(result, 3); 4355 lea(ary1, Address(ary1, len, Address::times_1)); 4356 jmpb(COMPARE_CHAR); 4357 4358 bind(CHAR_ADJUST); 4359 // We are looking at a char + optional byte tail, and found that one 4360 // of the bytes in the char is negative. Adjust the result, check the 4361 // first byte and readjust if needed. 4362 andl(result, 0xfffffffc); 4363 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4364 jccb(Assembler::notZero, DONE); 4365 addptr(result, 1); 4366 4367 // That's it 4368 bind(DONE); 4369 if (UseAVX >= 2) { 4370 // clean upper bits of YMM registers 4371 vpxor(vec1, vec1); 4372 vpxor(vec2, vec2); 4373 } 4374 } 4375 4376 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4377 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4378 Register limit, Register result, Register chr, 4379 XMMRegister vec1, XMMRegister vec2, bool is_char, 4380 KRegister mask, bool expand_ary2) { 4381 // for expand_ary2, limit is the (smaller) size of the second array. 4382 ShortBranchVerifier sbv(this); 4383 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4384 4385 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4386 "Expansion only implemented for AVX2"); 4387 4388 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4389 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4390 4391 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4392 int scaleIncr = expand_ary2 ? 8 : 16; 4393 4394 if (is_array_equ) { 4395 // Check the input args 4396 cmpoop(ary1, ary2); 4397 jcc(Assembler::equal, TRUE_LABEL); 4398 4399 // Need additional checks for arrays_equals. 4400 testptr(ary1, ary1); 4401 jcc(Assembler::zero, FALSE_LABEL); 4402 testptr(ary2, ary2); 4403 jcc(Assembler::zero, FALSE_LABEL); 4404 4405 // Check the lengths 4406 movl(limit, Address(ary1, length_offset)); 4407 cmpl(limit, Address(ary2, length_offset)); 4408 jcc(Assembler::notEqual, FALSE_LABEL); 4409 } 4410 4411 // count == 0 4412 testl(limit, limit); 4413 jcc(Assembler::zero, TRUE_LABEL); 4414 4415 if (is_array_equ) { 4416 // Load array address 4417 lea(ary1, Address(ary1, base_offset)); 4418 lea(ary2, Address(ary2, base_offset)); 4419 } 4420 4421 if (is_array_equ && is_char) { 4422 // arrays_equals when used for char[]. 4423 shll(limit, 1); // byte count != 0 4424 } 4425 movl(result, limit); // copy 4426 4427 if (UseAVX >= 2) { 4428 // With AVX2, use 32-byte vector compare 4429 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4430 4431 // Compare 32-byte vectors 4432 if (expand_ary2) { 4433 andl(result, 0x0000000f); // tail count (in bytes) 4434 andl(limit, 0xfffffff0); // vector count (in bytes) 4435 jcc(Assembler::zero, COMPARE_TAIL); 4436 } else { 4437 andl(result, 0x0000001f); // tail count (in bytes) 4438 andl(limit, 0xffffffe0); // vector count (in bytes) 4439 jcc(Assembler::zero, COMPARE_TAIL_16); 4440 } 4441 4442 lea(ary1, Address(ary1, limit, scaleFactor)); 4443 lea(ary2, Address(ary2, limit, Address::times_1)); 4444 negptr(limit); 4445 4446 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4447 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4448 4449 cmpl(limit, -64); 4450 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4451 4452 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4453 4454 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4455 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4456 kortestql(mask, mask); 4457 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4458 addptr(limit, 64); // update since we already compared at this addr 4459 cmpl(limit, -64); 4460 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4461 4462 // At this point we may still need to compare -limit+result bytes. 4463 // We could execute the next two instruction and just continue via non-wide path: 4464 // cmpl(limit, 0); 4465 // jcc(Assembler::equal, COMPARE_TAIL); // true 4466 // But since we stopped at the points ary{1,2}+limit which are 4467 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4468 // (|limit| <= 32 and result < 32), 4469 // we may just compare the last 64 bytes. 
4470 // 4471 addptr(result, -64); // it is safe, bc we just came from this area 4472 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4473 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4474 kortestql(mask, mask); 4475 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4476 4477 jmp(TRUE_LABEL); 4478 4479 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4480 4481 }//if (VM_Version::supports_avx512vlbw()) 4482 4483 bind(COMPARE_WIDE_VECTORS); 4484 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4485 if (expand_ary2) { 4486 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4487 } else { 4488 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4489 } 4490 vpxor(vec1, vec2); 4491 4492 vptest(vec1, vec1); 4493 jcc(Assembler::notZero, FALSE_LABEL); 4494 addptr(limit, scaleIncr * 2); 4495 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4496 4497 testl(result, result); 4498 jcc(Assembler::zero, TRUE_LABEL); 4499 4500 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4501 if (expand_ary2) { 4502 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4503 } else { 4504 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4505 } 4506 vpxor(vec1, vec2); 4507 4508 vptest(vec1, vec1); 4509 jcc(Assembler::notZero, FALSE_LABEL); 4510 jmp(TRUE_LABEL); 4511 4512 bind(COMPARE_TAIL_16); // limit is zero 4513 movl(limit, result); 4514 4515 // Compare 16-byte chunks 4516 andl(result, 0x0000000f); // tail count (in bytes) 4517 andl(limit, 0xfffffff0); // vector count (in bytes) 4518 jcc(Assembler::zero, COMPARE_TAIL); 4519 4520 lea(ary1, Address(ary1, limit, scaleFactor)); 4521 lea(ary2, Address(ary2, limit, Address::times_1)); 4522 negptr(limit); 4523 4524 bind(COMPARE_WIDE_VECTORS_16); 4525 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4526 if (expand_ary2) { 4527 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4528 } else { 4529 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4530 } 4531 pxor(vec1, vec2); 4532 4533 ptest(vec1, vec1); 4534 jcc(Assembler::notZero, FALSE_LABEL); 4535 addptr(limit, scaleIncr); 4536 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4537 4538 bind(COMPARE_TAIL); // limit is zero 4539 movl(limit, result); 4540 // Fallthru to tail compare 4541 } else if (UseSSE42Intrinsics) { 4542 // With SSE4.2, use double quad vector compare 4543 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4544 4545 // Compare 16-byte vectors 4546 andl(result, 0x0000000f); // tail count (in bytes) 4547 andl(limit, 0xfffffff0); // vector count (in bytes) 4548 jcc(Assembler::zero, COMPARE_TAIL); 4549 4550 lea(ary1, Address(ary1, limit, Address::times_1)); 4551 lea(ary2, Address(ary2, limit, Address::times_1)); 4552 negptr(limit); 4553 4554 bind(COMPARE_WIDE_VECTORS); 4555 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4556 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4557 pxor(vec1, vec2); 4558 4559 ptest(vec1, vec1); 4560 jcc(Assembler::notZero, FALSE_LABEL); 4561 addptr(limit, 16); 4562 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4563 4564 testl(result, result); 4565 jcc(Assembler::zero, TRUE_LABEL); 4566 4567 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4568 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4569 pxor(vec1, vec2); 4570 4571 ptest(vec1, vec1); 4572 jccb(Assembler::notZero, FALSE_LABEL); 4573 jmpb(TRUE_LABEL); 4574 4575 bind(COMPARE_TAIL); // limit is zero 4576 movl(limit, 
result); 4577 // Fallthru to tail compare 4578 } 4579 4580 // Compare 4-byte vectors 4581 if (expand_ary2) { 4582 testl(result, result); 4583 jccb(Assembler::zero, TRUE_LABEL); 4584 } else { 4585 andl(limit, 0xfffffffc); // vector count (in bytes) 4586 jccb(Assembler::zero, COMPARE_CHAR); 4587 } 4588 4589 lea(ary1, Address(ary1, limit, scaleFactor)); 4590 lea(ary2, Address(ary2, limit, Address::times_1)); 4591 negptr(limit); 4592 4593 bind(COMPARE_VECTORS); 4594 if (expand_ary2) { 4595 // There are no "vector" operations for bytes to shorts 4596 movzbl(chr, Address(ary2, limit, Address::times_1)); 4597 cmpw(Address(ary1, limit, Address::times_2), chr); 4598 jccb(Assembler::notEqual, FALSE_LABEL); 4599 addptr(limit, 1); 4600 jcc(Assembler::notZero, COMPARE_VECTORS); 4601 jmp(TRUE_LABEL); 4602 } else { 4603 movl(chr, Address(ary1, limit, Address::times_1)); 4604 cmpl(chr, Address(ary2, limit, Address::times_1)); 4605 jccb(Assembler::notEqual, FALSE_LABEL); 4606 addptr(limit, 4); 4607 jcc(Assembler::notZero, COMPARE_VECTORS); 4608 } 4609 4610 // Compare trailing char (final 2 bytes), if any 4611 bind(COMPARE_CHAR); 4612 testl(result, 0x2); // tail char 4613 jccb(Assembler::zero, COMPARE_BYTE); 4614 load_unsigned_short(chr, Address(ary1, 0)); 4615 load_unsigned_short(limit, Address(ary2, 0)); 4616 cmpl(chr, limit); 4617 jccb(Assembler::notEqual, FALSE_LABEL); 4618 4619 if (is_array_equ && is_char) { 4620 bind(COMPARE_BYTE); 4621 } else { 4622 lea(ary1, Address(ary1, 2)); 4623 lea(ary2, Address(ary2, 2)); 4624 4625 bind(COMPARE_BYTE); 4626 testl(result, 0x1); // tail byte 4627 jccb(Assembler::zero, TRUE_LABEL); 4628 load_unsigned_byte(chr, Address(ary1, 0)); 4629 load_unsigned_byte(limit, Address(ary2, 0)); 4630 cmpl(chr, limit); 4631 jccb(Assembler::notEqual, FALSE_LABEL); 4632 } 4633 bind(TRUE_LABEL); 4634 movl(result, 1); // return true 4635 jmpb(DONE); 4636 4637 bind(FALSE_LABEL); 4638 xorl(result, result); // return false 4639 4640 // That's it 4641 bind(DONE); 4642 if (UseAVX >= 2) { 4643 // clean upper bits of YMM registers 4644 vpxor(vec1, vec1); 4645 vpxor(vec2, vec2); 4646 } 4647 } 4648 4649 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4650 #define __ masm. 
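  // Slow path taken when the fast cvttss2si/cvttsd2si produced the integer
  // indefinite sentinel: spill the FP operand to the stack, call the fixup stub
  // (which is expected to overwrite the spilled slot with the corrected integer
  // result), and pop that value into dst.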
4651 Register dst = stub.data<0>(); 4652 XMMRegister src = stub.data<1>(); 4653 address target = stub.data<2>(); 4654 __ bind(stub.entry()); 4655 __ subptr(rsp, 8); 4656 __ movdbl(Address(rsp), src); 4657 __ call(RuntimeAddress(target)); 4658 __ pop(dst); 4659 __ jmp(stub.continuation()); 4660 #undef __ 4661 } 4662 4663 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4664 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4665 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4666 4667 address slowpath_target; 4668 if (dst_bt == T_INT) { 4669 if (src_bt == T_FLOAT) { 4670 cvttss2sil(dst, src); 4671 cmpl(dst, 0x80000000); 4672 slowpath_target = StubRoutines::x86::f2i_fixup(); 4673 } else { 4674 cvttsd2sil(dst, src); 4675 cmpl(dst, 0x80000000); 4676 slowpath_target = StubRoutines::x86::d2i_fixup(); 4677 } 4678 } else { 4679 if (src_bt == T_FLOAT) { 4680 cvttss2siq(dst, src); 4681 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4682 slowpath_target = StubRoutines::x86::f2l_fixup(); 4683 } else { 4684 cvttsd2siq(dst, src); 4685 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4686 slowpath_target = StubRoutines::x86::d2l_fixup(); 4687 } 4688 } 4689 4690 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4691 jcc(Assembler::equal, stub->entry()); 4692 bind(stub->continuation()); 4693 } 4694 4695 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4696 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4697 switch(ideal_opc) { 4698 case Op_LShiftVS: 4699 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4700 case Op_LShiftVI: 4701 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4702 case Op_LShiftVL: 4703 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4704 case Op_RShiftVS: 4705 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4706 case Op_RShiftVI: 4707 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4708 case Op_RShiftVL: 4709 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4710 case Op_URShiftVS: 4711 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4712 case Op_URShiftVI: 4713 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4714 case Op_URShiftVL: 4715 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4716 case Op_RotateRightV: 4717 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4718 case Op_RotateLeftV: 4719 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4720 default: 4721 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4722 break; 4723 } 4724 } 4725 4726 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4727 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4728 if (is_unsigned) { 4729 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4730 } else { 4731 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4732 } 4733 } 4734 4735 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4736 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4737 switch (elem_bt) { 4738 case T_BYTE: 4739 if (ideal_opc == Op_SaturatingAddV) { 
4740 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4741 } else { 4742 assert(ideal_opc == Op_SaturatingSubV, ""); 4743 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4744 } 4745 break; 4746 case T_SHORT: 4747 if (ideal_opc == Op_SaturatingAddV) { 4748 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4749 } else { 4750 assert(ideal_opc == Op_SaturatingSubV, ""); 4751 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4752 } 4753 break; 4754 default: 4755 fatal("Unsupported type %s", type2name(elem_bt)); 4756 break; 4757 } 4758 } 4759 4760 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4761 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4762 switch (elem_bt) { 4763 case T_BYTE: 4764 if (ideal_opc == Op_SaturatingAddV) { 4765 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4766 } else { 4767 assert(ideal_opc == Op_SaturatingSubV, ""); 4768 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4769 } 4770 break; 4771 case T_SHORT: 4772 if (ideal_opc == Op_SaturatingAddV) { 4773 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4774 } else { 4775 assert(ideal_opc == Op_SaturatingSubV, ""); 4776 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4777 } 4778 break; 4779 default: 4780 fatal("Unsupported type %s", type2name(elem_bt)); 4781 break; 4782 } 4783 } 4784 4785 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4786 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4787 if (is_unsigned) { 4788 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4789 } else { 4790 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4791 } 4792 } 4793 4794 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4795 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4796 switch (elem_bt) { 4797 case T_BYTE: 4798 if (ideal_opc == Op_SaturatingAddV) { 4799 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4800 } else { 4801 assert(ideal_opc == Op_SaturatingSubV, ""); 4802 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4803 } 4804 break; 4805 case T_SHORT: 4806 if (ideal_opc == Op_SaturatingAddV) { 4807 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4808 } else { 4809 assert(ideal_opc == Op_SaturatingSubV, ""); 4810 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4811 } 4812 break; 4813 default: 4814 fatal("Unsupported type %s", type2name(elem_bt)); 4815 break; 4816 } 4817 } 4818 4819 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4820 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4821 switch (elem_bt) { 4822 case T_BYTE: 4823 if (ideal_opc == Op_SaturatingAddV) { 4824 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4825 } else { 4826 assert(ideal_opc == Op_SaturatingSubV, ""); 4827 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4828 } 4829 break; 4830 case T_SHORT: 4831 if (ideal_opc == Op_SaturatingAddV) { 4832 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4833 } else { 4834 assert(ideal_opc == Op_SaturatingSubV, ""); 4835 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4836 } 4837 break; 4838 default: 4839 fatal("Unsupported type %s", type2name(elem_bt)); 4840 break; 4841 } 4842 } 4843 4844 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, 
KRegister mask, XMMRegister dst, 4845 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4846 bool is_varshift) { 4847 switch (ideal_opc) { 4848 case Op_AddVB: 4849 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_AddVS: 4851 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_AddVI: 4853 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_AddVL: 4855 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_AddVF: 4857 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_AddVD: 4859 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_SubVB: 4861 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_SubVS: 4863 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_SubVI: 4865 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_SubVL: 4867 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_SubVF: 4869 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_SubVD: 4871 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_MulVS: 4873 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4874 case Op_MulVI: 4875 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_MulVL: 4877 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_MulVF: 4879 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_MulVD: 4881 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_DivVF: 4883 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_DivVD: 4885 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_SqrtVF: 4887 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4888 case Op_SqrtVD: 4889 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4890 case Op_AbsVB: 4891 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4892 case Op_AbsVS: 4893 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4894 case Op_AbsVI: 4895 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4896 case Op_AbsVL: 4897 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4898 case Op_FmaVF: 4899 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4900 case Op_FmaVD: 4901 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4902 case Op_VectorRearrange: 4903 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4904 case Op_LShiftVS: 4905 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4906 case Op_LShiftVI: 4907 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4908 case Op_LShiftVL: 4909 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4910 case Op_RShiftVS: 4911 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4912 case Op_RShiftVI: 4913 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4914 case Op_RShiftVL: 4915 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4916 case Op_URShiftVS: 4917 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4918 case Op_URShiftVI: 4919 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4920 case Op_URShiftVL: 4921 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4922 case Op_RotateLeftV: 4923 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4924 case Op_RotateRightV: 4925 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4926 case Op_MaxV: 4927 evpmaxs(eType, dst, mask, src1, src2, 
merge, vlen_enc); break; 4928 case Op_MinV: 4929 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4930 case Op_UMinV: 4931 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4932 case Op_UMaxV: 4933 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4934 case Op_XorV: 4935 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4936 case Op_OrV: 4937 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4938 case Op_AndV: 4939 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4940 default: 4941 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4942 break; 4943 } 4944 } 4945 4946 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4947 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4948 switch (ideal_opc) { 4949 case Op_AddVB: 4950 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4951 case Op_AddVS: 4952 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_AddVI: 4954 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_AddVL: 4956 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4957 case Op_AddVF: 4958 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_AddVD: 4960 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_SubVB: 4962 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_SubVS: 4964 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_SubVI: 4966 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_SubVL: 4968 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4969 case Op_SubVF: 4970 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4971 case Op_SubVD: 4972 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4973 case Op_MulVS: 4974 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4975 case Op_MulVI: 4976 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4977 case Op_MulVL: 4978 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4979 case Op_MulVF: 4980 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4981 case Op_MulVD: 4982 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4983 case Op_DivVF: 4984 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4985 case Op_DivVD: 4986 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4987 case Op_FmaVF: 4988 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4989 case Op_FmaVD: 4990 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4991 case Op_MaxV: 4992 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4993 case Op_MinV: 4994 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4995 case Op_UMaxV: 4996 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4997 case Op_UMinV: 4998 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4999 case Op_XorV: 5000 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5001 case Op_OrV: 5002 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5003 case Op_AndV: 5004 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5005 default: 5006 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5007 break; 5008 } 5009 } 5010 5011 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5012 KRegister src1, KRegister src2) { 5013 BasicType etype = T_ILLEGAL; 5014 switch(mask_len) { 5015 case 2: 5016 case 4: 5017 case 8: etype = T_BYTE; break; 5018 case 16: etype = T_SHORT; 
break; 5019 case 32: etype = T_INT; break; 5020 case 64: etype = T_LONG; break; 5021 default: fatal("Unsupported type"); break; 5022 } 5023 assert(etype != T_ILLEGAL, ""); 5024 switch(ideal_opc) { 5025 case Op_AndVMask: 5026 kand(etype, dst, src1, src2); break; 5027 case Op_OrVMask: 5028 kor(etype, dst, src1, src2); break; 5029 case Op_XorVMask: 5030 kxor(etype, dst, src1, src2); break; 5031 default: 5032 fatal("Unsupported masked operation"); break; 5033 } 5034 } 5035 5036 /* 5037 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5038 * If src is NaN, the result is 0. 5039 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5040 * the result is equal to the value of Integer.MIN_VALUE. 5041 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5042 * the result is equal to the value of Integer.MAX_VALUE. 5043 */ 5044 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5045 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5046 Register rscratch, AddressLiteral float_sign_flip, 5047 int vec_enc) { 5048 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5049 Label done; 5050 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5051 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5052 vptest(xtmp2, xtmp2, vec_enc); 5053 jccb(Assembler::equal, done); 5054 5055 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5056 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5057 5058 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5059 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5060 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5061 5062 // Recompute the mask for remaining special value. 5063 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5064 // Extract SRC values corresponding to TRUE mask lanes. 5065 vpand(xtmp4, xtmp2, src, vec_enc); 5066 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5067 // values are set. 
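// (XORing the all-ones mask in xtmp2 with the extracted source values in xtmp4 flips the sign bit,
// so the MSB ends up set exactly on the non-NaN special lanes whose source is positive; the blend
// below then writes Integer.MAX_VALUE, held in xtmp1, into those lanes.)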
5068 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5069 5070 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5071 bind(done); 5072 } 5073 5074 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5075 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5076 Register rscratch, AddressLiteral float_sign_flip, 5077 int vec_enc) { 5078 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5079 Label done; 5080 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5081 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5082 kortestwl(ktmp1, ktmp1); 5083 jccb(Assembler::equal, done); 5084 5085 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5086 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5087 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5088 5089 kxorwl(ktmp1, ktmp1, ktmp2); 5090 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5091 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5092 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5093 bind(done); 5094 } 5095 5096 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5097 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5098 Register rscratch, AddressLiteral double_sign_flip, 5099 int vec_enc) { 5100 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5101 5102 Label done; 5103 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5104 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5105 kortestwl(ktmp1, ktmp1); 5106 jccb(Assembler::equal, done); 5107 5108 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5109 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5110 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5111 5112 kxorwl(ktmp1, ktmp1, ktmp2); 5113 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5114 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5115 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5116 bind(done); 5117 } 5118 5119 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5120 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5121 Register rscratch, AddressLiteral float_sign_flip, 5122 int vec_enc) { 5123 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5124 Label done; 5125 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5126 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5127 kortestwl(ktmp1, ktmp1); 5128 jccb(Assembler::equal, done); 5129 5130 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5131 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5132 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5133 5134 kxorwl(ktmp1, ktmp1, ktmp2); 5135 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5136 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5137 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5138 bind(done); 5139 } 5140 5141 /* 5142 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5143 * If src is NaN, the result is 0. 5144 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5145 * the result is equal to the value of Long.MIN_VALUE. 5146 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5147 * the result is equal to the value of Long.MAX_VALUE. 
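 * For example, mirroring Java's primitive narrowing rules: (long)Double.NaN == 0L,
 * (long)Double.NEGATIVE_INFINITY == Long.MIN_VALUE and (long)1.0e300 == Long.MAX_VALUE.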
5148 */ 5149 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5150 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5151 Register rscratch, AddressLiteral double_sign_flip, 5152 int vec_enc) { 5153 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5154 5155 Label done; 5156 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5157 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5158 kortestwl(ktmp1, ktmp1); 5159 jccb(Assembler::equal, done); 5160 5161 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5162 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5163 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5164 5165 kxorwl(ktmp1, ktmp1, ktmp2); 5166 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5167 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5168 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5169 bind(done); 5170 } 5171 5172 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5173 XMMRegister xtmp, int index, int vec_enc) { 5174 assert(vec_enc < Assembler::AVX_512bit, ""); 5175 if (vec_enc == Assembler::AVX_256bit) { 5176 vextractf128_high(xtmp, src); 5177 vshufps(dst, src, xtmp, index, vec_enc); 5178 } else { 5179 vshufps(dst, src, zero, index, vec_enc); 5180 } 5181 } 5182 5183 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5184 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5185 AddressLiteral float_sign_flip, int src_vec_enc) { 5186 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5187 5188 Label done; 5189 // Compare the destination lanes with float_sign_flip 5190 // value to get mask for all special values. 5191 movdqu(xtmp1, float_sign_flip, rscratch); 5192 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5193 ptest(xtmp2, xtmp2); 5194 jccb(Assembler::equal, done); 5195 5196 // Flip float_sign_flip to get max integer value. 5197 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5198 pxor(xtmp1, xtmp4); 5199 5200 // Set destination lanes corresponding to unordered source lanes to zero. 5201 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5202 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5203 5204 // Shuffle mask vector and pack the lower double word from each quadword lane. 5205 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5206 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5207 5208 // Recompute the mask for remaining special value. 5209 pxor(xtmp2, xtmp3); 5210 // Extract mask corresponding to non-negative source lanes. 5211 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5212 5213 // Shuffle mask vector and pack the lower double word from each quadword lane. 5214 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5215 pand(xtmp3, xtmp2); 5216 5217 // Replace destination lanes holding the special value (0x80000000) with max int 5218 // if corresponding source lane holds a +ve value.
5219 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5220 bind(done); 5221 } 5222 5223 5224 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5225 XMMRegister xtmp, Register rscratch, int vec_enc) { 5226 switch(to_elem_bt) { 5227 case T_SHORT: 5228 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5229 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5230 vpackusdw(dst, dst, zero, vec_enc); 5231 if (vec_enc == Assembler::AVX_256bit) { 5232 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5233 } 5234 break; 5235 case T_BYTE: 5236 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5237 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5238 vpackusdw(dst, dst, zero, vec_enc); 5239 if (vec_enc == Assembler::AVX_256bit) { 5240 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5241 } 5242 vpackuswb(dst, dst, zero, vec_enc); 5243 break; 5244 default: assert(false, "%s", type2name(to_elem_bt)); 5245 } 5246 } 5247 5248 /* 5249 * Algorithm for vector D2L and F2I conversions:- 5250 * a) Perform vector D2L/F2I cast. 5251 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5252 * It signifies that source value could be any of the special floating point 5253 * values(NaN,-Inf,Inf,Max,-Min). 5254 * c) Set destination to zero if source is NaN value. 5255 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5256 */ 5257 5258 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5259 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5260 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5261 int to_elem_sz = type2aelembytes(to_elem_bt); 5262 assert(to_elem_sz <= 4, ""); 5263 vcvttps2dq(dst, src, vec_enc); 5264 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5265 if (to_elem_sz < 4) { 5266 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5267 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5268 } 5269 } 5270 5271 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5272 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5273 Register rscratch, int vec_enc) { 5274 int to_elem_sz = type2aelembytes(to_elem_bt); 5275 assert(to_elem_sz <= 4, ""); 5276 vcvttps2dq(dst, src, vec_enc); 5277 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5278 switch(to_elem_bt) { 5279 case T_INT: 5280 break; 5281 case T_SHORT: 5282 evpmovdw(dst, dst, vec_enc); 5283 break; 5284 case T_BYTE: 5285 evpmovdb(dst, dst, vec_enc); 5286 break; 5287 default: assert(false, "%s", type2name(to_elem_bt)); 5288 } 5289 } 5290 5291 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5292 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5293 Register rscratch, int vec_enc) { 5294 evcvttps2qq(dst, src, vec_enc); 5295 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5296 } 5297 5298 // Handling for downcasting from double to integer or sub-word types on AVX2. 5299 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5300 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5301 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5302 int to_elem_sz = type2aelembytes(to_elem_bt); 5303 assert(to_elem_sz < 8, ""); 5304 vcvttpd2dq(dst, src, vec_enc); 5305 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5306 float_sign_flip, vec_enc); 5307 if (to_elem_sz < 4) { 5308 // xtmp4 holds all zero lanes. 5309 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5310 } 5311 } 5312 5313 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5314 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5315 KRegister ktmp2, AddressLiteral sign_flip, 5316 Register rscratch, int vec_enc) { 5317 if (VM_Version::supports_avx512dq()) { 5318 evcvttpd2qq(dst, src, vec_enc); 5319 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5320 switch(to_elem_bt) { 5321 case T_LONG: 5322 break; 5323 case T_INT: 5324 evpmovsqd(dst, dst, vec_enc); 5325 break; 5326 case T_SHORT: 5327 evpmovsqd(dst, dst, vec_enc); 5328 evpmovdw(dst, dst, vec_enc); 5329 break; 5330 case T_BYTE: 5331 evpmovsqd(dst, dst, vec_enc); 5332 evpmovdb(dst, dst, vec_enc); 5333 break; 5334 default: assert(false, "%s", type2name(to_elem_bt)); 5335 } 5336 } else { 5337 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5338 vcvttpd2dq(dst, src, vec_enc); 5339 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5340 switch(to_elem_bt) { 5341 case T_INT: 5342 break; 5343 case T_SHORT: 5344 evpmovdw(dst, dst, vec_enc); 5345 break; 5346 case T_BYTE: 5347 evpmovdb(dst, dst, vec_enc); 5348 break; 5349 default: assert(false, "%s", type2name(to_elem_bt)); 5350 } 5351 } 5352 } 5353 5354 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5355 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5356 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5357 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5358 // and re-instantiate original MXCSR.RC mode after that. 5359 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5360 5361 mov64(tmp, julong_cast(0.5L)); 5362 evpbroadcastq(xtmp1, tmp, vec_enc); 5363 vaddpd(xtmp1, src , xtmp1, vec_enc); 5364 evcvtpd2qq(dst, xtmp1, vec_enc); 5365 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5366 double_sign_flip, vec_enc);; 5367 5368 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5369 } 5370 5371 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5372 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5373 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5374 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5375 // and re-instantiate original MXCSR.RC mode after that. 
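// Worked example (scalar, illustrative only): with MXCSR.RC = round towards -inf,
//   round(2.3f)  -> convert(2.3f + 0.5f)  = floor(2.8f)  =  2
//   round(-2.5f) -> convert(-2.5f + 0.5f) = floor(-2.0f) = -2
// which matches the floor(val + 0.5) semantics described above.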
5376 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5377 5378 movl(tmp, jint_cast(0.5)); 5379 movq(xtmp1, tmp); 5380 vbroadcastss(xtmp1, xtmp1, vec_enc); 5381 vaddps(xtmp1, src , xtmp1, vec_enc); 5382 vcvtps2dq(dst, xtmp1, vec_enc); 5383 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5384 float_sign_flip, vec_enc); 5385 5386 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5387 } 5388 5389 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5390 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5391 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5392 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5393 // and re-instantiate original MXCSR.RC mode after that. 5394 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5395 5396 movl(tmp, jint_cast(0.5)); 5397 movq(xtmp1, tmp); 5398 vbroadcastss(xtmp1, xtmp1, vec_enc); 5399 vaddps(xtmp1, src , xtmp1, vec_enc); 5400 vcvtps2dq(dst, xtmp1, vec_enc); 5401 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5402 5403 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5404 } 5405 5406 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5407 BasicType from_elem_bt, BasicType to_elem_bt) { 5408 switch (from_elem_bt) { 5409 case T_BYTE: 5410 switch (to_elem_bt) { 5411 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5412 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5413 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5414 default: ShouldNotReachHere(); 5415 } 5416 break; 5417 case T_SHORT: 5418 switch (to_elem_bt) { 5419 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5420 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5421 default: ShouldNotReachHere(); 5422 } 5423 break; 5424 case T_INT: 5425 assert(to_elem_bt == T_LONG, ""); 5426 vpmovzxdq(dst, src, vlen_enc); 5427 break; 5428 default: 5429 ShouldNotReachHere(); 5430 } 5431 } 5432 5433 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5434 BasicType from_elem_bt, BasicType to_elem_bt) { 5435 switch (from_elem_bt) { 5436 case T_BYTE: 5437 switch (to_elem_bt) { 5438 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5439 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5440 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5441 default: ShouldNotReachHere(); 5442 } 5443 break; 5444 case T_SHORT: 5445 switch (to_elem_bt) { 5446 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5447 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5448 default: ShouldNotReachHere(); 5449 } 5450 break; 5451 case T_INT: 5452 assert(to_elem_bt == T_LONG, ""); 5453 vpmovsxdq(dst, src, vlen_enc); 5454 break; 5455 default: 5456 ShouldNotReachHere(); 5457 } 5458 } 5459 5460 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5461 BasicType dst_bt, BasicType src_bt, int vlen) { 5462 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5463 assert(vlen_enc != AVX_512bit, ""); 5464 5465 int dst_bt_size = type2aelembytes(dst_bt); 5466 int src_bt_size = type2aelembytes(src_bt); 5467 if (dst_bt_size > src_bt_size) { 5468 switch (dst_bt_size / src_bt_size) { 5469 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5470 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5471 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5472 default: ShouldNotReachHere(); 5473 } 5474 } else { 5475 assert(dst_bt_size < src_bt_size, ""); 5476 switch (src_bt_size / dst_bt_size) { 5477 case 2: { 5478 if (vlen_enc == AVX_128bit) { 5479 vpacksswb(dst, src, src, vlen_enc); 5480 } else { 5481 vpacksswb(dst, src, src, vlen_enc); 5482 vpermq(dst, dst, 0x08, vlen_enc); 5483 } 5484 break; 5485 } 5486 case 4: { 5487 if (vlen_enc == AVX_128bit) { 5488 vpackssdw(dst, src, src, vlen_enc); 5489 vpacksswb(dst, dst, dst, vlen_enc); 5490 } else { 5491 vpackssdw(dst, src, src, vlen_enc); 5492 vpermq(dst, dst, 0x08, vlen_enc); 5493 vpacksswb(dst, dst, dst, AVX_128bit); 5494 } 5495 break; 5496 } 5497 case 8: { 5498 if (vlen_enc == AVX_128bit) { 5499 vpshufd(dst, src, 0x08, vlen_enc); 5500 vpackssdw(dst, dst, dst, vlen_enc); 5501 vpacksswb(dst, dst, dst, vlen_enc); 5502 } else { 5503 vpshufd(dst, src, 0x08, vlen_enc); 5504 vpermq(dst, dst, 0x08, vlen_enc); 5505 vpackssdw(dst, dst, dst, AVX_128bit); 5506 vpacksswb(dst, dst, dst, AVX_128bit); 5507 } 5508 break; 5509 } 5510 default: ShouldNotReachHere(); 5511 } 5512 } 5513 } 5514 5515 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5516 bool merge, BasicType bt, int vlen_enc) { 5517 if (bt == T_INT) { 5518 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5519 } else { 5520 assert(bt == T_LONG, ""); 5521 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5522 } 5523 } 5524 5525 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5526 bool merge, BasicType bt, int vlen_enc) { 5527 if (bt == T_INT) { 5528 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5529 } else { 5530 assert(bt == T_LONG, ""); 5531 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5532 } 5533 } 5534 5535 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5536 Register rtmp2, XMMRegister xtmp, int mask_len, 5537 int vec_enc) { 5538 int index = 0; 5539 int vindex = 0; 5540 mov64(rtmp1, 0x0101010101010101L); 5541 pdepq(rtmp1, src, rtmp1); 5542 if (mask_len > 8) { 5543 movq(rtmp2, src); 5544 vpxor(xtmp, xtmp, xtmp, vec_enc); 5545 movq(xtmp, rtmp1); 5546 } 5547 movq(dst, rtmp1); 5548 5549 mask_len -= 8; 5550 while (mask_len > 0) { 5551 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5552 index++; 5553 if ((index % 2) == 0) { 5554 pxor(xtmp, xtmp); 5555 } 5556 mov64(rtmp1, 0x0101010101010101L); 5557 shrq(rtmp2, 8); 5558 pdepq(rtmp1, rtmp2, rtmp1); 5559 pinsrq(xtmp, rtmp1, index % 2); 5560 vindex = index / 2; 5561 if (vindex) { 5562 // Write entire 16 byte vector when both 64 bit 5563 // lanes are update to save redundant instructions. 
5564 if (index % 2) { 5565 vinsertf128(dst, dst, xtmp, vindex); 5566 } 5567 } else { 5568 vmovdqu(dst, xtmp); 5569 } 5570 mask_len -= 8; 5571 } 5572 } 5573 5574 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5575 switch(opc) { 5576 case Op_VectorMaskTrueCount: 5577 popcntq(dst, tmp); 5578 break; 5579 case Op_VectorMaskLastTrue: 5580 if (VM_Version::supports_lzcnt()) { 5581 lzcntq(tmp, tmp); 5582 movl(dst, 63); 5583 subl(dst, tmp); 5584 } else { 5585 movl(dst, -1); 5586 bsrq(tmp, tmp); 5587 cmov32(Assembler::notZero, dst, tmp); 5588 } 5589 break; 5590 case Op_VectorMaskFirstTrue: 5591 if (VM_Version::supports_bmi1()) { 5592 if (masklen < 32) { 5593 orl(tmp, 1 << masklen); 5594 tzcntl(dst, tmp); 5595 } else if (masklen == 32) { 5596 tzcntl(dst, tmp); 5597 } else { 5598 assert(masklen == 64, ""); 5599 tzcntq(dst, tmp); 5600 } 5601 } else { 5602 if (masklen < 32) { 5603 orl(tmp, 1 << masklen); 5604 bsfl(dst, tmp); 5605 } else { 5606 assert(masklen == 32 || masklen == 64, ""); 5607 movl(dst, masklen); 5608 if (masklen == 32) { 5609 bsfl(tmp, tmp); 5610 } else { 5611 bsfq(tmp, tmp); 5612 } 5613 cmov32(Assembler::notZero, dst, tmp); 5614 } 5615 } 5616 break; 5617 case Op_VectorMaskToLong: 5618 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5619 break; 5620 default: assert(false, "Unhandled mask operation"); 5621 } 5622 } 5623 5624 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5625 int masklen, int masksize, int vec_enc) { 5626 assert(VM_Version::supports_popcnt(), ""); 5627 5628 if(VM_Version::supports_avx512bw()) { 5629 kmovql(tmp, mask); 5630 } else { 5631 assert(masklen <= 16, ""); 5632 kmovwl(tmp, mask); 5633 } 5634 5635 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5636 // operations needs to be clipped. 5637 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5638 andq(tmp, (1 << masklen) - 1); 5639 } 5640 5641 vector_mask_operation_helper(opc, dst, tmp, masklen); 5642 } 5643 5644 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5645 Register tmp, int masklen, BasicType bt, int vec_enc) { 5646 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5647 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5648 assert(VM_Version::supports_popcnt(), ""); 5649 5650 bool need_clip = false; 5651 switch(bt) { 5652 case T_BOOLEAN: 5653 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5654 vpxor(xtmp, xtmp, xtmp, vec_enc); 5655 vpsubb(xtmp, xtmp, mask, vec_enc); 5656 vpmovmskb(tmp, xtmp, vec_enc); 5657 need_clip = masklen < 16; 5658 break; 5659 case T_BYTE: 5660 vpmovmskb(tmp, mask, vec_enc); 5661 need_clip = masklen < 16; 5662 break; 5663 case T_SHORT: 5664 vpacksswb(xtmp, mask, mask, vec_enc); 5665 if (masklen >= 16) { 5666 vpermpd(xtmp, xtmp, 8, vec_enc); 5667 } 5668 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5669 need_clip = masklen < 16; 5670 break; 5671 case T_INT: 5672 case T_FLOAT: 5673 vmovmskps(tmp, mask, vec_enc); 5674 need_clip = masklen < 4; 5675 break; 5676 case T_LONG: 5677 case T_DOUBLE: 5678 vmovmskpd(tmp, mask, vec_enc); 5679 need_clip = masklen < 2; 5680 break; 5681 default: assert(false, "Unhandled type, %s", type2name(bt)); 5682 } 5683 5684 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5685 // operations needs to be clipped. 
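// e.g. for masklen == 4 only the low four bits of tmp are meaningful, so the clip below is tmp &= 0xF.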
5686 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5687 // need_clip implies masklen < 32 5688 andq(tmp, (1 << masklen) - 1); 5689 } 5690 5691 vector_mask_operation_helper(opc, dst, tmp, masklen); 5692 } 5693 5694 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5695 Register rtmp2, int mask_len) { 5696 kmov(rtmp1, src); 5697 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5698 mov64(rtmp2, -1L); 5699 pextq(rtmp2, rtmp2, rtmp1); 5700 kmov(dst, rtmp2); 5701 } 5702 5703 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5704 XMMRegister mask, Register rtmp, Register rscratch, 5705 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5706 int vec_enc) { 5707 assert(type2aelembytes(bt) >= 4, ""); 5708 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5709 address compress_perm_table = nullptr; 5710 address expand_perm_table = nullptr; 5711 if (type2aelembytes(bt) == 8) { 5712 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5713 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5714 vmovmskpd(rtmp, mask, vec_enc); 5715 } else { 5716 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5717 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5718 vmovmskps(rtmp, mask, vec_enc); 5719 } 5720 shlq(rtmp, 5); // for 32 byte permute row. 5721 if (opcode == Op_CompressV) { 5722 lea(rscratch, ExternalAddress(compress_perm_table)); 5723 } else { 5724 lea(rscratch, ExternalAddress(expand_perm_table)); 5725 } 5726 addptr(rtmp, rscratch); 5727 vmovdqu(permv, Address(rtmp)); 5728 vpermps(dst, permv, src, Assembler::AVX_256bit); 5729 vpxor(xtmp, xtmp, xtmp, vec_enc); 5730 // Blend the result with zero vector using permute mask, each column entry 5731 // in a permute table row contains either a valid permute index or a -1 (default) 5732 // value, this can potentially be used as a blending mask after 5733 // compressing/expanding the source vector lanes. 
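// Illustrative example (CompressV, 32-bit lanes, 256-bit vector, assuming the row layout described
// above): for mask 0b00000101 the permute row would be {0, 2, -1, -1, -1, -1, -1, -1}; lanes 0 and 2
// are packed to the front and the -1 columns, having their sign bit set, select the zero vector in
// the blend below.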
5734 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5735 } 5736 5737 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5738 bool merge, BasicType bt, int vec_enc) { 5739 if (opcode == Op_CompressV) { 5740 switch(bt) { 5741 case T_BYTE: 5742 evpcompressb(dst, mask, src, merge, vec_enc); 5743 break; 5744 case T_CHAR: 5745 case T_SHORT: 5746 evpcompressw(dst, mask, src, merge, vec_enc); 5747 break; 5748 case T_INT: 5749 evpcompressd(dst, mask, src, merge, vec_enc); 5750 break; 5751 case T_FLOAT: 5752 evcompressps(dst, mask, src, merge, vec_enc); 5753 break; 5754 case T_LONG: 5755 evpcompressq(dst, mask, src, merge, vec_enc); 5756 break; 5757 case T_DOUBLE: 5758 evcompresspd(dst, mask, src, merge, vec_enc); 5759 break; 5760 default: 5761 fatal("Unsupported type %s", type2name(bt)); 5762 break; 5763 } 5764 } else { 5765 assert(opcode == Op_ExpandV, ""); 5766 switch(bt) { 5767 case T_BYTE: 5768 evpexpandb(dst, mask, src, merge, vec_enc); 5769 break; 5770 case T_CHAR: 5771 case T_SHORT: 5772 evpexpandw(dst, mask, src, merge, vec_enc); 5773 break; 5774 case T_INT: 5775 evpexpandd(dst, mask, src, merge, vec_enc); 5776 break; 5777 case T_FLOAT: 5778 evexpandps(dst, mask, src, merge, vec_enc); 5779 break; 5780 case T_LONG: 5781 evpexpandq(dst, mask, src, merge, vec_enc); 5782 break; 5783 case T_DOUBLE: 5784 evexpandpd(dst, mask, src, merge, vec_enc); 5785 break; 5786 default: 5787 fatal("Unsupported type %s", type2name(bt)); 5788 break; 5789 } 5790 } 5791 } 5792 5793 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5794 KRegister ktmp1, int vec_enc) { 5795 if (opcode == Op_SignumVD) { 5796 vsubpd(dst, zero, one, vec_enc); 5797 // if src < 0 ? -1 : 1 5798 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5799 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5800 // if src == NaN, -0.0 or 0.0 return src. 5801 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5802 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5803 } else { 5804 assert(opcode == Op_SignumVF, ""); 5805 vsubps(dst, zero, one, vec_enc); 5806 // if src < 0 ? -1 : 1 5807 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5808 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5809 // if src == NaN, -0.0 or 0.0 return src. 5810 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5811 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5812 } 5813 } 5814 5815 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5816 XMMRegister xtmp1, int vec_enc) { 5817 if (opcode == Op_SignumVD) { 5818 vsubpd(dst, zero, one, vec_enc); 5819 // if src < 0 ? -1 : 1 5820 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5821 // if src == NaN, -0.0 or 0.0 return src. 5822 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5823 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5824 } else { 5825 assert(opcode == Op_SignumVF, ""); 5826 vsubps(dst, zero, one, vec_enc); 5827 // if src < 0 ? -1 : 1 5828 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5829 // if src == NaN, -0.0 or 0.0 return src. 
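// Scalar equivalent (illustrative only; signum() is a hypothetical helper mirroring Math.signum
// semantics):
//   float signum(float x) { return (x != x || x == 0.0f) ? x : (x < 0.0f ? -1.0f : 1.0f); }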
5830 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5831 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5832 } 5833 } 5834 5835 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5836 if (VM_Version::supports_avx512bw()) { 5837 if (mask_len > 32) { 5838 kmovql(dst, src); 5839 } else { 5840 kmovdl(dst, src); 5841 if (mask_len != 32) { 5842 kshiftrdl(dst, dst, 32 - mask_len); 5843 } 5844 } 5845 } else { 5846 assert(mask_len <= 16, ""); 5847 kmovwl(dst, src); 5848 if (mask_len != 16) { 5849 kshiftrwl(dst, dst, 16 - mask_len); 5850 } 5851 } 5852 } 5853 5854 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5855 int lane_size = type2aelembytes(bt); 5856 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5857 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5858 movptr(rtmp, imm32); 5859 switch(lane_size) { 5860 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5861 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5862 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5863 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5864 default : fatal("Unsupported lane size %d", lane_size); 5865 break; 5866 } 5867 } else { 5868 movptr(rtmp, imm32); 5869 movq(dst, rtmp); 5870 switch(lane_size) { 5871 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5872 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5873 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5874 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5875 default : fatal("Unsupported lane size %d", lane_size); 5876 break; 5877 } 5878 } 5879 } 5880 5881 // 5882 // Following is lookup table based popcount computation algorithm:- 5883 // Index Bit set count 5884 // [ 0000 -> 0, 5885 // 0001 -> 1, 5886 // 0010 -> 1, 5887 // 0011 -> 2, 5888 // 0100 -> 1, 5889 // 0101 -> 2, 5890 // 0110 -> 2, 5891 // 0111 -> 3, 5892 // 1000 -> 1, 5893 // 1001 -> 2, 5894 // 1010 -> 2, 5895 // 1011 -> 3, 5896 // 1100 -> 2, 5897 // 1101 -> 3, 5898 // 1110 -> 3, 1111 -> 4 ] 5899 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5900 // shuffle indices for lookup table access. 5901 // b. Right shift each byte of vector lane by 4 positions. 5902 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5903 // shuffle indices for lookup table access. 5904 // d. Add the bitset count of upper and lower 4 bits of each byte. 5905 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5906 // count of all the bytes of a quadword. 5907 // f. Perform step e. for upper 128bit vector lane. 5908 // g. Pack the bitset count of quadwords back to double word. 5909 // h. Unpacking and packing operations are not needed for 64bit vector lane.
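// A scalar sketch of steps a-d for a single byte (illustrative only; lut[] below is simply the
// nibble popcount table, analogous to the vector_popcount_lut stub constant):
//   static const uint8_t lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   uint8_t popcount_byte(uint8_t b) { return lut[b & 0x0F] + lut[b >> 4]; }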
5910 5911 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5912 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5913 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5914 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5915 vpsrlw(dst, src, 4, vec_enc); 5916 vpand(dst, dst, xtmp1, vec_enc); 5917 vpand(xtmp1, src, xtmp1, vec_enc); 5918 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5919 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5920 vpshufb(dst, xtmp2, dst, vec_enc); 5921 vpaddb(dst, dst, xtmp1, vec_enc); 5922 } 5923 5924 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5925 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5926 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5927 // Following code is as per steps e,f,g and h of above algorithm. 5928 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5929 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5930 vpsadbw(dst, dst, xtmp2, vec_enc); 5931 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5932 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5933 vpackuswb(dst, xtmp1, dst, vec_enc); 5934 } 5935 5936 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5937 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5938 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5939 // Add the popcount of upper and lower bytes of word. 5940 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5941 vpsrlw(dst, xtmp1, 8, vec_enc); 5942 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5943 vpaddw(dst, dst, xtmp1, vec_enc); 5944 } 5945 5946 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5947 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5948 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5949 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5950 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5951 } 5952 5953 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5954 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5955 switch(bt) { 5956 case T_LONG: 5957 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5958 break; 5959 case T_INT: 5960 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5961 break; 5962 case T_CHAR: 5963 case T_SHORT: 5964 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5965 break; 5966 case T_BYTE: 5967 case T_BOOLEAN: 5968 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5969 break; 5970 default: 5971 fatal("Unsupported type %s", type2name(bt)); 5972 break; 5973 } 5974 } 5975 5976 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5977 KRegister mask, bool merge, int vec_enc) { 5978 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5979 switch(bt) { 5980 case T_LONG: 5981 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5982 evpopcntq(dst, mask, src, merge, vec_enc); 5983 break; 5984 case T_INT: 5985 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5986 evpopcntd(dst, mask, src, merge, vec_enc); 5987 break; 5988 case T_CHAR: 5989 case T_SHORT: 5990 assert(VM_Version::supports_avx512_bitalg(), ""); 5991 evpopcntw(dst, mask, src, merge, vec_enc); 5992 break; 5993 case T_BYTE: 5994 case T_BOOLEAN: 5995 assert(VM_Version::supports_avx512_bitalg(), ""); 5996 evpopcntb(dst, mask, 
src, merge, vec_enc); 5997 break; 5998 default: 5999 fatal("Unsupported type %s", type2name(bt)); 6000 break; 6001 } 6002 } 6003 6004 // Bit reversal algorithm first reverses the bits of each byte followed by 6005 // a byte level reversal for multi-byte primitive types (short/int/long). 6006 // Algorithm performs a lookup table access to get reverse bit sequence 6007 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6008 // is obtained by swapping the reverse bit sequences of upper and lower 6009 // nibble of a byte. 6010 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6011 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6012 if (VM_Version::supports_avx512vlbw()) { 6013 6014 // Get the reverse bit sequence of lower nibble of each byte. 6015 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6016 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6017 evpandq(dst, xtmp2, src, vec_enc); 6018 vpshufb(dst, xtmp1, dst, vec_enc); 6019 vpsllq(dst, dst, 4, vec_enc); 6020 6021 // Get the reverse bit sequence of upper nibble of each byte. 6022 vpandn(xtmp2, xtmp2, src, vec_enc); 6023 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6024 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6025 6026 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6027 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6028 evporq(xtmp2, dst, xtmp2, vec_enc); 6029 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6030 6031 } else if(vec_enc == Assembler::AVX_512bit) { 6032 // Shift based bit reversal. 6033 assert(bt == T_LONG || bt == T_INT, ""); 6034 6035 // Swap lower and upper nibble of each byte. 6036 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6037 6038 // Swap two least and most significant bits of each nibble. 6039 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6040 6041 // Swap adjacent pair of bits. 6042 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6043 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6044 6045 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6046 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6047 } else { 6048 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6049 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6050 6051 // Get the reverse bit sequence of lower nibble of each byte. 6052 vpand(dst, xtmp2, src, vec_enc); 6053 vpshufb(dst, xtmp1, dst, vec_enc); 6054 vpsllq(dst, dst, 4, vec_enc); 6055 6056 // Get the reverse bit sequence of upper nibble of each byte. 6057 vpandn(xtmp2, xtmp2, src, vec_enc); 6058 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6059 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6060 6061 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6062 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6063 vpor(xtmp2, dst, xtmp2, vec_enc); 6064 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6065 } 6066 } 6067 6068 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6069 XMMRegister xtmp, Register rscratch) { 6070 assert(VM_Version::supports_gfni(), ""); 6071 assert(rscratch != noreg || always_reachable(mask), "missing"); 6072 6073 // Galois field instruction based bit reversal based on following algorithm. 
6074 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6075 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6076 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6077 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6078 } 6079 6080 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6081 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6082 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6083 evpandq(dst, xtmp1, src, vec_enc); 6084 vpsllq(dst, dst, nbits, vec_enc); 6085 vpandn(xtmp1, xtmp1, src, vec_enc); 6086 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6087 evporq(dst, dst, xtmp1, vec_enc); 6088 } 6089 6090 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6091 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6092 // Shift based bit reversal. 6093 assert(VM_Version::supports_evex(), ""); 6094 switch(bt) { 6095 case T_LONG: 6096 // Swap upper and lower double word of each quad word. 6097 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6098 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6099 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6100 break; 6101 case T_INT: 6102 // Swap upper and lower word of each double word. 6103 evprord(xtmp1, k0, src, 16, true, vec_enc); 6104 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6105 break; 6106 case T_CHAR: 6107 case T_SHORT: 6108 // Swap upper and lower byte of each word. 6109 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6110 break; 6111 case T_BYTE: 6112 evmovdquq(dst, k0, src, true, vec_enc); 6113 break; 6114 default: 6115 fatal("Unsupported type %s", type2name(bt)); 6116 break; 6117 } 6118 } 6119 6120 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6121 if (bt == T_BYTE) { 6122 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6123 evmovdquq(dst, k0, src, true, vec_enc); 6124 } else { 6125 vmovdqu(dst, src); 6126 } 6127 return; 6128 } 6129 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6130 // pre-computed shuffle indices. 
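// e.g. for T_INT the permutation conceptually reverses every 4-byte group, so a lane holding
// bytes b0 b1 b2 b3 (low to high) becomes b3 b2 b1 b0.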
6131 switch(bt) { 6132 case T_LONG: 6133 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6134 break; 6135 case T_INT: 6136 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6137 break; 6138 case T_CHAR: 6139 case T_SHORT: 6140 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6141 break; 6142 default: 6143 fatal("Unsupported type %s", type2name(bt)); 6144 break; 6145 } 6146 vpshufb(dst, src, dst, vec_enc); 6147 } 6148 6149 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6150 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6151 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6152 assert(is_integral_type(bt), ""); 6153 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6154 assert(VM_Version::supports_avx512cd(), ""); 6155 switch(bt) { 6156 case T_LONG: 6157 evplzcntq(dst, ktmp, src, merge, vec_enc); 6158 break; 6159 case T_INT: 6160 evplzcntd(dst, ktmp, src, merge, vec_enc); 6161 break; 6162 case T_SHORT: 6163 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6164 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6165 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6166 vpunpckhwd(dst, xtmp1, src, vec_enc); 6167 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6168 vpackusdw(dst, xtmp2, dst, vec_enc); 6169 break; 6170 case T_BYTE: 6171 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6172 // accessing the lookup table. 6173 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6174 // accessing the lookup table. 6175 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6176 assert(VM_Version::supports_avx512bw(), ""); 6177 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6178 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6179 vpand(xtmp2, dst, src, vec_enc); 6180 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6181 vpsrlw(xtmp3, src, 4, vec_enc); 6182 vpand(xtmp3, dst, xtmp3, vec_enc); 6183 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6184 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6185 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6186 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6187 break; 6188 default: 6189 fatal("Unsupported type %s", type2name(bt)); 6190 break; 6191 } 6192 } 6193 6194 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6195 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6196 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6197 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6198 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6199 // accessing the lookup table. 6200 vpand(dst, xtmp2, src, vec_enc); 6201 vpshufb(dst, xtmp1, dst, vec_enc); 6202 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6203 // accessing the lookup table. 6204 vpsrlw(xtmp3, src, 4, vec_enc); 6205 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6206 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6207 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
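// Scalar sketch of the selection below (illustrative only; clz4[] is a hypothetical 16-entry table
// of 4-bit leading zero counts with clz4[0] == 4):
//   uint8_t clz_byte(uint8_t b) { return (b >> 4) ? clz4[b >> 4] : 4 + clz4[b & 0x0F]; }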
6208 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6209 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6210 vpaddb(dst, dst, xtmp2, vec_enc); 6211 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6212 } 6213 6214 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6215 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6216 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6217 // Add zero counts of lower byte and upper byte of a word if 6218 // upper byte holds a zero value. 6219 vpsrlw(xtmp3, src, 8, vec_enc); 6220 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6221 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6222 vpsllw(xtmp2, dst, 8, vec_enc); 6223 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6224 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6225 vpsrlw(dst, dst, 8, vec_enc); 6226 } 6227 6228 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6229 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6230 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6231 // hence biased exponent can be used to compute leading zero count as per 6232 // following formula:- 6233 // LZCNT = 31 - (biased_exp - 127) 6234 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6235 6236 // Broadcast 0xFF 6237 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6238 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6239 6240 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6241 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6242 // contributes to the leading number of zeros. 6243 vpsrld(xtmp2, src, 1, vec_enc); 6244 vpandn(xtmp3, xtmp2, src, vec_enc); 6245 6246 // Extract biased exponent. 6247 vcvtdq2ps(dst, xtmp3, vec_enc); 6248 vpsrld(dst, dst, 23, vec_enc); 6249 vpand(dst, dst, xtmp1, vec_enc); 6250 6251 // Broadcast 127. 6252 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6253 // Exponent = biased_exp - 127 6254 vpsubd(dst, dst, xtmp1, vec_enc); 6255 6256 // Exponent_plus_one = Exponent + 1 6257 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6258 vpaddd(dst, dst, xtmp3, vec_enc); 6259 6260 // Replace -ve exponent with zero, exponent is -ve when src 6261 // lane contains a zero value. 6262 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6263 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6264 6265 // Rematerialize broadcast 32. 6266 vpslld(xtmp1, xtmp3, 5, vec_enc); 6267 // Exponent is 32 if corresponding source lane contains max_int value. 6268 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6269 // LZCNT = 32 - exponent_plus_one 6270 vpsubd(dst, xtmp1, dst, vec_enc); 6271 6272 // Replace LZCNT with a value 1 if corresponding source lane 6273 // contains max_int value. 6274 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6275 6276 // Replace biased_exp with 0 if source lane value is less than zero. 6277 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6278 vblendvps(dst, dst, xtmp2, src, vec_enc); 6279 } 6280 6281 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6282 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6283 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6284 // Add zero counts of lower word and upper word of a double word if 6285 // upper word holds a zero value. 
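// i.e. per 32-bit lane: clz32(x) = (x >> 16) ? clz16(x >> 16) : 16 + clz16(x & 0xFFFF);
// the quadword step below repeats the same combine with 32-bit halves.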
6286 vpsrld(xtmp3, src, 16, vec_enc); 6287 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6288 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6289 vpslld(xtmp2, dst, 16, vec_enc); 6290 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6291 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6292 vpsrld(dst, dst, 16, vec_enc); 6293 // Add zero counts of lower doubleword and upper doubleword of a 6294 // quadword if upper doubleword holds a zero value. 6295 vpsrlq(xtmp3, src, 32, vec_enc); 6296 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6297 vpsllq(xtmp2, dst, 32, vec_enc); 6298 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6299 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6300 vpsrlq(dst, dst, 32, vec_enc); 6301 } 6302 6303 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6304 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6305 Register rtmp, int vec_enc) { 6306 assert(is_integral_type(bt), "unexpected type"); 6307 assert(vec_enc < Assembler::AVX_512bit, ""); 6308 switch(bt) { 6309 case T_LONG: 6310 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6311 break; 6312 case T_INT: 6313 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6314 break; 6315 case T_SHORT: 6316 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6317 break; 6318 case T_BYTE: 6319 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6320 break; 6321 default: 6322 fatal("Unsupported type %s", type2name(bt)); 6323 break; 6324 } 6325 } 6326 6327 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6328 switch(bt) { 6329 case T_BYTE: 6330 vpsubb(dst, src1, src2, vec_enc); 6331 break; 6332 case T_SHORT: 6333 vpsubw(dst, src1, src2, vec_enc); 6334 break; 6335 case T_INT: 6336 vpsubd(dst, src1, src2, vec_enc); 6337 break; 6338 case T_LONG: 6339 vpsubq(dst, src1, src2, vec_enc); 6340 break; 6341 default: 6342 fatal("Unsupported type %s", type2name(bt)); 6343 break; 6344 } 6345 } 6346 6347 // Trailing zero count computation is based on leading zero count operation as per 6348 // following equation. All AVX3 targets support AVX512CD feature which offers 6349 // direct vector instruction to compute leading zero count. 
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
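    // (Illustrative, not emitted code: after this nibble swap a byte holding 0xAB becomes 0xBA;
    // the bswapq below then reverses the byte order to complete the 64-bit bit reversal.)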
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all multiples
  // of an index value are placed at the same relative position within a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be the 16th element in their respective 128 bit lanes.
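  // Illustrative example (not emitted code): shuffle index 35 has lower 4 bits equal to 3 and
  // lies in the range 32 <= INDEX < 48, so it selects byte 3 of the third 128 bit lane of src,
  // which the corresponding pass below supplies by broadcasting that lane.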
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices and move the shuffled lanes corresponding to the true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison:
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper-bound saturation exists.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // Res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
// unsigned addition operation:
//   overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
//   overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // T1 = Minimum signed value.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equal-polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and the
    // result polarity does not comply with the first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute the overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute a mask based on the first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold a MIN value.
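  // Illustrative example (not emitted code): for T_INT, MAX_VALUE + 1 overflows with a
  // non-negative first input and saturates to MAX_VALUE, while MIN_VALUE - 1 overflows
  // with a negative first input and saturates to MIN_VALUE.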
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using the overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}

void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equal-polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and the
    // result polarity does not comply with the first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute the overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose the saturating MIN/MAX vector using the first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend the result with the saturating vector using the overflow detection mask.
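  // (Illustrative note, not emitted code: vpblendvb selects per byte based on the MSB of each mask
  // byte; the sign-extension above makes every byte of an overflowing lane 0xFF, so whole lanes
  // are replaced by their saturated value.)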
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst,
                                 XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move the sign bits of src2 to the mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a positive value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move the sign bits of src1 to the mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a negative value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
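// Illustrative examples (not emitted code) of the Java-style max/min semantics preserved by the
// swap-and-fixup sequence above: max(-0.0, +0.0) == +0.0, min(-0.0, +0.0) == -0.0, and
// max(x, NaN) == min(x, NaN) == NaN for any Float16 value x.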