1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 53 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 54 55 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 56 // Remove word for return addr 57 framesize -= wordSize; 58 stack_bang_size -= wordSize; 59 60 // Calls to C2R adapters often do not accept exceptional returns. 61 // We require that their callers must bang for them. But be careful, because 62 // some VM calls (such as call site linkage) can use several kilobytes of 63 // stack. But the stack safety zone should account for that. 64 // See bugs 4446381, 4468289, 4497237. 65 if (stack_bang_size > 0) { 66 generate_stack_overflow_check(stack_bang_size); 67 68 // We always push rbp, so that on return to interpreter rbp, will be 69 // restored correctly and we can correct the stack. 70 push(rbp); 71 // Save caller's stack pointer into RBP if the frame pointer is preserved. 72 if (PreserveFramePointer) { 73 mov(rbp, rsp); 74 } 75 // Remove word for ebp 76 framesize -= wordSize; 77 78 // Create frame 79 if (framesize) { 80 subptr(rsp, framesize); 81 } 82 } else { 83 subptr(rsp, framesize); 84 85 // Save RBP register now. 86 framesize -= wordSize; 87 movptr(Address(rsp, framesize), rbp); 88 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
89 if (PreserveFramePointer) { 90 movptr(rbp, rsp); 91 if (framesize > 0) { 92 addptr(rbp, framesize); 93 } 94 } 95 } 96 97 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 98 framesize -= wordSize; 99 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 100 } 101 102 #ifdef ASSERT 103 if (VerifyStackAtCalls) { 104 Label L; 105 push(rax); 106 mov(rax, rsp); 107 andptr(rax, StackAlignmentInBytes-1); 108 cmpptr(rax, StackAlignmentInBytes-wordSize); 109 pop(rax); 110 jcc(Assembler::equal, L); 111 STOP("Stack is not properly aligned!"); 112 bind(L); 113 } 114 #endif 115 116 if (!is_stub) { 117 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 118 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 119 Label dummy_slow_path; 120 Label dummy_continuation; 121 Label* slow_path = &dummy_slow_path; 122 Label* continuation = &dummy_continuation; 123 if (!Compile::current()->output()->in_scratch_emit_size()) { 124 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 125 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 126 Compile::current()->output()->add_stub(stub); 127 slow_path = &stub->entry(); 128 continuation = &stub->continuation(); 129 } 130 bs->nmethod_entry_barrier(this, slow_path, continuation); 131 } 132 } 133 134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 135 switch (vlen_in_bytes) { 136 case 4: // fall-through 137 case 8: // fall-through 138 case 16: return Assembler::AVX_128bit; 139 case 32: return Assembler::AVX_256bit; 140 case 64: return Assembler::AVX_512bit; 141 142 default: { 143 ShouldNotReachHere(); 144 return Assembler::AVX_NoVec; 145 } 146 } 147 } 148 149 // fast_lock and fast_unlock used by C2 150 151 // Because the transitions from emitted code to the runtime 152 // monitorenter/exit helper stubs are so slow it's critical that 153 // we inline both the stack-locking fast path and the inflated fast path. 154 // 155 // See also: cmpFastLock and cmpFastUnlock. 156 // 157 // What follows is a specialized inline transliteration of the code 158 // in enter() and exit(). If we're concerned about I$ bloat another 159 // option would be to emit TrySlowEnter and TrySlowExit methods 160 // at startup-time. These methods would accept arguments as 161 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 162 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 164 // In practice, however, the # of lock sites is bounded and is usually small. 165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 166 // if the processor uses simple bimodal branch predictors keyed by EIP 167 // Since the helper routines would be called from multiple synchronization 168 // sites. 169 // 170 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 171 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 172 // to those specialized methods. That'd give us a mostly platform-independent 173 // implementation that the JITs could optimize and inline at their pleasure. 174 // Done correctly, the only time we'd need to cross to native could would be 175 // to park() or unpark() threads. 
We'd also need a few more unsafe operators 176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 177 // (b) explicit barriers or fence operations. 178 // 179 // TODO: 180 // 181 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr). 182 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals. 183 // Given TLAB allocation, Self is usually manifested in a register, so passing it into 184 // the lock operators would typically be faster than reifying Self. 185 // 186 // * Ideally I'd define the primitives as: 187 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. 188 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED 189 // Unfortunately ADLC bugs prevent us from expressing the ideal form. 190 // Instead, we're stuck with a rather awkward and brittle register assignments below. 191 // Furthermore the register assignments are overconstrained, possibly resulting in 192 // sub-optimal code near the synchronization site. 193 // 194 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. 195 // Alternately, use a better sp-proximity test. 196 // 197 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 198 // Either one is sufficient to uniquely identify a thread. 199 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 200 // 201 // * Intrinsify notify() and notifyAll() for the common cases where the 202 // object is locked by the calling thread but the waitlist is empty. 203 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 204 // 205 // * use jccb and jmpb instead of jcc and jmp to improve code density. 206 // But beware of excessive branch density on AMD Opterons. 207 // 208 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 209 // or failure of the fast path. If the fast path fails then we pass 210 // control to the slow path, typically in C. In fast_lock and 211 // fast_unlock we often branch to DONE_LABEL, just to find that C2 212 // will emit a conditional branch immediately after the node. 213 // So we have branches to branches and lots of ICC.ZF games. 214 // Instead, it might be better to have C2 pass a "FailureLabel" 215 // into fast_lock and fast_unlock. In the case of success, control 216 // will drop through the node. ICC.ZF is undefined at exit. 
217 // In the case of failure, the node will branch directly to the 218 // FailureLabel 219 220 221 // obj: object to lock 222 // box: on-stack box address (displaced header location) - KILLED 223 // rax,: tmp -- KILLED 224 // scr: tmp -- KILLED 225 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 226 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 227 Metadata* method_data) { 228 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 229 // Ensure the register assignments are disjoint 230 assert(tmpReg == rax, ""); 231 assert(cx1Reg == noreg, ""); 232 assert(cx2Reg == noreg, ""); 233 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 234 235 // Possible cases that we'll encounter in fast_lock 236 // ------------------------------------------------ 237 // * Inflated 238 // -- unlocked 239 // -- Locked 240 // = by self 241 // = by other 242 // * neutral 243 // * stack-locked 244 // -- by self 245 // = sp-proximity test hits 246 // = sp-proximity test generates false-negative 247 // -- by other 248 // 249 250 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 251 252 if (DiagnoseSyncOnValueBasedClasses != 0) { 253 load_klass(tmpReg, objReg, scrReg); 254 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 255 jcc(Assembler::notZero, DONE_LABEL); 256 } 257 258 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 259 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 260 jcc(Assembler::notZero, IsInflated); 261 262 if (LockingMode == LM_MONITOR) { 263 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 264 testptr(objReg, objReg); 265 } else { 266 assert(LockingMode == LM_LEGACY, "must be"); 267 // Attempt stack-locking ... 268 orptr (tmpReg, markWord::unlocked_value); 269 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 270 lock(); 271 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 272 jcc(Assembler::equal, COUNT); // Success 273 274 // Recursive locking. 275 // The object is stack-locked: markword contains stack pointer to BasicLock. 276 // Locked by current thread if difference with current SP is less than one page. 277 subptr(tmpReg, rsp); 278 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 279 andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) ); 280 movptr(Address(boxReg, 0), tmpReg); 281 } 282 jmp(DONE_LABEL); 283 284 bind(IsInflated); 285 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 286 287 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 288 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 289 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 290 291 // It's inflated and we use scrReg for ObjectMonitor* in this section. 292 movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset())); 293 movq(scrReg, tmpReg); 294 xorq(tmpReg, tmpReg); 295 lock(); 296 cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 297 298 // Propagate ICC.ZF from CAS above into DONE_LABEL. 
299 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 300 301 cmpptr(boxReg, rax); // Check if we are already the owner (recursive lock) 302 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 303 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 304 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 305 bind(DONE_LABEL); 306 307 // ZFlag == 1 count in fast path 308 // ZFlag == 0 count in slow path 309 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 310 311 bind(COUNT); 312 if (LockingMode == LM_LEGACY) { 313 // Count monitors in fast path 314 increment(Address(thread, JavaThread::held_monitor_count_offset())); 315 } 316 xorl(tmpReg, tmpReg); // Set ZF == 1 317 318 bind(NO_COUNT); 319 320 // At NO_COUNT the icc ZFlag is set as follows ... 321 // fast_unlock uses the same protocol. 322 // ZFlag == 1 -> Success 323 // ZFlag == 0 -> Failure - force control through the slow path 324 } 325 326 // obj: object to unlock 327 // box: box address (displaced header location), killed. Must be EAX. 328 // tmp: killed, cannot be obj nor box. 329 // 330 // Some commentary on balanced locking: 331 // 332 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 333 // Methods that don't have provably balanced locking are forced to run in the 334 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 335 // The interpreter provides two properties: 336 // I1: At return-time the interpreter automatically and quietly unlocks any 337 // objects acquired the current activation (frame). Recall that the 338 // interpreter maintains an on-stack list of locks currently held by 339 // a frame. 340 // I2: If a method attempts to unlock an object that is not held by the 341 // the frame the interpreter throws IMSX. 342 // 343 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 344 // B() doesn't have provably balanced locking so it runs in the interpreter. 345 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 346 // is still locked by A(). 347 // 348 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 349 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 350 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 351 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 352 // Arguably given that the spec legislates the JNI case as undefined our implementation 353 // could reasonably *avoid* checking owner in fast_unlock(). 354 // In the interest of performance we elide m->Owner==Self check in unlock. 355 // A perfectly viable alternative is to elide the owner check except when 356 // Xcheck:jni is enabled. 
357 358 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 359 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 360 assert(boxReg == rax, ""); 361 assert_different_registers(objReg, boxReg, tmpReg); 362 363 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 364 365 if (LockingMode == LM_LEGACY) { 366 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 367 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 368 } 369 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 370 if (LockingMode != LM_MONITOR) { 371 testptr(tmpReg, markWord::monitor_value); // Inflated? 372 jcc(Assembler::zero, Stacked); 373 } 374 375 // It's inflated. 376 377 // Despite our balanced locking property we still check that m->_owner == Self 378 // as java routines or native JNI code called by this thread might 379 // have released the lock. 380 // 381 // If there's no contention try a 1-0 exit. That is, exit without 382 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 383 // we detect and recover from the race that the 1-0 exit admits. 384 // 385 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 386 // before it STs null into _owner, releasing the lock. Updates 387 // to data protected by the critical section must be visible before 388 // we drop the lock (and thus before any other thread could acquire 389 // the lock and observe the fields protected by the lock). 390 // IA32's memory-model is SPO, so STs are ordered with respect to 391 // each other and there's no need for an explicit barrier (fence). 392 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 393 Label LSuccess, LNotRecursive; 394 395 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 396 jccb(Assembler::equal, LNotRecursive); 397 398 // Recursive inflated unlock 399 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 400 jmpb(LSuccess); 401 402 bind(LNotRecursive); 403 404 // Set owner to null. 405 // Release to satisfy the JMM 406 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 407 // We need a full fence after clearing owner to avoid stranding. 408 // StoreLoad achieves this. 409 membar(StoreLoad); 410 411 // Check if the entry_list is empty. 412 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD); 413 jccb(Assembler::zero, LSuccess); // If so we are done. 414 415 // Check if there is a successor. 416 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 417 jccb(Assembler::notZero, LSuccess); // If so we are done. 418 419 // Save the monitor pointer in the current thread, so we can try to 420 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 
421 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 422 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 423 424 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 425 jmpb (DONE_LABEL); 426 427 bind (LSuccess); 428 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 429 jmpb (DONE_LABEL); 430 431 if (LockingMode == LM_LEGACY) { 432 bind (Stacked); 433 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 434 lock(); 435 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 436 // Intentional fall-thru into DONE_LABEL 437 } 438 439 bind(DONE_LABEL); 440 441 // ZFlag == 1 count in fast path 442 // ZFlag == 0 count in slow path 443 jccb(Assembler::notZero, NO_COUNT); 444 445 bind(COUNT); 446 447 if (LockingMode == LM_LEGACY) { 448 // Count monitors in fast path 449 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 450 } 451 452 xorl(tmpReg, tmpReg); // Set ZF == 1 453 454 bind(NO_COUNT); 455 } 456 457 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 458 Register t, Register thread) { 459 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 460 assert(rax_reg == rax, "Used for CAS"); 461 assert_different_registers(obj, box, rax_reg, t, thread); 462 463 // Handle inflated monitor. 464 Label inflated; 465 // Finish fast lock successfully. ZF value is irrelevant. 466 Label locked; 467 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 468 Label slow_path; 469 470 if (UseObjectMonitorTable) { 471 // Clear cache in case fast locking succeeds or we need to take the slow-path. 472 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 473 } 474 475 if (DiagnoseSyncOnValueBasedClasses != 0) { 476 load_klass(rax_reg, obj, t); 477 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 478 jcc(Assembler::notZero, slow_path); 479 } 480 481 const Register mark = t; 482 483 { // Lightweight Lock 484 485 Label push; 486 487 const Register top = UseObjectMonitorTable ? rax_reg : box; 488 489 // Load the mark. 490 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 491 492 // Prefetch top. 493 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 494 495 // Check for monitor (0b10). 496 testptr(mark, markWord::monitor_value); 497 jcc(Assembler::notZero, inflated); 498 499 // Check if lock-stack is full. 500 cmpl(top, LockStack::end_offset() - 1); 501 jcc(Assembler::greater, slow_path); 502 503 // Check if recursive. 504 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 505 jccb(Assembler::equal, push); 506 507 // Try to lock. Transition lock bits 0b01 => 0b00 508 movptr(rax_reg, mark); 509 orptr(rax_reg, markWord::unlocked_value); 510 andptr(mark, ~(int32_t)markWord::unlocked_value); 511 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 512 jcc(Assembler::notEqual, slow_path); 513 514 if (UseObjectMonitorTable) { 515 // Need to reload top, clobbered by CAS. 516 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 517 } 518 bind(push); 519 // After successful lock, push object on lock-stack. 520 movptr(Address(thread, top), obj); 521 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 522 jmpb(locked); 523 } 524 525 { // Handle inflated monitor. 
526 bind(inflated); 527 528 const Register monitor = t; 529 530 if (!UseObjectMonitorTable) { 531 assert(mark == monitor, "should be the same here"); 532 } else { 533 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 534 // Fetch ObjectMonitor* from the cache or take the slow-path. 535 Label monitor_found; 536 537 // Load cache address 538 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 539 540 const int num_unrolled = 2; 541 for (int i = 0; i < num_unrolled; i++) { 542 cmpptr(obj, Address(t)); 543 jccb(Assembler::equal, monitor_found); 544 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 545 } 546 547 Label loop; 548 549 // Search for obj in cache. 550 bind(loop); 551 552 // Check for match. 553 cmpptr(obj, Address(t)); 554 jccb(Assembler::equal, monitor_found); 555 556 // Search until null encountered, guaranteed _null_sentinel at end. 557 cmpptr(Address(t), 1); 558 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 559 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 560 jmpb(loop); 561 562 // Cache hit. 563 bind(monitor_found); 564 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 565 } 566 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 567 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 568 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 569 570 Label monitor_locked; 571 // Lock the monitor. 572 573 if (UseObjectMonitorTable) { 574 // Cache the monitor for unlock before trashing box. On failure to acquire 575 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 576 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 577 } 578 579 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 580 xorptr(rax_reg, rax_reg); 581 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 582 lock(); cmpxchgptr(box, owner_address); 583 jccb(Assembler::equal, monitor_locked); 584 585 // Check if recursive. 586 cmpptr(box, rax_reg); 587 jccb(Assembler::notEqual, slow_path); 588 589 // Recursive. 590 increment(recursions_address); 591 592 bind(monitor_locked); 593 } 594 595 bind(locked); 596 // Set ZF = 1 597 xorl(rax_reg, rax_reg); 598 599 #ifdef ASSERT 600 // Check that locked label is reached with ZF set. 601 Label zf_correct; 602 Label zf_bad_zero; 603 jcc(Assembler::zero, zf_correct); 604 jmp(zf_bad_zero); 605 #endif 606 607 bind(slow_path); 608 #ifdef ASSERT 609 // Check that slow_path label is reached with ZF not set. 610 jcc(Assembler::notZero, zf_correct); 611 stop("Fast Lock ZF != 0"); 612 bind(zf_bad_zero); 613 stop("Fast Lock ZF != 1"); 614 bind(zf_correct); 615 #endif 616 // C2 uses the value of ZF to determine the continuation. 617 } 618 619 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 620 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 621 assert(reg_rax == rax, "Used for CAS"); 622 assert_different_registers(obj, reg_rax, t); 623 624 // Handle inflated monitor. 625 Label inflated, inflated_check_lock_stack; 626 // Finish fast unlock successfully. MUST jump with ZF == 1 627 Label unlocked, slow_path; 628 629 const Register mark = t; 630 const Register monitor = t; 631 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 632 const Register box = reg_rax; 633 634 Label dummy; 635 C2FastUnlockLightweightStub* stub = nullptr; 636 637 if (!Compile::current()->output()->in_scratch_emit_size()) { 638 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 639 Compile::current()->output()->add_stub(stub); 640 } 641 642 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 643 644 { // Lightweight Unlock 645 646 // Load top. 647 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 648 649 if (!UseObjectMonitorTable) { 650 // Prefetch mark. 651 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 652 } 653 654 // Check if obj is top of lock-stack. 655 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 656 // Top of lock stack was not obj. Must be monitor. 657 jcc(Assembler::notEqual, inflated_check_lock_stack); 658 659 // Pop lock-stack. 660 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 661 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 662 663 // Check if recursive. 664 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 665 jcc(Assembler::equal, unlocked); 666 667 // We elide the monitor check, let the CAS fail instead. 668 669 if (UseObjectMonitorTable) { 670 // Load mark. 671 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 672 } 673 674 // Try to unlock. Transition lock bits 0b00 => 0b01 675 movptr(reg_rax, mark); 676 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 677 orptr(mark, markWord::unlocked_value); 678 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 679 jcc(Assembler::notEqual, push_and_slow_path); 680 jmp(unlocked); 681 } 682 683 684 { // Handle inflated monitor. 685 bind(inflated_check_lock_stack); 686 #ifdef ASSERT 687 Label check_done; 688 subl(top, oopSize); 689 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 690 jcc(Assembler::below, check_done); 691 cmpptr(obj, Address(thread, top)); 692 jccb(Assembler::notEqual, inflated_check_lock_stack); 693 stop("Fast Unlock lock on stack"); 694 bind(check_done); 695 if (UseObjectMonitorTable) { 696 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 697 } 698 testptr(mark, markWord::monitor_value); 699 jccb(Assembler::notZero, inflated); 700 stop("Fast Unlock not monitor"); 701 #endif 702 703 bind(inflated); 704 705 if (!UseObjectMonitorTable) { 706 assert(mark == monitor, "should be the same here"); 707 } else { 708 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 709 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 710 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 711 cmpptr(monitor, alignof(ObjectMonitor*)); 712 jcc(Assembler::below, slow_path); 713 } 714 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 715 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 716 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 717 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 718 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 719 720 Label recursive; 721 722 // Check if recursive. 723 cmpptr(recursions_address, 0); 724 jccb(Assembler::notZero, recursive); 725 726 // Set owner to null. 
727 // Release to satisfy the JMM 728 movptr(owner_address, NULL_WORD); 729 // We need a full fence after clearing owner to avoid stranding. 730 // StoreLoad achieves this. 731 membar(StoreLoad); 732 733 // Check if the entry_list is empty. 734 cmpptr(entry_list_address, NULL_WORD); 735 jccb(Assembler::zero, unlocked); // If so we are done. 736 737 // Check if there is a successor. 738 cmpptr(succ_address, NULL_WORD); 739 jccb(Assembler::notZero, unlocked); // If so we are done. 740 741 // Save the monitor pointer in the current thread, so we can try to 742 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 743 if (!UseObjectMonitorTable) { 744 andptr(monitor, ~(int32_t)markWord::monitor_value); 745 } 746 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 747 748 orl(t, 1); // Fast Unlock ZF = 0 749 jmpb(slow_path); 750 751 // Recursive unlock. 752 bind(recursive); 753 decrement(recursions_address); 754 } 755 756 bind(unlocked); 757 xorl(t, t); // Fast Unlock ZF = 1 758 759 #ifdef ASSERT 760 // Check that unlocked label is reached with ZF set. 761 Label zf_correct; 762 Label zf_bad_zero; 763 jcc(Assembler::zero, zf_correct); 764 jmp(zf_bad_zero); 765 #endif 766 767 bind(slow_path); 768 if (stub != nullptr) { 769 bind(stub->slow_path_continuation()); 770 } 771 #ifdef ASSERT 772 // Check that stub->continuation() label is reached with ZF not set. 773 jcc(Assembler::notZero, zf_correct); 774 stop("Fast Unlock ZF != 0"); 775 bind(zf_bad_zero); 776 stop("Fast Unlock ZF != 1"); 777 bind(zf_correct); 778 #endif 779 // C2 uses the value of ZF to determine the continuation. 780 } 781 782 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 783 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 784 } 785 786 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) { 787 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 788 masm->movptr(dst, rsp); 789 if (framesize > 2 * wordSize) { 790 masm->addptr(dst, framesize - 2 * wordSize); 791 } 792 } 793 794 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 795 if (PreserveFramePointer) { 796 // frame pointer is valid 797 #ifdef ASSERT 798 // Verify frame pointer value in rbp. 
799 reconstruct_frame_pointer_helper(this, rtmp); 800 Label L_success; 801 cmpq(rbp, rtmp); 802 jccb(Assembler::equal, L_success); 803 STOP("frame pointer mismatch"); 804 bind(L_success); 805 #endif // ASSERT 806 } else { 807 reconstruct_frame_pointer_helper(this, rbp); 808 } 809 } 810 811 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) { 812 jint lo = t->_lo; 813 jint hi = t->_hi; 814 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi); 815 if (t == TypeInt::INT) { 816 return; 817 } 818 819 BLOCK_COMMENT("CastII {"); 820 Label fail; 821 Label succeed; 822 if (hi == max_jint) { 823 cmpl(val, lo); 824 jccb(Assembler::greaterEqual, succeed); 825 } else { 826 if (lo != min_jint) { 827 cmpl(val, lo); 828 jccb(Assembler::less, fail); 829 } 830 cmpl(val, hi); 831 jccb(Assembler::lessEqual, succeed); 832 } 833 834 bind(fail); 835 movl(c_rarg0, idx); 836 movl(c_rarg1, val); 837 movl(c_rarg2, lo); 838 movl(c_rarg3, hi); 839 reconstruct_frame_pointer(rscratch1); 840 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range))); 841 hlt(); 842 bind(succeed); 843 BLOCK_COMMENT("} // CastII"); 844 } 845 846 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 847 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 848 } 849 850 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) { 851 jlong lo = t->_lo; 852 jlong hi = t->_hi; 853 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi); 854 if (t == TypeLong::LONG) { 855 return; 856 } 857 858 BLOCK_COMMENT("CastLL {"); 859 Label fail; 860 Label succeed; 861 862 auto cmp_val = [&](jlong bound) { 863 if (is_simm32(bound)) { 864 cmpq(val, checked_cast<int>(bound)); 865 } else { 866 mov64(tmp, bound); 867 cmpq(val, tmp); 868 } 869 }; 870 871 if (hi == max_jlong) { 872 cmp_val(lo); 873 jccb(Assembler::greaterEqual, succeed); 874 } else { 875 if (lo != min_jlong) { 876 cmp_val(lo); 877 jccb(Assembler::less, fail); 878 } 879 cmp_val(hi); 880 jccb(Assembler::lessEqual, succeed); 881 } 882 883 bind(fail); 884 movl(c_rarg0, idx); 885 movq(c_rarg1, val); 886 mov64(c_rarg2, lo); 887 mov64(c_rarg3, hi); 888 reconstruct_frame_pointer(rscratch1); 889 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range))); 890 hlt(); 891 bind(succeed); 892 BLOCK_COMMENT("} // CastLL"); 893 } 894 895 //------------------------------------------------------------------------------------------- 896 // Generic instructions support for use in .ad files C2 code generation 897 898 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 899 if (dst != src) { 900 movdqu(dst, src); 901 } 902 if (opcode == Op_AbsVD) { 903 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 904 } else { 905 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 906 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 907 } 908 } 909 910 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 911 if (opcode == Op_AbsVD) { 912 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 913 } else { 914 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 915 vxorpd(dst, src, 
ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 916 } 917 } 918 919 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 920 if (dst != src) { 921 movdqu(dst, src); 922 } 923 if (opcode == Op_AbsVF) { 924 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 925 } else { 926 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 927 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 928 } 929 } 930 931 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 932 if (opcode == Op_AbsVF) { 933 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 934 } else { 935 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 936 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 937 } 938 } 939 940 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 941 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 942 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 943 944 if (opcode == Op_MinV) { 945 if (elem_bt == T_BYTE) { 946 pminsb(dst, src); 947 } else if (elem_bt == T_SHORT) { 948 pminsw(dst, src); 949 } else if (elem_bt == T_INT) { 950 pminsd(dst, src); 951 } else { 952 assert(elem_bt == T_LONG, "required"); 953 assert(tmp == xmm0, "required"); 954 assert_different_registers(dst, src, tmp); 955 movdqu(xmm0, dst); 956 pcmpgtq(xmm0, src); 957 blendvpd(dst, src); // xmm0 as mask 958 } 959 } else { // opcode == Op_MaxV 960 if (elem_bt == T_BYTE) { 961 pmaxsb(dst, src); 962 } else if (elem_bt == T_SHORT) { 963 pmaxsw(dst, src); 964 } else if (elem_bt == T_INT) { 965 pmaxsd(dst, src); 966 } else { 967 assert(elem_bt == T_LONG, "required"); 968 assert(tmp == xmm0, "required"); 969 assert_different_registers(dst, src, tmp); 970 movdqu(xmm0, src); 971 pcmpgtq(xmm0, dst); 972 blendvpd(dst, src); // xmm0 as mask 973 } 974 } 975 } 976 977 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 978 XMMRegister src1, Address src2, int vlen_enc) { 979 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 980 if (opcode == Op_UMinV) { 981 switch(elem_bt) { 982 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 983 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 984 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 985 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 986 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 987 } 988 } else { 989 assert(opcode == Op_UMaxV, "required"); 990 switch(elem_bt) { 991 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 992 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 993 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 994 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 995 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 996 } 997 } 998 } 999 1000 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 1001 // For optimality, leverage a full vector width of 512 bits 1002 // for operations over smaller vector sizes on AVX512 targets. 
1003 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 1004 if (opcode == Op_UMaxV) { 1005 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 1006 } else { 1007 assert(opcode == Op_UMinV, "required"); 1008 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 1009 } 1010 } else { 1011 // T1 = -1 1012 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 1013 // T1 = -1 << 63 1014 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 1015 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 1016 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 1017 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 1018 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 1019 // Mask = T2 > T1 1020 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 1021 if (opcode == Op_UMaxV) { 1022 // Res = Mask ? Src2 : Src1 1023 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 1024 } else { 1025 // Res = Mask ? Src1 : Src2 1026 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 1027 } 1028 } 1029 } 1030 1031 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 1032 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1033 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 1034 if (opcode == Op_UMinV) { 1035 switch(elem_bt) { 1036 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 1037 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 1038 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 1039 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 1040 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1041 } 1042 } else { 1043 assert(opcode == Op_UMaxV, "required"); 1044 switch(elem_bt) { 1045 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 1046 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 1047 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 1048 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 1049 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1050 } 1051 } 1052 } 1053 1054 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1055 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1056 int vlen_enc) { 1057 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1058 1059 if (opcode == Op_MinV) { 1060 if (elem_bt == T_BYTE) { 1061 vpminsb(dst, src1, src2, vlen_enc); 1062 } else if (elem_bt == T_SHORT) { 1063 vpminsw(dst, src1, src2, vlen_enc); 1064 } else if (elem_bt == T_INT) { 1065 vpminsd(dst, src1, src2, vlen_enc); 1066 } else { 1067 assert(elem_bt == T_LONG, "required"); 1068 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1069 vpminsq(dst, src1, src2, vlen_enc); 1070 } else { 1071 assert_different_registers(dst, src1, src2); 1072 vpcmpgtq(dst, src1, src2, vlen_enc); 1073 vblendvpd(dst, src1, src2, dst, vlen_enc); 1074 } 1075 } 1076 } else { // opcode == Op_MaxV 1077 if (elem_bt == T_BYTE) { 1078 vpmaxsb(dst, src1, src2, vlen_enc); 1079 } else if (elem_bt == T_SHORT) { 1080 vpmaxsw(dst, src1, src2, vlen_enc); 1081 } else if (elem_bt == T_INT) { 1082 vpmaxsd(dst, src1, src2, vlen_enc); 1083 } else { 1084 assert(elem_bt == T_LONG, "required"); 1085 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1086 vpmaxsq(dst, src1, src2, vlen_enc); 1087 } else { 1088 assert_different_registers(dst, src1, src2); 1089 vpcmpgtq(dst, src1, src2, vlen_enc); 1090 vblendvpd(dst, src2, src1, dst, vlen_enc); 1091 } 1092 } 1093 } 1094 } 1095 1096 // Float/Double min max 1097 1098 void C2_MacroAssembler::vminmax_fp(int opcode, 
BasicType elem_bt, 1099 XMMRegister dst, XMMRegister a, XMMRegister b, 1100 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1101 int vlen_enc) { 1102 assert(UseAVX > 0, "required"); 1103 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1104 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1105 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1106 assert_different_registers(a, tmp, atmp, btmp); 1107 assert_different_registers(b, tmp, atmp, btmp); 1108 1109 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1110 bool is_double_word = is_double_word_type(elem_bt); 1111 1112 /* Note on 'non-obvious' assembly sequence: 1113 * 1114 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1115 * and Java on how they handle floats: 1116 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1117 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1118 * 1119 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1120 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1121 * (only useful when signs differ, noop otherwise) 1122 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1123 1124 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1125 * btmp = (b < +0.0) ? a : b 1126 * atmp = (b < +0.0) ? b : a 1127 * Tmp = Max_Float(atmp , btmp) 1128 * Res = (atmp == NaN) ? atmp : Tmp 1129 */ 1130 1131 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1132 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1133 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1134 XMMRegister mask; 1135 1136 if (!is_double_word && is_min) { 1137 mask = a; 1138 vblend = &MacroAssembler::vblendvps; 1139 vmaxmin = &MacroAssembler::vminps; 1140 vcmp = &MacroAssembler::vcmpps; 1141 } else if (!is_double_word && !is_min) { 1142 mask = b; 1143 vblend = &MacroAssembler::vblendvps; 1144 vmaxmin = &MacroAssembler::vmaxps; 1145 vcmp = &MacroAssembler::vcmpps; 1146 } else if (is_double_word && is_min) { 1147 mask = a; 1148 vblend = &MacroAssembler::vblendvpd; 1149 vmaxmin = &MacroAssembler::vminpd; 1150 vcmp = &MacroAssembler::vcmppd; 1151 } else { 1152 assert(is_double_word && !is_min, "sanity"); 1153 mask = b; 1154 vblend = &MacroAssembler::vblendvpd; 1155 vmaxmin = &MacroAssembler::vmaxpd; 1156 vcmp = &MacroAssembler::vcmppd; 1157 } 1158 1159 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1160 XMMRegister maxmin, scratch; 1161 if (dst == btmp) { 1162 maxmin = btmp; 1163 scratch = tmp; 1164 } else { 1165 maxmin = tmp; 1166 scratch = btmp; 1167 } 1168 1169 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1170 if (precompute_mask && !is_double_word) { 1171 vpsrad(tmp, mask, 32, vlen_enc); 1172 mask = tmp; 1173 } else if (precompute_mask && is_double_word) { 1174 vpxor(tmp, tmp, tmp, vlen_enc); 1175 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1176 mask = tmp; 1177 } 1178 1179 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1180 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1181 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1182 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1183 
(this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1184 } 1185 1186 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1187 XMMRegister dst, XMMRegister a, XMMRegister b, 1188 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1189 int vlen_enc) { 1190 assert(UseAVX > 2, "required"); 1191 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1192 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1193 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1194 assert_different_registers(dst, a, atmp, btmp); 1195 assert_different_registers(dst, b, atmp, btmp); 1196 1197 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1198 bool is_double_word = is_double_word_type(elem_bt); 1199 bool merge = true; 1200 1201 if (!is_double_word && is_min) { 1202 evpmovd2m(ktmp, a, vlen_enc); 1203 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1204 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1205 vminps(dst, atmp, btmp, vlen_enc); 1206 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1207 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1208 } else if (!is_double_word && !is_min) { 1209 evpmovd2m(ktmp, b, vlen_enc); 1210 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1211 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1212 vmaxps(dst, atmp, btmp, vlen_enc); 1213 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1214 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1215 } else if (is_double_word && is_min) { 1216 evpmovq2m(ktmp, a, vlen_enc); 1217 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1218 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1219 vminpd(dst, atmp, btmp, vlen_enc); 1220 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1221 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1222 } else { 1223 assert(is_double_word && !is_min, "sanity"); 1224 evpmovq2m(ktmp, b, vlen_enc); 1225 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1226 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1227 vmaxpd(dst, atmp, btmp, vlen_enc); 1228 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1229 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1230 } 1231 } 1232 1233 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask, 1234 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1235 assert(opc == Op_MinV || opc == Op_MinReductionV || 1236 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity"); 1237 1238 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? 
AVX10_MINMAX_MIN_COMPARE_SIGN 1239 : AVX10_MINMAX_MAX_COMPARE_SIGN; 1240 if (elem_bt == T_FLOAT) { 1241 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc); 1242 } else { 1243 assert(elem_bt == T_DOUBLE, ""); 1244 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc); 1245 } 1246 } 1247 1248 // Float/Double signum 1249 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1250 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1251 1252 Label DONE_LABEL; 1253 1254 if (opcode == Op_SignumF) { 1255 ucomiss(dst, zero); 1256 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1257 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1258 movflt(dst, one); 1259 jcc(Assembler::above, DONE_LABEL); 1260 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1261 } else if (opcode == Op_SignumD) { 1262 ucomisd(dst, zero); 1263 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1264 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1265 movdbl(dst, one); 1266 jcc(Assembler::above, DONE_LABEL); 1267 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1268 } 1269 1270 bind(DONE_LABEL); 1271 } 1272 1273 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1274 if (sign) { 1275 pmovsxbw(dst, src); 1276 } else { 1277 pmovzxbw(dst, src); 1278 } 1279 } 1280 1281 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1282 if (sign) { 1283 vpmovsxbw(dst, src, vector_len); 1284 } else { 1285 vpmovzxbw(dst, src, vector_len); 1286 } 1287 } 1288 1289 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1290 if (sign) { 1291 vpmovsxbd(dst, src, vector_len); 1292 } else { 1293 vpmovzxbd(dst, src, vector_len); 1294 } 1295 } 1296 1297 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1298 if (sign) { 1299 vpmovsxwd(dst, src, vector_len); 1300 } else { 1301 vpmovzxwd(dst, src, vector_len); 1302 } 1303 } 1304 1305 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1306 int shift, int vector_len) { 1307 if (opcode == Op_RotateLeftV) { 1308 if (etype == T_INT) { 1309 evprold(dst, src, shift, vector_len); 1310 } else { 1311 assert(etype == T_LONG, "expected type T_LONG"); 1312 evprolq(dst, src, shift, vector_len); 1313 } 1314 } else { 1315 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1316 if (etype == T_INT) { 1317 evprord(dst, src, shift, vector_len); 1318 } else { 1319 assert(etype == T_LONG, "expected type T_LONG"); 1320 evprorq(dst, src, shift, vector_len); 1321 } 1322 } 1323 } 1324 1325 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1326 XMMRegister shift, int vector_len) { 1327 if (opcode == Op_RotateLeftV) { 1328 if (etype == T_INT) { 1329 evprolvd(dst, src, shift, vector_len); 1330 } else { 1331 assert(etype == T_LONG, "expected type T_LONG"); 1332 evprolvq(dst, src, shift, vector_len); 1333 } 1334 } else { 1335 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1336 if (etype == T_INT) { 1337 evprorvd(dst, src, shift, vector_len); 1338 } else { 1339 assert(etype == T_LONG, "expected type 
T_LONG"); 1340 evprorvq(dst, src, shift, vector_len); 1341 } 1342 } 1343 } 1344 1345 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1346 if (opcode == Op_RShiftVI) { 1347 psrad(dst, shift); 1348 } else if (opcode == Op_LShiftVI) { 1349 pslld(dst, shift); 1350 } else { 1351 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1352 psrld(dst, shift); 1353 } 1354 } 1355 1356 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1357 switch (opcode) { 1358 case Op_RShiftVI: psrad(dst, shift); break; 1359 case Op_LShiftVI: pslld(dst, shift); break; 1360 case Op_URShiftVI: psrld(dst, shift); break; 1361 1362 default: assert(false, "%s", NodeClassNames[opcode]); 1363 } 1364 } 1365 1366 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1367 if (opcode == Op_RShiftVI) { 1368 vpsrad(dst, nds, shift, vector_len); 1369 } else if (opcode == Op_LShiftVI) { 1370 vpslld(dst, nds, shift, vector_len); 1371 } else { 1372 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1373 vpsrld(dst, nds, shift, vector_len); 1374 } 1375 } 1376 1377 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1378 switch (opcode) { 1379 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1380 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1381 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1382 1383 default: assert(false, "%s", NodeClassNames[opcode]); 1384 } 1385 } 1386 1387 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1388 switch (opcode) { 1389 case Op_RShiftVB: // fall-through 1390 case Op_RShiftVS: psraw(dst, shift); break; 1391 1392 case Op_LShiftVB: // fall-through 1393 case Op_LShiftVS: psllw(dst, shift); break; 1394 1395 case Op_URShiftVS: // fall-through 1396 case Op_URShiftVB: psrlw(dst, shift); break; 1397 1398 default: assert(false, "%s", NodeClassNames[opcode]); 1399 } 1400 } 1401 1402 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1403 switch (opcode) { 1404 case Op_RShiftVB: // fall-through 1405 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1406 1407 case Op_LShiftVB: // fall-through 1408 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1409 1410 case Op_URShiftVS: // fall-through 1411 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1412 1413 default: assert(false, "%s", NodeClassNames[opcode]); 1414 } 1415 } 1416 1417 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1418 switch (opcode) { 1419 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1420 case Op_LShiftVL: psllq(dst, shift); break; 1421 case Op_URShiftVL: psrlq(dst, shift); break; 1422 1423 default: assert(false, "%s", NodeClassNames[opcode]); 1424 } 1425 } 1426 1427 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1428 if (opcode == Op_RShiftVL) { 1429 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1430 } else if (opcode == Op_LShiftVL) { 1431 psllq(dst, shift); 1432 } else { 1433 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1434 psrlq(dst, shift); 1435 } 1436 } 1437 1438 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1439 switch (opcode) { 1440 case 
Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1441 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1442 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1443 1444 default: assert(false, "%s", NodeClassNames[opcode]); 1445 } 1446 } 1447 1448 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1449 if (opcode == Op_RShiftVL) { 1450 evpsraq(dst, nds, shift, vector_len); 1451 } else if (opcode == Op_LShiftVL) { 1452 vpsllq(dst, nds, shift, vector_len); 1453 } else { 1454 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1455 vpsrlq(dst, nds, shift, vector_len); 1456 } 1457 } 1458 1459 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1460 switch (opcode) { 1461 case Op_RShiftVB: // fall-through 1462 case Op_RShiftVS: // fall-through 1463 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1464 1465 case Op_LShiftVB: // fall-through 1466 case Op_LShiftVS: // fall-through 1467 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1468 1469 case Op_URShiftVB: // fall-through 1470 case Op_URShiftVS: // fall-through 1471 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1472 1473 default: assert(false, "%s", NodeClassNames[opcode]); 1474 } 1475 } 1476 1477 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1478 switch (opcode) { 1479 case Op_RShiftVB: // fall-through 1480 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1481 1482 case Op_LShiftVB: // fall-through 1483 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1484 1485 case Op_URShiftVB: // fall-through 1486 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1487 1488 default: assert(false, "%s", NodeClassNames[opcode]); 1489 } 1490 } 1491 1492 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1493 assert(UseAVX >= 2, "required"); 1494 switch (opcode) { 1495 case Op_RShiftVL: { 1496 if (UseAVX > 2) { 1497 assert(tmp == xnoreg, "not used"); 1498 if (!VM_Version::supports_avx512vl()) { 1499 vlen_enc = Assembler::AVX_512bit; 1500 } 1501 evpsravq(dst, src, shift, vlen_enc); 1502 } else { 1503 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1504 vpsrlvq(dst, src, shift, vlen_enc); 1505 vpsrlvq(tmp, tmp, shift, vlen_enc); 1506 vpxor(dst, dst, tmp, vlen_enc); 1507 vpsubq(dst, dst, tmp, vlen_enc); 1508 } 1509 break; 1510 } 1511 case Op_LShiftVL: { 1512 assert(tmp == xnoreg, "not used"); 1513 vpsllvq(dst, src, shift, vlen_enc); 1514 break; 1515 } 1516 case Op_URShiftVL: { 1517 assert(tmp == xnoreg, "not used"); 1518 vpsrlvq(dst, src, shift, vlen_enc); 1519 break; 1520 } 1521 default: assert(false, "%s", NodeClassNames[opcode]); 1522 } 1523 } 1524 1525 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1526 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1527 assert(opcode == Op_LShiftVB || 1528 opcode == Op_RShiftVB || 1529 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1530 bool sign = (opcode != Op_URShiftVB); 1531 assert(vector_len == 0, "required"); 1532 vextendbd(sign, dst, src, 1); 1533 vpmovzxbd(vtmp, shift, 1); 1534 varshiftd(opcode, dst, dst, vtmp, 1); 1535 vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1536 vextracti128_high(vtmp, dst); 1537 vpackusdw(dst, dst, vtmp, 0); 1538 } 1539 1540 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1541 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1542 assert(opcode == Op_LShiftVB || 1543 opcode == Op_RShiftVB || 1544 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1545 bool sign = (opcode != Op_URShiftVB); 1546 int ext_vector_len = vector_len + 1; 1547 vextendbw(sign, dst, src, ext_vector_len); 1548 vpmovzxbw(vtmp, shift, ext_vector_len); 1549 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1550 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1551 if (vector_len == 0) { 1552 vextracti128_high(vtmp, dst); 1553 vpackuswb(dst, dst, vtmp, vector_len); 1554 } else { 1555 vextracti64x4_high(vtmp, dst); 1556 vpackuswb(dst, dst, vtmp, vector_len); 1557 vpermq(dst, dst, 0xD8, vector_len); 1558 } 1559 } 1560 1561 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1562 switch(typ) { 1563 case T_BYTE: 1564 pinsrb(dst, val, idx); 1565 break; 1566 case T_SHORT: 1567 pinsrw(dst, val, idx); 1568 break; 1569 case T_INT: 1570 pinsrd(dst, val, idx); 1571 break; 1572 case T_LONG: 1573 pinsrq(dst, val, idx); 1574 break; 1575 default: 1576 assert(false,"Should not reach here."); 1577 break; 1578 } 1579 } 1580 1581 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1582 switch(typ) { 1583 case T_BYTE: 1584 vpinsrb(dst, src, val, idx); 1585 break; 1586 case T_SHORT: 1587 vpinsrw(dst, src, val, idx); 1588 break; 1589 case T_INT: 1590 vpinsrd(dst, src, val, idx); 1591 break; 1592 case T_LONG: 1593 vpinsrq(dst, src, val, idx); 1594 break; 1595 default: 1596 assert(false,"Should not reach here."); 1597 break; 1598 } 1599 } 1600 1601 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1602 XMMRegister dst, Register base, 1603 Register idx_base, 1604 Register offset, Register mask, 1605 Register mask_idx, Register rtmp, 1606 int vlen_enc) { 1607 vpxor(dst, dst, dst, vlen_enc); 1608 if (elem_bt == T_SHORT) { 1609 for (int i = 0; i < 4; i++) { 1610 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1611 Label skip_load; 1612 btq(mask, mask_idx); 1613 jccb(Assembler::carryClear, skip_load); 1614 movl(rtmp, Address(idx_base, i * 4)); 1615 if (offset != noreg) { 1616 addl(rtmp, offset); 1617 } 1618 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1619 bind(skip_load); 1620 incq(mask_idx); 1621 } 1622 } else { 1623 assert(elem_bt == T_BYTE, ""); 1624 for (int i = 0; i < 8; i++) { 1625 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1626 Label skip_load; 1627 btq(mask, mask_idx); 1628 jccb(Assembler::carryClear, skip_load); 1629 movl(rtmp, Address(idx_base, i * 4)); 1630 if (offset != noreg) { 1631 addl(rtmp, offset); 1632 } 1633 pinsrb(dst, Address(base, rtmp), i); 1634 bind(skip_load); 1635 incq(mask_idx); 1636 } 1637 } 1638 } 1639 1640 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1641 Register base, Register idx_base, 1642 Register offset, Register rtmp, 1643 int vlen_enc) { 1644 vpxor(dst, dst, dst, vlen_enc); 1645 if (elem_bt == T_SHORT) { 1646 for (int i = 0; i < 4; i++) { 1647 // dst[i] = src[offset + idx_base[i]] 1648 movl(rtmp, Address(idx_base, i * 4)); 1649 if (offset != noreg) { 1650 addl(rtmp, offset); 1651 } 1652 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1653 } 1654 } else { 1655 assert(elem_bt == T_BYTE, ""); 1656 for (int i = 0; i < 8; i++) { 1657 // dst[i] = src[offset + idx_base[i]] 1658 movl(rtmp, Address(idx_base, i * 4)); 1659 if (offset != noreg) { 1660 addl(rtmp, offset); 1661 } 1662 pinsrb(dst, Address(base, rtmp), i); 1663 } 1664 } 1665 } 1666 1667 /* 1668 * Gather using hybrid algorithm, first partially unroll scalar loop 1669 * to accumulate values from gather indices into a quad-word(64bit) slice. 1670 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1671 * permutation to place the slice into appropriate vector lane 1672 * locations in destination vector. Following pseudo code describes the 1673 * algorithm in detail: 1674 * 1675 * DST_VEC = ZERO_VEC 1676 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1677 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1678 * FOREACH_ITER: 1679 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1680 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1681 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1682 * PERM_INDEX = PERM_INDEX - TWO_VEC 1683 * 1684 * With each iteration, doubleword permute indices (0,1) corresponding 1685 * to gathered quadword gets right shifted by two lane positions. 1686 * 1687 */ 1688 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1689 Register base, Register idx_base, 1690 Register offset, Register mask, 1691 XMMRegister xtmp1, XMMRegister xtmp2, 1692 XMMRegister temp_dst, Register rtmp, 1693 Register mask_idx, Register length, 1694 int vector_len, int vlen_enc) { 1695 Label GATHER8_LOOP; 1696 assert(is_subword_type(elem_ty), ""); 1697 movl(length, vector_len); 1698 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1699 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1700 vallones(xtmp2, vlen_enc); 1701 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1702 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1703 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1704 1705 bind(GATHER8_LOOP); 1706 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1707 if (mask == noreg) { 1708 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1709 } else { 1710 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1711 } 1712 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1713 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1714 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1715 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1716 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1717 vpor(dst, dst, temp_dst, vlen_enc); 1718 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1719 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1720 jcc(Assembler::notEqual, GATHER8_LOOP); 1721 } 1722 1723 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1724 switch(typ) { 1725 case T_INT: 1726 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1727 break; 1728 case T_FLOAT: 1729 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1730 break; 1731 case T_LONG: 1732 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1733 break; 1734 case T_DOUBLE: 1735 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1736 break; 1737 default: 1738 assert(false,"Should not reach here."); 1739 break; 1740 } 1741 } 1742 1743 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1744 switch(typ) { 1745 case T_INT: 1746 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1747 break; 1748 case T_FLOAT: 1749 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1750 break; 1751 case T_LONG: 1752 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1753 break; 1754 case T_DOUBLE: 1755 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1756 break; 1757 default: 1758 assert(false,"Should not reach here."); 1759 break; 1760 } 1761 } 1762 1763 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1764 switch(typ) { 1765 case T_INT: 1766 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1767 break; 1768 case T_FLOAT: 1769 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1770 break; 1771 case T_LONG: 1772 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1773 break; 1774 case T_DOUBLE: 1775 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1776 break; 1777 default: 1778 assert(false,"Should not reach here."); 1779 break; 1780 } 1781 } 1782 1783 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1784 if (vlen_in_bytes <= 16) { 1785 pxor (dst, dst); 1786 psubb(dst, src); 1787 switch (elem_bt) { 1788 case T_BYTE: /* nothing to do */ break; 1789 case T_SHORT: pmovsxbw(dst, dst); break; 1790 case T_INT: pmovsxbd(dst, dst); break; 1791 case T_FLOAT: pmovsxbd(dst, dst); break; 1792 case T_LONG: pmovsxbq(dst, dst); break; 1793 case T_DOUBLE: pmovsxbq(dst, dst); break; 1794 1795 default: assert(false, "%s", type2name(elem_bt)); 1796 } 1797 } else { 1798 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1799 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1800 1801 vpxor (dst, dst, dst, vlen_enc); 1802 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1803 1804 switch (elem_bt) { 1805 case T_BYTE: /* nothing to do */ break; 1806 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1807 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1808 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1809 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1810 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1811 1812 default: assert(false, "%s", type2name(elem_bt)); 1813 } 1814 } 1815 } 1816 1817 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1818 if (novlbwdq) { 1819 vpmovsxbd(xtmp, src, vlen_enc); 1820 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1821 Assembler::eq, true, vlen_enc, noreg); 1822 } else { 1823 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1824 vpsubb(xtmp, xtmp, src, vlen_enc); 1825 evpmovb2m(dst, xtmp, vlen_enc); 1826 } 1827 } 1828 1829 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1830 if (is_integral_type(bt)) { 1831 switch (vlen_in_bytes) { 1832 case 4: movdl(dst, src); break; 1833 case 8: movq(dst, src); break; 1834 case 16: movdqu(dst, src); break; 1835 case 32: vmovdqu(dst, src); break; 1836 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1837 default: ShouldNotReachHere(); 1838 } 1839 } else { 1840 switch (vlen_in_bytes) { 1841 case 4: movflt(dst, src); break; 1842 case 8: movdbl(dst, src); break; 1843 case 16: movups(dst, src); break; 1844 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1845 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1846 default: ShouldNotReachHere(); 1847 } 1848 } 1849 } 1850 1851 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1852 assert(rscratch != noreg || always_reachable(src), "missing"); 1853 1854 if (reachable(src)) { 1855 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1856 } else { 1857 lea(rscratch, src); 1858 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1859 } 1860 } 1861 1862 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1863 int vlen_enc = vector_length_encoding(vlen); 1864 if (VM_Version::supports_avx()) { 1865 if (bt == T_LONG) { 1866 if (VM_Version::supports_avx2()) { 1867 vpbroadcastq(dst, src, vlen_enc); 1868 } else { 1869 vmovddup(dst, src, vlen_enc); 1870 } 1871 } else if (bt == T_DOUBLE) { 1872 if (vlen_enc != Assembler::AVX_128bit) { 1873 vbroadcastsd(dst, src, vlen_enc, noreg); 1874 } else { 1875 vmovddup(dst, src, vlen_enc); 1876 } 1877 } else { 1878 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1879 vpbroadcastd(dst, src, vlen_enc); 1880 } else { 1881 vbroadcastss(dst, src, vlen_enc); 1882 } 1883 } 1884 } else if (VM_Version::supports_sse3()) { 1885 movddup(dst, src); 1886 } else { 1887 load_vector(bt, dst, src, vlen); 1888 } 1889 } 1890 1891 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1892 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1893 int offset = exact_log2(type2aelembytes(bt)) << 6; 1894 if (is_floating_point_type(bt)) { 1895 offset += 128; 1896 } 1897 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1898 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1899 } 1900 1901 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
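//
// The reduce* helpers below all follow the same lane-halving pattern: the upper
// half of the vector is extracted, combined element-wise with the lower half,
// and the process repeats on ever narrower vectors until a single element is
// left, which is then combined with the incoming scalar input (src1). As a
// rough, illustrative sketch only (not emitted code), an 8-lane int ADD
// reduction is equivalent to:
//
//   int acc[8] = <lanes of src2>;
//   for (int width = 8; width > 1; width /= 2) {
//     for (int i = 0; i < width / 2; i++) {
//       acc[i] += acc[i + width / 2];   // fold upper half into lower half
//     }
//   }
//   dst = src1 + acc[0];                // fold in the scalar input last
//
// The AddReductionVI paths use phaddd/vphaddd to add adjacent lane pairs
// instead of an explicit shuffle, but the overall shape is the same.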
1902 1903 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1904 int vector_len = Assembler::AVX_128bit; 1905 1906 switch (opcode) { 1907 case Op_AndReductionV: pand(dst, src); break; 1908 case Op_OrReductionV: por (dst, src); break; 1909 case Op_XorReductionV: pxor(dst, src); break; 1910 case Op_MinReductionV: 1911 switch (typ) { 1912 case T_BYTE: pminsb(dst, src); break; 1913 case T_SHORT: pminsw(dst, src); break; 1914 case T_INT: pminsd(dst, src); break; 1915 case T_LONG: assert(UseAVX > 2, "required"); 1916 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1917 default: assert(false, "wrong type"); 1918 } 1919 break; 1920 case Op_MaxReductionV: 1921 switch (typ) { 1922 case T_BYTE: pmaxsb(dst, src); break; 1923 case T_SHORT: pmaxsw(dst, src); break; 1924 case T_INT: pmaxsd(dst, src); break; 1925 case T_LONG: assert(UseAVX > 2, "required"); 1926 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1927 default: assert(false, "wrong type"); 1928 } 1929 break; 1930 case Op_AddReductionVF: addss(dst, src); break; 1931 case Op_AddReductionVD: addsd(dst, src); break; 1932 case Op_AddReductionVI: 1933 switch (typ) { 1934 case T_BYTE: paddb(dst, src); break; 1935 case T_SHORT: paddw(dst, src); break; 1936 case T_INT: paddd(dst, src); break; 1937 default: assert(false, "wrong type"); 1938 } 1939 break; 1940 case Op_AddReductionVL: paddq(dst, src); break; 1941 case Op_MulReductionVF: mulss(dst, src); break; 1942 case Op_MulReductionVD: mulsd(dst, src); break; 1943 case Op_MulReductionVI: 1944 switch (typ) { 1945 case T_SHORT: pmullw(dst, src); break; 1946 case T_INT: pmulld(dst, src); break; 1947 default: assert(false, "wrong type"); 1948 } 1949 break; 1950 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1951 evpmullq(dst, dst, src, vector_len); break; 1952 default: assert(false, "wrong opcode"); 1953 } 1954 } 1955 1956 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1957 switch (opcode) { 1958 case Op_AddReductionVF: addps(dst, src); break; 1959 case Op_AddReductionVD: addpd(dst, src); break; 1960 case Op_MulReductionVF: mulps(dst, src); break; 1961 case Op_MulReductionVD: mulpd(dst, src); break; 1962 default: assert(false, "%s", NodeClassNames[opcode]); 1963 } 1964 } 1965 1966 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1967 int vector_len = Assembler::AVX_256bit; 1968 1969 switch (opcode) { 1970 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1971 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1972 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1973 case Op_MinReductionV: 1974 switch (typ) { 1975 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1976 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1977 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1978 case T_LONG: assert(UseAVX > 2, "required"); 1979 vpminsq(dst, src1, src2, vector_len); break; 1980 default: assert(false, "wrong type"); 1981 } 1982 break; 1983 case Op_MaxReductionV: 1984 switch (typ) { 1985 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1986 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1987 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1988 case T_LONG: assert(UseAVX > 2, "required"); 1989 vpmaxsq(dst, src1, src2, vector_len); break; 1990 default: assert(false, "wrong type"); 1991 } 
1992 break; 1993 case Op_AddReductionVI: 1994 switch (typ) { 1995 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1996 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1997 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1998 default: assert(false, "wrong type"); 1999 } 2000 break; 2001 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2002 case Op_MulReductionVI: 2003 switch (typ) { 2004 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2005 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2006 default: assert(false, "wrong type"); 2007 } 2008 break; 2009 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2010 default: assert(false, "wrong opcode"); 2011 } 2012 } 2013 2014 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2015 int vector_len = Assembler::AVX_256bit; 2016 2017 switch (opcode) { 2018 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 2019 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 2020 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 2021 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 2022 default: assert(false, "%s", NodeClassNames[opcode]); 2023 } 2024 } 2025 2026 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2027 XMMRegister dst, XMMRegister src, 2028 XMMRegister vtmp1, XMMRegister vtmp2) { 2029 switch (opcode) { 2030 case Op_AddReductionVF: 2031 case Op_MulReductionVF: 2032 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2033 break; 2034 2035 case Op_AddReductionVD: 2036 case Op_MulReductionVD: 2037 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2038 break; 2039 2040 default: assert(false, "wrong opcode"); 2041 } 2042 } 2043 2044 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2045 XMMRegister dst, XMMRegister src, 2046 XMMRegister vtmp1, XMMRegister vtmp2) { 2047 switch (opcode) { 2048 case Op_AddReductionVF: 2049 case Op_MulReductionVF: 2050 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2051 break; 2052 2053 case Op_AddReductionVD: 2054 case Op_MulReductionVD: 2055 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2056 break; 2057 2058 default: assert(false, "%s", NodeClassNames[opcode]); 2059 } 2060 } 2061 2062 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2063 Register dst, Register src1, XMMRegister src2, 2064 XMMRegister vtmp1, XMMRegister vtmp2) { 2065 switch (vlen) { 2066 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2067 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2068 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2069 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2070 2071 default: assert(false, "wrong vector length"); 2072 } 2073 } 2074 2075 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2076 Register dst, Register src1, XMMRegister src2, 2077 XMMRegister vtmp1, XMMRegister vtmp2) { 2078 switch (vlen) { 2079 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2080 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2081 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2082 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2083 2084 default: assert(false, "wrong vector length"); 2085 } 2086 } 2087 2088 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2089 Register dst, Register src1, XMMRegister src2, 
2090 XMMRegister vtmp1, XMMRegister vtmp2) { 2091 switch (vlen) { 2092 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2093 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2094 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2095 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2096 2097 default: assert(false, "wrong vector length"); 2098 } 2099 } 2100 2101 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2102 Register dst, Register src1, XMMRegister src2, 2103 XMMRegister vtmp1, XMMRegister vtmp2) { 2104 switch (vlen) { 2105 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2106 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2107 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2108 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2109 2110 default: assert(false, "wrong vector length"); 2111 } 2112 } 2113 2114 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2115 Register dst, Register src1, XMMRegister src2, 2116 XMMRegister vtmp1, XMMRegister vtmp2) { 2117 switch (vlen) { 2118 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2119 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2120 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2121 2122 default: assert(false, "wrong vector length"); 2123 } 2124 } 2125 2126 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2127 switch (vlen) { 2128 case 2: 2129 assert(vtmp2 == xnoreg, ""); 2130 reduce2F(opcode, dst, src, vtmp1); 2131 break; 2132 case 4: 2133 assert(vtmp2 == xnoreg, ""); 2134 reduce4F(opcode, dst, src, vtmp1); 2135 break; 2136 case 8: 2137 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2138 break; 2139 case 16: 2140 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2141 break; 2142 default: assert(false, "wrong vector length"); 2143 } 2144 } 2145 2146 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2147 switch (vlen) { 2148 case 2: 2149 assert(vtmp2 == xnoreg, ""); 2150 reduce2D(opcode, dst, src, vtmp1); 2151 break; 2152 case 4: 2153 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2154 break; 2155 case 8: 2156 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2157 break; 2158 default: assert(false, "wrong vector length"); 2159 } 2160 } 2161 2162 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2163 switch (vlen) { 2164 case 2: 2165 assert(vtmp1 == xnoreg, ""); 2166 assert(vtmp2 == xnoreg, ""); 2167 unorderedReduce2F(opcode, dst, src); 2168 break; 2169 case 4: 2170 assert(vtmp2 == xnoreg, ""); 2171 unorderedReduce4F(opcode, dst, src, vtmp1); 2172 break; 2173 case 8: 2174 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2175 break; 2176 case 16: 2177 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2178 break; 2179 default: assert(false, "wrong vector length"); 2180 } 2181 } 2182 2183 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2184 switch (vlen) { 2185 case 2: 2186 assert(vtmp1 == xnoreg, ""); 2187 assert(vtmp2 == xnoreg, ""); 2188 unorderedReduce2D(opcode, dst, src); 2189 break; 2190 case 4: 2191 assert(vtmp2 == xnoreg, ""); 2192 unorderedReduce4D(opcode, dst, src, vtmp1); 2193 break; 2194 case 8: 2195 unorderedReduce8D(opcode, dst, 
src, vtmp1, vtmp2); 2196 break; 2197 default: assert(false, "wrong vector length"); 2198 } 2199 } 2200 2201 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2202 if (opcode == Op_AddReductionVI) { 2203 if (vtmp1 != src2) { 2204 movdqu(vtmp1, src2); 2205 } 2206 phaddd(vtmp1, vtmp1); 2207 } else { 2208 pshufd(vtmp1, src2, 0x1); 2209 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2210 } 2211 movdl(vtmp2, src1); 2212 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2213 movdl(dst, vtmp1); 2214 } 2215 2216 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2217 if (opcode == Op_AddReductionVI) { 2218 if (vtmp1 != src2) { 2219 movdqu(vtmp1, src2); 2220 } 2221 phaddd(vtmp1, src2); 2222 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2223 } else { 2224 pshufd(vtmp2, src2, 0xE); 2225 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2226 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2227 } 2228 } 2229 2230 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2231 if (opcode == Op_AddReductionVI) { 2232 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2233 vextracti128_high(vtmp2, vtmp1); 2234 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2235 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2236 } else { 2237 vextracti128_high(vtmp1, src2); 2238 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2239 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2240 } 2241 } 2242 2243 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 vextracti64x4_high(vtmp2, src2); 2245 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2246 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2247 } 2248 2249 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2250 pshufd(vtmp2, src2, 0x1); 2251 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2252 movdqu(vtmp1, vtmp2); 2253 psrldq(vtmp1, 2); 2254 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2255 movdqu(vtmp2, vtmp1); 2256 psrldq(vtmp2, 1); 2257 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2258 movdl(vtmp2, src1); 2259 pmovsxbd(vtmp1, vtmp1); 2260 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2261 pextrb(dst, vtmp1, 0x0); 2262 movsbl(dst, dst); 2263 } 2264 2265 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2266 pshufd(vtmp1, src2, 0xE); 2267 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2268 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2269 } 2270 2271 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2272 vextracti128_high(vtmp2, src2); 2273 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2274 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2275 } 2276 2277 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2278 vextracti64x4_high(vtmp1, src2); 2279 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2280 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2281 } 2282 2283 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister 
src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2284 pmovsxbw(vtmp2, src2); 2285 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2286 } 2287 2288 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2289 if (UseAVX > 1) { 2290 int vector_len = Assembler::AVX_256bit; 2291 vpmovsxbw(vtmp1, src2, vector_len); 2292 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2293 } else { 2294 pmovsxbw(vtmp2, src2); 2295 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2296 pshufd(vtmp2, src2, 0x1); 2297 pmovsxbw(vtmp2, src2); 2298 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2299 } 2300 } 2301 2302 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2303 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2304 int vector_len = Assembler::AVX_512bit; 2305 vpmovsxbw(vtmp1, src2, vector_len); 2306 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2307 } else { 2308 assert(UseAVX >= 2,"Should not reach here."); 2309 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2310 vextracti128_high(vtmp2, src2); 2311 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2312 } 2313 } 2314 2315 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2316 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2317 vextracti64x4_high(vtmp2, src2); 2318 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2319 } 2320 2321 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2322 if (opcode == Op_AddReductionVI) { 2323 if (vtmp1 != src2) { 2324 movdqu(vtmp1, src2); 2325 } 2326 phaddw(vtmp1, vtmp1); 2327 phaddw(vtmp1, vtmp1); 2328 } else { 2329 pshufd(vtmp2, src2, 0x1); 2330 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2331 movdqu(vtmp1, vtmp2); 2332 psrldq(vtmp1, 2); 2333 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2334 } 2335 movdl(vtmp2, src1); 2336 pmovsxwd(vtmp1, vtmp1); 2337 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2338 pextrw(dst, vtmp1, 0x0); 2339 movswl(dst, dst); 2340 } 2341 2342 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2343 if (opcode == Op_AddReductionVI) { 2344 if (vtmp1 != src2) { 2345 movdqu(vtmp1, src2); 2346 } 2347 phaddw(vtmp1, src2); 2348 } else { 2349 pshufd(vtmp1, src2, 0xE); 2350 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2351 } 2352 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2353 } 2354 2355 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2356 if (opcode == Op_AddReductionVI) { 2357 int vector_len = Assembler::AVX_256bit; 2358 vphaddw(vtmp2, src2, src2, vector_len); 2359 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2360 } else { 2361 vextracti128_high(vtmp2, src2); 2362 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2363 } 2364 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2365 } 2366 2367 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2368 int vector_len = Assembler::AVX_256bit; 2369 vextracti64x4_high(vtmp1, src2); 2370 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2371 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2372 } 2373 2374 void 
C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2375 pshufd(vtmp2, src2, 0xE); 2376 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2377 movdq(vtmp1, src1); 2378 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2379 movdq(dst, vtmp1); 2380 } 2381 2382 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2383 vextracti128_high(vtmp1, src2); 2384 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2385 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2386 } 2387 2388 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2389 vextracti64x4_high(vtmp2, src2); 2390 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2391 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2392 } 2393 2394 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2395 mov64(temp, -1L); 2396 bzhiq(temp, temp, len); 2397 kmovql(dst, temp); 2398 } 2399 2400 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2401 reduce_operation_128(T_FLOAT, opcode, dst, src); 2402 pshufd(vtmp, src, 0x1); 2403 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2404 } 2405 2406 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2407 reduce2F(opcode, dst, src, vtmp); 2408 pshufd(vtmp, src, 0x2); 2409 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2410 pshufd(vtmp, src, 0x3); 2411 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2412 } 2413 2414 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2415 reduce4F(opcode, dst, src, vtmp2); 2416 vextractf128_high(vtmp2, src); 2417 reduce4F(opcode, dst, vtmp2, vtmp1); 2418 } 2419 2420 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2421 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2422 vextracti64x4_high(vtmp1, src); 2423 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2424 } 2425 2426 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2427 pshufd(dst, src, 0x1); 2428 reduce_operation_128(T_FLOAT, opcode, dst, src); 2429 } 2430 2431 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2432 pshufd(vtmp, src, 0xE); 2433 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2434 unorderedReduce2F(opcode, dst, vtmp); 2435 } 2436 2437 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2438 vextractf128_high(vtmp1, src); 2439 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2440 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2441 } 2442 2443 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2444 vextractf64x4_high(vtmp2, src); 2445 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2446 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2447 } 2448 2449 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2450 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2451 pshufd(vtmp, src, 0xE); 2452 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2453 } 2454 2455 void 
C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2456 reduce2D(opcode, dst, src, vtmp2); 2457 vextractf128_high(vtmp2, src); 2458 reduce2D(opcode, dst, vtmp2, vtmp1); 2459 } 2460 2461 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2462 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2463 vextracti64x4_high(vtmp1, src); 2464 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2465 } 2466 2467 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2468 pshufd(dst, src, 0xE); 2469 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2470 } 2471 2472 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2473 vextractf128_high(vtmp, src); 2474 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2475 unorderedReduce2D(opcode, dst, vtmp); 2476 } 2477 2478 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2479 vextractf64x4_high(vtmp2, src); 2480 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2481 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2482 } 2483 2484 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2485 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2486 } 2487 2488 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2489 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2490 } 2491 2492 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2493 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2494 } 2495 2496 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2497 int vec_enc) { 2498 switch(elem_bt) { 2499 case T_INT: 2500 case T_FLOAT: 2501 vmaskmovps(dst, src, mask, vec_enc); 2502 break; 2503 case T_LONG: 2504 case T_DOUBLE: 2505 vmaskmovpd(dst, src, mask, vec_enc); 2506 break; 2507 default: 2508 fatal("Unsupported type %s", type2name(elem_bt)); 2509 break; 2510 } 2511 } 2512 2513 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2514 int vec_enc) { 2515 switch(elem_bt) { 2516 case T_INT: 2517 case T_FLOAT: 2518 vmaskmovps(dst, src, mask, vec_enc); 2519 break; 2520 case T_LONG: 2521 case T_DOUBLE: 2522 vmaskmovpd(dst, src, mask, vec_enc); 2523 break; 2524 default: 2525 fatal("Unsupported type %s", type2name(elem_bt)); 2526 break; 2527 } 2528 } 2529 2530 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2531 XMMRegister dst, XMMRegister src, 2532 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2533 XMMRegister xmm_0, XMMRegister xmm_1) { 2534 const int permconst[] = {1, 14}; 2535 XMMRegister wsrc = src; 2536 XMMRegister wdst = xmm_0; 2537 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2538 2539 int vlen_enc = Assembler::AVX_128bit; 2540 if (vlen == 16) { 2541 vlen_enc = Assembler::AVX_256bit; 2542 } 2543 2544 for (int i = log2(vlen) - 1; i >=0; i--) { 2545 if (i == 0 && !is_dst_valid) { 2546 wdst = dst; 2547 } 2548 if (i == 3) { 2549 vextracti64x4_high(wtmp, wsrc); 2550 } else if (i == 2) { 2551 vextracti128_high(wtmp, wsrc); 2552 } else { // i = [0,1] 2553 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2554 } 2555 2556 if (VM_Version::supports_avx10_2()) { 2557 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2558 } else { 2559 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2560 } 2561 wsrc = wdst; 2562 vlen_enc = Assembler::AVX_128bit; 2563 } 2564 if (is_dst_valid) { 2565 if (VM_Version::supports_avx10_2()) { 2566 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2567 } else { 2568 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2569 } 2570 } 2571 } 2572 2573 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2574 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2575 XMMRegister xmm_0, XMMRegister xmm_1) { 2576 XMMRegister wsrc = src; 2577 XMMRegister wdst = xmm_0; 2578 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2579 int vlen_enc = Assembler::AVX_128bit; 2580 if (vlen == 8) { 2581 vlen_enc = Assembler::AVX_256bit; 2582 } 2583 for (int i = log2(vlen) - 1; i >=0; i--) { 2584 if (i == 0 && !is_dst_valid) { 2585 wdst = dst; 2586 } 2587 if (i == 1) { 2588 vextracti128_high(wtmp, wsrc); 2589 } else if (i == 2) { 2590 vextracti64x4_high(wtmp, wsrc); 2591 } else { 2592 assert(i == 0, "%d", i); 2593 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2594 } 2595 2596 if (VM_Version::supports_avx10_2()) { 2597 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2598 } else { 2599 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2600 } 2601 2602 wsrc = wdst; 2603 vlen_enc = Assembler::AVX_128bit; 2604 } 2605 2606 if (is_dst_valid) { 2607 if (VM_Version::supports_avx10_2()) { 2608 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2609 } else { 2610 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2611 } 2612 } 2613 } 2614 2615 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2616 switch (bt) { 2617 case T_BYTE: pextrb(dst, src, idx); break; 2618 case T_SHORT: pextrw(dst, src, idx); break; 2619 case T_INT: pextrd(dst, src, idx); break; 2620 case T_LONG: pextrq(dst, src, idx); break; 2621 2622 default: 2623 assert(false,"Should not reach here."); 2624 break; 2625 } 2626 } 2627 2628 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2629 int esize = type2aelembytes(typ); 2630 int elem_per_lane = 16/esize; 2631 int lane = elemindex / elem_per_lane; 2632 int eindex = elemindex % elem_per_lane; 2633 2634 if (lane >= 2) { 2635 assert(UseAVX > 2, "required"); 2636 vextractf32x4(dst, src, lane & 3); 2637 return dst; 2638 } else if (lane > 0) { 2639 assert(UseAVX > 0, "required"); 2640 vextractf128(dst, src, lane); 2641 return dst; 2642 } else { 2643 return src; 2644 } 2645 } 2646 2647 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2648 if (typ == T_BYTE) { 2649 movsbl(dst, dst); 2650 } else if (typ == T_SHORT) { 2651 movswl(dst, dst); 2652 } 2653 } 2654 2655 void C2_MacroAssembler::get_elem(BasicType typ, Register 
dst, XMMRegister src, int elemindex) { 2656 int esize = type2aelembytes(typ); 2657 int elem_per_lane = 16/esize; 2658 int eindex = elemindex % elem_per_lane; 2659 assert(is_integral_type(typ),"required"); 2660 2661 if (eindex == 0) { 2662 if (typ == T_LONG) { 2663 movq(dst, src); 2664 } else { 2665 movdl(dst, src); 2666 movsxl(typ, dst); 2667 } 2668 } else { 2669 extract(typ, dst, src, eindex); 2670 movsxl(typ, dst); 2671 } 2672 } 2673 2674 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2675 int esize = type2aelembytes(typ); 2676 int elem_per_lane = 16/esize; 2677 int eindex = elemindex % elem_per_lane; 2678 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2679 2680 if (eindex == 0) { 2681 movq(dst, src); 2682 } else { 2683 if (typ == T_FLOAT) { 2684 if (UseAVX == 0) { 2685 movdqu(dst, src); 2686 shufps(dst, dst, eindex); 2687 } else { 2688 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2689 } 2690 } else { 2691 if (UseAVX == 0) { 2692 movdqu(dst, src); 2693 psrldq(dst, eindex*esize); 2694 } else { 2695 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2696 } 2697 movq(dst, dst); 2698 } 2699 } 2700 // Zero upper bits 2701 if (typ == T_FLOAT) { 2702 if (UseAVX == 0) { 2703 assert(vtmp != xnoreg, "required."); 2704 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2705 pand(dst, vtmp); 2706 } else { 2707 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2708 } 2709 } 2710 } 2711 2712 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2713 switch(typ) { 2714 case T_BYTE: 2715 case T_BOOLEAN: 2716 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2717 break; 2718 case T_SHORT: 2719 case T_CHAR: 2720 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2721 break; 2722 case T_INT: 2723 case T_FLOAT: 2724 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2725 break; 2726 case T_LONG: 2727 case T_DOUBLE: 2728 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2729 break; 2730 default: 2731 assert(false,"Should not reach here."); 2732 break; 2733 } 2734 } 2735 2736 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2737 assert(rscratch != noreg || always_reachable(src2), "missing"); 2738 2739 switch(typ) { 2740 case T_BOOLEAN: 2741 case T_BYTE: 2742 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2743 break; 2744 case T_CHAR: 2745 case T_SHORT: 2746 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2747 break; 2748 case T_INT: 2749 case T_FLOAT: 2750 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2751 break; 2752 case T_LONG: 2753 case T_DOUBLE: 2754 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2755 break; 2756 default: 2757 assert(false,"Should not reach here."); 2758 break; 2759 } 2760 } 2761 2762 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2763 switch(typ) { 2764 case T_BYTE: 2765 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 
2766 break; 2767 case T_SHORT: 2768 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2769 break; 2770 case T_INT: 2771 case T_FLOAT: 2772 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2773 break; 2774 case T_LONG: 2775 case T_DOUBLE: 2776 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2777 break; 2778 default: 2779 assert(false,"Should not reach here."); 2780 break; 2781 } 2782 } 2783 2784 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2785 assert(vlen_in_bytes <= 32, ""); 2786 int esize = type2aelembytes(bt); 2787 if (vlen_in_bytes == 32) { 2788 assert(vtmp == xnoreg, "required."); 2789 if (esize >= 4) { 2790 vtestps(src1, src2, AVX_256bit); 2791 } else { 2792 vptest(src1, src2, AVX_256bit); 2793 } 2794 return; 2795 } 2796 if (vlen_in_bytes < 16) { 2797 // Duplicate the lower part to fill the whole register, 2798 // Don't need to do so for src2 2799 assert(vtmp != xnoreg, "required"); 2800 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2801 pshufd(vtmp, src1, shuffle_imm); 2802 } else { 2803 assert(vtmp == xnoreg, "required"); 2804 vtmp = src1; 2805 } 2806 if (esize >= 4 && VM_Version::supports_avx()) { 2807 vtestps(vtmp, src2, AVX_128bit); 2808 } else { 2809 ptest(vtmp, src2); 2810 } 2811 } 2812 2813 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2814 #ifdef ASSERT 2815 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2816 bool is_bw_supported = VM_Version::supports_avx512bw(); 2817 if (is_bw && !is_bw_supported) { 2818 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2819 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2820 "XMM register should be 0-15"); 2821 } 2822 #endif // ASSERT 2823 switch (elem_bt) { 2824 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2825 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2826 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2827 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2828 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2829 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2830 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2831 } 2832 } 2833 2834 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2835 assert(UseAVX >= 2, "required"); 2836 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2837 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2838 if ((UseAVX > 2) && 2839 (!is_bw || VM_Version::supports_avx512bw()) && 2840 (!is_vl || VM_Version::supports_avx512vl())) { 2841 switch (elem_bt) { 2842 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2843 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2844 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2845 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2846 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2847 } 2848 } else { 2849 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2850 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2851 switch (elem_bt) { 2852 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2853 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2854 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2855 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, 
vlen_enc); return; 2856 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2857 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2858 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2859 } 2860 } 2861 } 2862 2863 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2864 switch (to_elem_bt) { 2865 case T_SHORT: 2866 vpmovsxbw(dst, src, vlen_enc); 2867 break; 2868 case T_INT: 2869 vpmovsxbd(dst, src, vlen_enc); 2870 break; 2871 case T_FLOAT: 2872 vpmovsxbd(dst, src, vlen_enc); 2873 vcvtdq2ps(dst, dst, vlen_enc); 2874 break; 2875 case T_LONG: 2876 vpmovsxbq(dst, src, vlen_enc); 2877 break; 2878 case T_DOUBLE: { 2879 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2880 vpmovsxbd(dst, src, mid_vlen_enc); 2881 vcvtdq2pd(dst, dst, vlen_enc); 2882 break; 2883 } 2884 default: 2885 fatal("Unsupported type %s", type2name(to_elem_bt)); 2886 break; 2887 } 2888 } 2889 2890 //------------------------------------------------------------------------------------------- 2891 2892 // IndexOf for constant substrings with size >= 8 chars 2893 // which don't need to be loaded through stack. 2894 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2895 Register cnt1, Register cnt2, 2896 int int_cnt2, Register result, 2897 XMMRegister vec, Register tmp, 2898 int ae) { 2899 ShortBranchVerifier sbv(this); 2900 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2901 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2902 2903 // This method uses the pcmpestri instruction with bound registers 2904 // inputs: 2905 // xmm - substring 2906 // rax - substring length (elements count) 2907 // mem - scanned string 2908 // rdx - string length (elements count) 2909 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2910 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2911 // outputs: 2912 // rcx - matched index in string 2913 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2914 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2915 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2916 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2917 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2918 2919 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2920 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2921 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2922 2923 // Note, inline_string_indexOf() generates checks: 2924 // if (substr.count > string.count) return -1; 2925 // if (substr.count == 0) return 0; 2926 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2927 2928 // Load substring. 
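  // The first 'stride' elements of the substring stay in 'vec' for the whole
  // scan; for UL the Latin-1 substring bytes are zero-extended to chars so they
  // can be compared against the UTF-16 string. Roughly, as an illustrative
  // sketch only (not emitted code), the search below does:
  //
  //   vec = substr[0 .. stride-1];                      // reloaded at RELOAD_SUBSTR
  //   for (p = str1; enough elements left; p += 16 bytes) {
  //     k = pcmpestri(vec, p);                          // SCAN_TO_SUBSTR
  //     if (candidate found at p + k) {
  //       if (int_cnt2 == stride) return p + k - str1;  // RET_FOUND
  //       verify the rest of the substring at p + k;    // MATCH_SUBSTR_HEAD / SCAN_SUBSTR
  //     }
  //   }
  //   return -1;                                        // RET_NOT_FOUND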
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);  // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
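    // From here cnt2 is used as a negative offset that walks the not-yet-verified
    // tail of the substring in stride-sized chunks. As an illustrative sketch
    // only (not emitted code):
    //
    //   cnt2 = -(int_cnt2 - stride);          // elements still to verify
    //   do {
    //     compare 'stride' substring elements at substr[int_cnt2 + cnt2]
    //       against the string at the same relative offset;   // SCAN_SUBSTR
    //     if (mismatch) goto RELOAD_SUBSTR;   // restart outer scan one element later
    //     cnt2 += stride;
    //   } while (cnt2 < 0);
    //   // falls through to RET_FOUND once the whole tail has matched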
3019 negptr(cnt2); 3020 addptr(cnt2, stride); 3021 3022 bind(SCAN_SUBSTR); 3023 subl(cnt1, stride); 3024 cmpl(cnt2, -stride); // Do not read beyond substring 3025 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 3026 // Back-up strings to avoid reading beyond substring: 3027 // cnt1 = cnt1 - cnt2 + 8 3028 addl(cnt1, cnt2); // cnt2 is negative 3029 addl(cnt1, stride); 3030 movl(cnt2, stride); negptr(cnt2); 3031 bind(CONT_SCAN_SUBSTR); 3032 if (int_cnt2 < (int)G) { 3033 int tail_off1 = int_cnt2<<scale1; 3034 int tail_off2 = int_cnt2<<scale2; 3035 if (ae == StrIntrinsicNode::UL) { 3036 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 3037 } else { 3038 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 3039 } 3040 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 3041 } else { 3042 // calculate index in register to avoid integer overflow (int_cnt2*2) 3043 movl(tmp, int_cnt2); 3044 addptr(tmp, cnt2); 3045 if (ae == StrIntrinsicNode::UL) { 3046 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3047 } else { 3048 movdqu(vec, Address(str2, tmp, scale2, 0)); 3049 } 3050 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3051 } 3052 // Need to reload strings pointers if not matched whole vector 3053 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3054 addptr(cnt2, stride); 3055 jcc(Assembler::negative, SCAN_SUBSTR); 3056 // Fall through if found full substring 3057 3058 } // (int_cnt2 > 8) 3059 3060 bind(RET_FOUND); 3061 // Found result if we matched full small substring. 3062 // Compute substr offset 3063 subptr(result, str1); 3064 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3065 shrl(result, 1); // index 3066 } 3067 bind(EXIT); 3068 3069 } // string_indexofC8 3070 3071 // Small strings are loaded through stack if they cross page boundary. 3072 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3073 Register cnt1, Register cnt2, 3074 int int_cnt2, Register result, 3075 XMMRegister vec, Register tmp, 3076 int ae) { 3077 ShortBranchVerifier sbv(this); 3078 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3079 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3080 3081 // 3082 // int_cnt2 is length of small (< 8 chars) constant substring 3083 // or (-1) for non constant substring in which case its length 3084 // is in cnt2 register. 3085 // 3086 // Note, inline_string_indexOf() generates checks: 3087 // if (substr.count > string.count) return -1; 3088 // if (substr.count == 0) return 0; 3089 // 3090 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3091 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3092 // This method uses the pcmpestri instruction with bound registers 3093 // inputs: 3094 // xmm - substring 3095 // rax - substring length (elements count) 3096 // mem - scanned string 3097 // rdx - string length (elements count) 3098 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3099 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3100 // outputs: 3101 // rcx - matched index in string 3102 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3103 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3104 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3105 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3106 3107 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3108 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3109 FOUND_CANDIDATE; 3110 3111 { //======================================================== 3112 // We don't know where these strings are located 3113 // and we can't read beyond them. Load them through stack. 3114 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3115 3116 movptr(tmp, rsp); // save old SP 3117 3118 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3119 if (int_cnt2 == (1>>scale2)) { // One byte 3120 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3121 load_unsigned_byte(result, Address(str2, 0)); 3122 movdl(vec, result); // move 32 bits 3123 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3124 // Not enough header space in 32-bit VM: 12+3 = 15. 3125 movl(result, Address(str2, -1)); 3126 shrl(result, 8); 3127 movdl(vec, result); // move 32 bits 3128 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3129 load_unsigned_short(result, Address(str2, 0)); 3130 movdl(vec, result); // move 32 bits 3131 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3132 movdl(vec, Address(str2, 0)); // move 32 bits 3133 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3134 movq(vec, Address(str2, 0)); // move 64 bits 3135 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3136 // Array header size is 12 bytes in 32-bit VM 3137 // + 6 bytes for 3 chars == 18 bytes, 3138 // enough space to load vec and shift. 3139 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3140 if (ae == StrIntrinsicNode::UL) { 3141 int tail_off = int_cnt2-8; 3142 pmovzxbw(vec, Address(str2, tail_off)); 3143 psrldq(vec, -2*tail_off); 3144 } 3145 else { 3146 int tail_off = int_cnt2*(1<<scale2); 3147 movdqu(vec, Address(str2, tail_off-16)); 3148 psrldq(vec, 16-tail_off); 3149 } 3150 } 3151 } else { // not constant substring 3152 cmpl(cnt2, stride); 3153 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3154 3155 // We can read beyond string if srt+16 does not cross page boundary 3156 // since heaps are aligned and mapped by pages. 3157 assert(os::vm_page_size() < (int)G, "default page should be small"); 3158 movl(result, str2); // We need only low 32 bits 3159 andl(result, ((int)os::vm_page_size()-1)); 3160 cmpl(result, ((int)os::vm_page_size()-16)); 3161 jccb(Assembler::belowEqual, CHECK_STR); 3162 3163 // Move small strings to stack to allow load 16 bytes into vec. 3164 subptr(rsp, 16); 3165 int stk_offset = wordSize-(1<<scale2); 3166 push(cnt2); 3167 3168 bind(COPY_SUBSTR); 3169 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3170 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3171 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3172 } else if (ae == StrIntrinsicNode::UU) { 3173 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3174 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3175 } 3176 decrement(cnt2); 3177 jccb(Assembler::notZero, COPY_SUBSTR); 3178 3179 pop(cnt2); 3180 movptr(str2, rsp); // New substring address 3181 } // non constant 3182 3183 bind(CHECK_STR); 3184 cmpl(cnt1, stride); 3185 jccb(Assembler::aboveEqual, BIG_STRINGS); 3186 3187 // Check cross page boundary. 
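// A 16-byte load starting at str1 stays within its page as long as
// (str1 & (page_size - 1)) <= page_size - 16; e.g. with 4K pages an offset of
// 0xff0 is still safe while 0xff1 would spill into the next page, in which case
// the short string is copied to the stack first.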
3188 movl(result, str1); // We need only low 32 bits 3189 andl(result, ((int)os::vm_page_size()-1)); 3190 cmpl(result, ((int)os::vm_page_size()-16)); 3191 jccb(Assembler::belowEqual, BIG_STRINGS); 3192 3193 subptr(rsp, 16); 3194 int stk_offset = -(1<<scale1); 3195 if (int_cnt2 < 0) { // not constant 3196 push(cnt2); 3197 stk_offset += wordSize; 3198 } 3199 movl(cnt2, cnt1); 3200 3201 bind(COPY_STR); 3202 if (ae == StrIntrinsicNode::LL) { 3203 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3204 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3205 } else { 3206 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3207 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3208 } 3209 decrement(cnt2); 3210 jccb(Assembler::notZero, COPY_STR); 3211 3212 if (int_cnt2 < 0) { // not constant 3213 pop(cnt2); 3214 } 3215 movptr(str1, rsp); // New string address 3216 3217 bind(BIG_STRINGS); 3218 // Load substring. 3219 if (int_cnt2 < 0) { // -1 3220 if (ae == StrIntrinsicNode::UL) { 3221 pmovzxbw(vec, Address(str2, 0)); 3222 } else { 3223 movdqu(vec, Address(str2, 0)); 3224 } 3225 push(cnt2); // substr count 3226 push(str2); // substr addr 3227 push(str1); // string addr 3228 } else { 3229 // Small (< 8 chars) constant substrings are loaded already. 3230 movl(cnt2, int_cnt2); 3231 } 3232 push(tmp); // original SP 3233 3234 } // Finished loading 3235 3236 //======================================================== 3237 // Start search 3238 // 3239 3240 movptr(result, str1); // string addr 3241 3242 if (int_cnt2 < 0) { // Only for non constant substring 3243 jmpb(SCAN_TO_SUBSTR); 3244 3245 // SP saved at sp+0 3246 // String saved at sp+1*wordSize 3247 // Substr saved at sp+2*wordSize 3248 // Substr count saved at sp+3*wordSize 3249 3250 // Reload substr for rescan, this code 3251 // is executed only for large substrings (> 8 chars) 3252 bind(RELOAD_SUBSTR); 3253 movptr(str2, Address(rsp, 2*wordSize)); 3254 movl(cnt2, Address(rsp, 3*wordSize)); 3255 if (ae == StrIntrinsicNode::UL) { 3256 pmovzxbw(vec, Address(str2, 0)); 3257 } else { 3258 movdqu(vec, Address(str2, 0)); 3259 } 3260 // We came here after the beginning of the substring was 3261 // matched but the rest of it was not so we need to search 3262 // again. Start from the next element after the previous match. 3263 subptr(str1, result); // Restore counter 3264 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3265 shrl(str1, 1); 3266 } 3267 addl(cnt1, str1); 3268 decrementl(cnt1); // Shift to next element 3269 cmpl(cnt1, cnt2); 3270 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3271 3272 addptr(result, (1<<scale1)); 3273 } // non constant 3274 3275 // Scan string for start of substr in 16-byte vectors 3276 bind(SCAN_TO_SUBSTR); 3277 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3278 pcmpestri(vec, Address(result, 0), mode); 3279 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3280 subl(cnt1, stride); 3281 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3282 cmpl(cnt1, cnt2); 3283 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3284 addptr(result, 16); 3285 3286 bind(ADJUST_STR); 3287 cmpl(cnt1, stride); // Do not read beyond string 3288 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3289 // Back-up string to avoid reading beyond string. 
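// Fewer than 'stride' elements remain, so move 'result' back far enough that the
// next 16-byte window ends exactly at the last element of the string, and rescan
// it as one full-stride chunk.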
3290 lea(result, Address(result, cnt1, scale1, -16)); 3291 movl(cnt1, stride); 3292 jmpb(SCAN_TO_SUBSTR); 3293 3294 // Found a potential substr 3295 bind(FOUND_CANDIDATE); 3296 // After pcmpestri tmp(rcx) contains matched element index 3297 3298 // Make sure string is still long enough 3299 subl(cnt1, tmp); 3300 cmpl(cnt1, cnt2); 3301 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3302 // Left less then substring. 3303 3304 bind(RET_NOT_FOUND); 3305 movl(result, -1); 3306 jmp(CLEANUP); 3307 3308 bind(FOUND_SUBSTR); 3309 // Compute start addr of substr 3310 lea(result, Address(result, tmp, scale1)); 3311 if (int_cnt2 > 0) { // Constant substring 3312 // Repeat search for small substring (< 8 chars) 3313 // from new point without reloading substring. 3314 // Have to check that we don't read beyond string. 3315 cmpl(tmp, stride-int_cnt2); 3316 jccb(Assembler::greater, ADJUST_STR); 3317 // Fall through if matched whole substring. 3318 } else { // non constant 3319 assert(int_cnt2 == -1, "should be != 0"); 3320 3321 addl(tmp, cnt2); 3322 // Found result if we matched whole substring. 3323 cmpl(tmp, stride); 3324 jcc(Assembler::lessEqual, RET_FOUND); 3325 3326 // Repeat search for small substring (<= 8 chars) 3327 // from new point 'str1' without reloading substring. 3328 cmpl(cnt2, stride); 3329 // Have to check that we don't read beyond string. 3330 jccb(Assembler::lessEqual, ADJUST_STR); 3331 3332 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3333 // Compare the rest of substring (> 8 chars). 3334 movptr(str1, result); 3335 3336 cmpl(tmp, cnt2); 3337 // First 8 chars are already matched. 3338 jccb(Assembler::equal, CHECK_NEXT); 3339 3340 bind(SCAN_SUBSTR); 3341 pcmpestri(vec, Address(str1, 0), mode); 3342 // Need to reload strings pointers if not matched whole vector 3343 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3344 3345 bind(CHECK_NEXT); 3346 subl(cnt2, stride); 3347 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3348 addptr(str1, 16); 3349 if (ae == StrIntrinsicNode::UL) { 3350 addptr(str2, 8); 3351 } else { 3352 addptr(str2, 16); 3353 } 3354 subl(cnt1, stride); 3355 cmpl(cnt2, stride); // Do not read beyond substring 3356 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3357 // Back-up strings to avoid reading beyond substring. 
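// Both pointers are moved back so that the final vector load ends at the last
// element of the substring; the counters are adjusted to match:
// cnt1 = cnt1 - cnt2 + stride, cnt2 = stride.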
3358 3359 if (ae == StrIntrinsicNode::UL) { 3360 lea(str2, Address(str2, cnt2, scale2, -8)); 3361 lea(str1, Address(str1, cnt2, scale1, -16)); 3362 } else { 3363 lea(str2, Address(str2, cnt2, scale2, -16)); 3364 lea(str1, Address(str1, cnt2, scale1, -16)); 3365 } 3366 subl(cnt1, cnt2); 3367 movl(cnt2, stride); 3368 addl(cnt1, stride); 3369 bind(CONT_SCAN_SUBSTR); 3370 if (ae == StrIntrinsicNode::UL) { 3371 pmovzxbw(vec, Address(str2, 0)); 3372 } else { 3373 movdqu(vec, Address(str2, 0)); 3374 } 3375 jmp(SCAN_SUBSTR); 3376 3377 bind(RET_FOUND_LONG); 3378 movptr(str1, Address(rsp, wordSize)); 3379 } // non constant 3380 3381 bind(RET_FOUND); 3382 // Compute substr offset 3383 subptr(result, str1); 3384 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3385 shrl(result, 1); // index 3386 } 3387 bind(CLEANUP); 3388 pop(rsp); // restore SP 3389 3390 } // string_indexof 3391 3392 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3393 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3394 ShortBranchVerifier sbv(this); 3395 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3396 3397 int stride = 8; 3398 3399 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3400 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3401 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3402 FOUND_SEQ_CHAR, DONE_LABEL; 3403 3404 movptr(result, str1); 3405 if (UseAVX >= 2) { 3406 cmpl(cnt1, stride); 3407 jcc(Assembler::less, SCAN_TO_CHAR); 3408 cmpl(cnt1, 2*stride); 3409 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3410 movdl(vec1, ch); 3411 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3412 vpxor(vec2, vec2); 3413 movl(tmp, cnt1); 3414 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3415 andl(cnt1,0x0000000F); //tail count (in chars) 3416 3417 bind(SCAN_TO_16_CHAR_LOOP); 3418 vmovdqu(vec3, Address(result, 0)); 3419 vpcmpeqw(vec3, vec3, vec1, 1); 3420 vptest(vec2, vec3); 3421 jcc(Assembler::carryClear, FOUND_CHAR); 3422 addptr(result, 32); 3423 subl(tmp, 2*stride); 3424 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3425 jmp(SCAN_TO_8_CHAR); 3426 bind(SCAN_TO_8_CHAR_INIT); 3427 movdl(vec1, ch); 3428 pshuflw(vec1, vec1, 0x00); 3429 pshufd(vec1, vec1, 0); 3430 pxor(vec2, vec2); 3431 } 3432 bind(SCAN_TO_8_CHAR); 3433 cmpl(cnt1, stride); 3434 jcc(Assembler::less, SCAN_TO_CHAR); 3435 if (UseAVX < 2) { 3436 movdl(vec1, ch); 3437 pshuflw(vec1, vec1, 0x00); 3438 pshufd(vec1, vec1, 0); 3439 pxor(vec2, vec2); 3440 } 3441 movl(tmp, cnt1); 3442 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3443 andl(cnt1,0x00000007); //tail count (in chars) 3444 3445 bind(SCAN_TO_8_CHAR_LOOP); 3446 movdqu(vec3, Address(result, 0)); 3447 pcmpeqw(vec3, vec1); 3448 ptest(vec2, vec3); 3449 jcc(Assembler::carryClear, FOUND_CHAR); 3450 addptr(result, 16); 3451 subl(tmp, stride); 3452 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3453 bind(SCAN_TO_CHAR); 3454 testl(cnt1, cnt1); 3455 jcc(Assembler::zero, RET_NOT_FOUND); 3456 bind(SCAN_TO_CHAR_LOOP); 3457 load_unsigned_short(tmp, Address(result, 0)); 3458 cmpl(ch, tmp); 3459 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3460 addptr(result, 2); 3461 subl(cnt1, 1); 3462 jccb(Assembler::zero, RET_NOT_FOUND); 3463 jmp(SCAN_TO_CHAR_LOOP); 3464 3465 bind(RET_NOT_FOUND); 3466 movl(result, -1); 3467 jmpb(DONE_LABEL); 3468 3469 bind(FOUND_CHAR); 3470 if (UseAVX >= 2) { 3471 vpmovmskb(tmp, vec3); 3472 } else { 3473 pmovmskb(tmp, vec3); 3474 } 3475 bsfl(ch, tmp); 3476 addptr(result, ch); 3477 3478 bind(FOUND_SEQ_CHAR); 3479 
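// 'result' points at the matching char; convert it to a zero-based char index
// by subtracting the string base and halving the byte distance.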
subptr(result, str1); 3480 shrl(result, 1); 3481 3482 bind(DONE_LABEL); 3483 } // string_indexof_char 3484 3485 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3486 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3487 ShortBranchVerifier sbv(this); 3488 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3489 3490 int stride = 16; 3491 3492 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3493 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3494 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3495 FOUND_SEQ_CHAR, DONE_LABEL; 3496 3497 movptr(result, str1); 3498 if (UseAVX >= 2) { 3499 cmpl(cnt1, stride); 3500 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3501 cmpl(cnt1, stride*2); 3502 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3503 movdl(vec1, ch); 3504 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3505 vpxor(vec2, vec2); 3506 movl(tmp, cnt1); 3507 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3508 andl(cnt1,0x0000001F); //tail count (in chars) 3509 3510 bind(SCAN_TO_32_CHAR_LOOP); 3511 vmovdqu(vec3, Address(result, 0)); 3512 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3513 vptest(vec2, vec3); 3514 jcc(Assembler::carryClear, FOUND_CHAR); 3515 addptr(result, 32); 3516 subl(tmp, stride*2); 3517 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3518 jmp(SCAN_TO_16_CHAR); 3519 3520 bind(SCAN_TO_16_CHAR_INIT); 3521 movdl(vec1, ch); 3522 pxor(vec2, vec2); 3523 pshufb(vec1, vec2); 3524 } 3525 3526 bind(SCAN_TO_16_CHAR); 3527 cmpl(cnt1, stride); 3528 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3529 if (UseAVX < 2) { 3530 movdl(vec1, ch); 3531 pxor(vec2, vec2); 3532 pshufb(vec1, vec2); 3533 } 3534 movl(tmp, cnt1); 3535 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3536 andl(cnt1,0x0000000F); //tail count (in bytes) 3537 3538 bind(SCAN_TO_16_CHAR_LOOP); 3539 movdqu(vec3, Address(result, 0)); 3540 pcmpeqb(vec3, vec1); 3541 ptest(vec2, vec3); 3542 jcc(Assembler::carryClear, FOUND_CHAR); 3543 addptr(result, 16); 3544 subl(tmp, stride); 3545 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
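// Scalar tail: at most 15 bytes remain here (cnt1 was either below 'stride' to
// begin with or masked down to its low four bits above); they are checked one
// byte at a time.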
3546 3547 bind(SCAN_TO_CHAR_INIT); 3548 testl(cnt1, cnt1); 3549 jcc(Assembler::zero, RET_NOT_FOUND); 3550 bind(SCAN_TO_CHAR_LOOP); 3551 load_unsigned_byte(tmp, Address(result, 0)); 3552 cmpl(ch, tmp); 3553 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3554 addptr(result, 1); 3555 subl(cnt1, 1); 3556 jccb(Assembler::zero, RET_NOT_FOUND); 3557 jmp(SCAN_TO_CHAR_LOOP); 3558 3559 bind(RET_NOT_FOUND); 3560 movl(result, -1); 3561 jmpb(DONE_LABEL); 3562 3563 bind(FOUND_CHAR); 3564 if (UseAVX >= 2) { 3565 vpmovmskb(tmp, vec3); 3566 } else { 3567 pmovmskb(tmp, vec3); 3568 } 3569 bsfl(ch, tmp); 3570 addptr(result, ch); 3571 3572 bind(FOUND_SEQ_CHAR); 3573 subptr(result, str1); 3574 3575 bind(DONE_LABEL); 3576 } // stringL_indexof_char 3577 3578 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3579 switch (eltype) { 3580 case T_BOOLEAN: return sizeof(jboolean); 3581 case T_BYTE: return sizeof(jbyte); 3582 case T_SHORT: return sizeof(jshort); 3583 case T_CHAR: return sizeof(jchar); 3584 case T_INT: return sizeof(jint); 3585 default: 3586 ShouldNotReachHere(); 3587 return -1; 3588 } 3589 } 3590 3591 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3592 switch (eltype) { 3593 // T_BOOLEAN used as surrogate for unsigned byte 3594 case T_BOOLEAN: movzbl(dst, src); break; 3595 case T_BYTE: movsbl(dst, src); break; 3596 case T_SHORT: movswl(dst, src); break; 3597 case T_CHAR: movzwl(dst, src); break; 3598 case T_INT: movl(dst, src); break; 3599 default: 3600 ShouldNotReachHere(); 3601 } 3602 } 3603 3604 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3605 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3606 } 3607 3608 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3609 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3610 } 3611 3612 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3613 const int vlen = Assembler::AVX_256bit; 3614 switch (eltype) { 3615 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3616 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3617 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3618 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3619 case T_INT: 3620 // do nothing 3621 break; 3622 default: 3623 ShouldNotReachHere(); 3624 } 3625 } 3626 3627 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3628 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3629 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3630 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3631 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3632 BasicType eltype) { 3633 ShortBranchVerifier sbv(this); 3634 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3635 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3636 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3637 3638 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3639 SHORT_UNROLLED_LOOP_EXIT, 3640 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3641 UNROLLED_VECTOR_LOOP_BEGIN, 3642 END; 3643 switch (eltype) { 3644 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3645 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3646 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3647 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3648 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3649 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3650 } 3651 3652 // For "renaming" for readibility of the code 3653 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3654 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3655 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3656 3657 const int elsize = arrays_hashcode_elsize(eltype); 3658 3659 /* 3660 if (cnt1 >= 2) { 3661 if (cnt1 >= 32) { 3662 UNROLLED VECTOR LOOP 3663 } 3664 UNROLLED SCALAR LOOP 3665 } 3666 SINGLE SCALAR 3667 */ 3668 3669 cmpl(cnt1, 32); 3670 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3671 3672 // cnt1 >= 32 && generate_vectorized_loop 3673 xorl(index, index); 3674 3675 // vresult = IntVector.zero(I256); 3676 for (int idx = 0; idx < 4; idx++) { 3677 vpxor(vresult[idx], vresult[idx]); 3678 } 3679 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3680 Register bound = tmp2; 3681 Register next = tmp3; 3682 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3683 movl(next, Address(tmp2, 0)); 3684 movdl(vnext, next); 3685 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3686 3687 // index = 0; 3688 // bound = cnt1 & ~(32 - 1); 3689 movl(bound, cnt1); 3690 andl(bound, ~(32 - 1)); 3691 // for (; index < bound; index += 32) { 3692 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3693 // result *= next; 3694 imull(result, next); 3695 // loop fission to upfront the cost of fetching from memory, OOO execution 3696 // can then hopefully do a better job of prefetching 3697 for (int idx = 0; idx < 4; idx++) { 3698 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3699 } 3700 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3701 for (int idx = 0; idx < 4; idx++) { 3702 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3703 arrays_hashcode_elvcast(vtmp[idx], eltype); 3704 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3705 } 3706 // index += 32; 3707 addl(index, 32); 3708 // index < bound; 3709 cmpl(index, bound); 3710 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3711 // } 3712 3713 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3714 subl(cnt1, bound); 3715 // release bound 3716 3717 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3718 for (int idx = 0; idx < 4; idx++) { 3719 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3720 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3721 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3722 } 3723 // result += vresult.reduceLanes(ADD); 3724 for (int idx = 0; idx < 4; idx++) { 3725 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3726 } 3727 3728 // } else if (cnt1 < 32) { 3729 3730 bind(SHORT_UNROLLED_BEGIN); 3731 // int i = 1; 3732 movl(index, 1); 3733 cmpl(index, cnt1); 3734 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3735 3736 // for (; i < cnt1 ; i += 2) { 3737 bind(SHORT_UNROLLED_LOOP_BEGIN); 3738 movl(tmp3, 961); 3739 imull(result, tmp3); 3740 
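// The scalar loop folds two elements per iteration:
//   result = result*31*31 + a[i-1]*31 + a[i]
// where 961 == 31*31 was just multiplied in and a[i-1]*31 is computed below as
// (a[i-1] << 5) - a[i-1]. Together with the odd-length fix-up that follows, this
// is equivalent to (Java-like sketch):
//   for (; i < cnt1; i += 2) result = result * 961 + a[i-1] * 31 + a[i];
//   if (i == cnt1)           result = result * 31 + a[i-1];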
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3741 movl(tmp3, tmp2); 3742 shll(tmp3, 5); 3743 subl(tmp3, tmp2); 3744 addl(result, tmp3); 3745 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3746 addl(result, tmp3); 3747 addl(index, 2); 3748 cmpl(index, cnt1); 3749 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3750 3751 // } 3752 // if (i >= cnt1) { 3753 bind(SHORT_UNROLLED_LOOP_EXIT); 3754 jccb(Assembler::greater, END); 3755 movl(tmp2, result); 3756 shll(result, 5); 3757 subl(result, tmp2); 3758 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3759 addl(result, tmp3); 3760 // } 3761 bind(END); 3762 3763 BLOCK_COMMENT("} // arrays_hashcode"); 3764 3765 } // arrays_hashcode 3766 3767 // helper function for string_compare 3768 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3769 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3770 Address::ScaleFactor scale2, Register index, int ae) { 3771 if (ae == StrIntrinsicNode::LL) { 3772 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3773 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3774 } else if (ae == StrIntrinsicNode::UU) { 3775 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3776 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3777 } else { 3778 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3779 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3780 } 3781 } 3782 3783 // Compare strings, used for char[] and byte[]. 3784 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3785 Register cnt1, Register cnt2, Register result, 3786 XMMRegister vec1, int ae, KRegister mask) { 3787 ShortBranchVerifier sbv(this); 3788 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3789 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3790 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3791 int stride2x2 = 0x40; 3792 Address::ScaleFactor scale = Address::no_scale; 3793 Address::ScaleFactor scale1 = Address::no_scale; 3794 Address::ScaleFactor scale2 = Address::no_scale; 3795 3796 if (ae != StrIntrinsicNode::LL) { 3797 stride2x2 = 0x20; 3798 } 3799 3800 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3801 shrl(cnt2, 1); 3802 } 3803 // Compute the minimum of the string lengths and the 3804 // difference of the string lengths (stack). 3805 // Do the conditional move stuff 3806 movl(result, cnt1); 3807 subl(cnt1, cnt2); 3808 push(cnt1); 3809 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3810 3811 // Is the minimum length zero? 
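// If so, the saved length difference is the answer. The difference (cnt1 - cnt2)
// pushed above is the function's result whenever the compared prefix is equal:
// LENGTH_DIFF_LABEL pops it into 'result', while POP_LABEL pops and discards it
// once an element mismatch has already produced the result.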
3812 testl(cnt2, cnt2); 3813 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3814 if (ae == StrIntrinsicNode::LL) { 3815 // Load first bytes 3816 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3817 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3818 } else if (ae == StrIntrinsicNode::UU) { 3819 // Load first characters 3820 load_unsigned_short(result, Address(str1, 0)); 3821 load_unsigned_short(cnt1, Address(str2, 0)); 3822 } else { 3823 load_unsigned_byte(result, Address(str1, 0)); 3824 load_unsigned_short(cnt1, Address(str2, 0)); 3825 } 3826 subl(result, cnt1); 3827 jcc(Assembler::notZero, POP_LABEL); 3828 3829 if (ae == StrIntrinsicNode::UU) { 3830 // Divide length by 2 to get number of chars 3831 shrl(cnt2, 1); 3832 } 3833 cmpl(cnt2, 1); 3834 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3835 3836 // Check if the strings start at the same location and setup scale and stride 3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3838 cmpptr(str1, str2); 3839 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3840 if (ae == StrIntrinsicNode::LL) { 3841 scale = Address::times_1; 3842 stride = 16; 3843 } else { 3844 scale = Address::times_2; 3845 stride = 8; 3846 } 3847 } else { 3848 scale1 = Address::times_1; 3849 scale2 = Address::times_2; 3850 // scale not used 3851 stride = 8; 3852 } 3853 3854 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3855 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3856 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3857 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3858 Label COMPARE_TAIL_LONG; 3859 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3860 3861 int pcmpmask = 0x19; 3862 if (ae == StrIntrinsicNode::LL) { 3863 pcmpmask &= ~0x01; 3864 } 3865 3866 // Setup to compare 16-chars (32-bytes) vectors, 3867 // start from first character again because it has aligned address. 3868 if (ae == StrIntrinsicNode::LL) { 3869 stride2 = 32; 3870 } else { 3871 stride2 = 16; 3872 } 3873 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3874 adr_stride = stride << scale; 3875 } else { 3876 adr_stride1 = 8; //stride << scale1; 3877 adr_stride2 = 16; //stride << scale2; 3878 } 3879 3880 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3881 // rax and rdx are used by pcmpestri as elements counters 3882 movl(result, cnt2); 3883 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3884 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3885 3886 // fast path : compare first 2 8-char vectors. 
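// pcmpmask (0x19, or 0x18 for LL) selects the string-compare aggregation with
// negated result over unsigned shorts or bytes, so pcmpestri sets CF at the
// first mismatching element and leaves that element's index in rcx.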
3887 bind(COMPARE_16_CHARS); 3888 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3889 movdqu(vec1, Address(str1, 0)); 3890 } else { 3891 pmovzxbw(vec1, Address(str1, 0)); 3892 } 3893 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3894 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3895 3896 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3897 movdqu(vec1, Address(str1, adr_stride)); 3898 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3899 } else { 3900 pmovzxbw(vec1, Address(str1, adr_stride1)); 3901 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3902 } 3903 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3904 addl(cnt1, stride); 3905 3906 // Compare the characters at index in cnt1 3907 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3908 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3909 subl(result, cnt2); 3910 jmp(POP_LABEL); 3911 3912 // Setup the registers to start vector comparison loop 3913 bind(COMPARE_WIDE_VECTORS); 3914 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3915 lea(str1, Address(str1, result, scale)); 3916 lea(str2, Address(str2, result, scale)); 3917 } else { 3918 lea(str1, Address(str1, result, scale1)); 3919 lea(str2, Address(str2, result, scale2)); 3920 } 3921 subl(result, stride2); 3922 subl(cnt2, stride2); 3923 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3924 negptr(result); 3925 3926 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3927 bind(COMPARE_WIDE_VECTORS_LOOP); 3928 3929 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3930 cmpl(cnt2, stride2x2); 3931 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3932 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3933 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3934 3935 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3936 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3937 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3938 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3939 } else { 3940 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3941 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3942 } 3943 kortestql(mask, mask); 3944 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3945 addptr(result, stride2x2); // update since we already compared at this addr 3946 subl(cnt2, stride2x2); // and sub the size too 3947 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3948 3949 vpxor(vec1, vec1); 3950 jmpb(COMPARE_WIDE_TAIL); 3951 }//if (VM_Version::supports_avx512vlbw()) 3952 3953 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3954 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3955 vmovdqu(vec1, Address(str1, result, scale)); 3956 vpxor(vec1, Address(str2, result, scale)); 3957 } else { 3958 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3959 vpxor(vec1, Address(str2, result, scale2)); 3960 } 3961 vptest(vec1, vec1); 3962 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3963 addptr(result, stride2); 3964 subl(cnt2, stride2); 3965 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3966 // clean upper bits of YMM registers 3967 vpxor(vec1, vec1); 3968 3969 // compare 
wide vectors tail 3970 bind(COMPARE_WIDE_TAIL); 3971 testptr(result, result); 3972 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3973 3974 movl(result, stride2); 3975 movl(cnt2, result); 3976 negptr(result); 3977 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3978 3979 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3980 bind(VECTOR_NOT_EQUAL); 3981 // clean upper bits of YMM registers 3982 vpxor(vec1, vec1); 3983 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3984 lea(str1, Address(str1, result, scale)); 3985 lea(str2, Address(str2, result, scale)); 3986 } else { 3987 lea(str1, Address(str1, result, scale1)); 3988 lea(str2, Address(str2, result, scale2)); 3989 } 3990 jmp(COMPARE_16_CHARS); 3991 3992 // Compare tail chars, length between 1 to 15 chars 3993 bind(COMPARE_TAIL_LONG); 3994 movl(cnt2, result); 3995 cmpl(cnt2, stride); 3996 jcc(Assembler::less, COMPARE_SMALL_STR); 3997 3998 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3999 movdqu(vec1, Address(str1, 0)); 4000 } else { 4001 pmovzxbw(vec1, Address(str1, 0)); 4002 } 4003 pcmpestri(vec1, Address(str2, 0), pcmpmask); 4004 jcc(Assembler::below, COMPARE_INDEX_CHAR); 4005 subptr(cnt2, stride); 4006 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4007 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4008 lea(str1, Address(str1, result, scale)); 4009 lea(str2, Address(str2, result, scale)); 4010 } else { 4011 lea(str1, Address(str1, result, scale1)); 4012 lea(str2, Address(str2, result, scale2)); 4013 } 4014 negptr(cnt2); 4015 jmpb(WHILE_HEAD_LABEL); 4016 4017 bind(COMPARE_SMALL_STR); 4018 } else if (UseSSE42Intrinsics) { 4019 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 4020 int pcmpmask = 0x19; 4021 // Setup to compare 8-char (16-byte) vectors, 4022 // start from first character again because it has aligned address. 
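// 'result' keeps the full minimum length while cnt2 is rounded down to a whole
// number of 8-element strides; if nothing is left for the vector loop,
// COMPARE_TAIL drops straight to the scalar comparison below.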
4023 movl(result, cnt2); 4024 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 4025 if (ae == StrIntrinsicNode::LL) { 4026 pcmpmask &= ~0x01; 4027 } 4028 jcc(Assembler::zero, COMPARE_TAIL); 4029 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4030 lea(str1, Address(str1, result, scale)); 4031 lea(str2, Address(str2, result, scale)); 4032 } else { 4033 lea(str1, Address(str1, result, scale1)); 4034 lea(str2, Address(str2, result, scale2)); 4035 } 4036 negptr(result); 4037 4038 // pcmpestri 4039 // inputs: 4040 // vec1- substring 4041 // rax - negative string length (elements count) 4042 // mem - scanned string 4043 // rdx - string length (elements count) 4044 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4045 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4046 // outputs: 4047 // rcx - first mismatched element index 4048 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4049 4050 bind(COMPARE_WIDE_VECTORS); 4051 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4052 movdqu(vec1, Address(str1, result, scale)); 4053 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4054 } else { 4055 pmovzxbw(vec1, Address(str1, result, scale1)); 4056 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4057 } 4058 // After pcmpestri cnt1(rcx) contains mismatched element index 4059 4060 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4061 addptr(result, stride); 4062 subptr(cnt2, stride); 4063 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4064 4065 // compare wide vectors tail 4066 testptr(result, result); 4067 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4068 4069 movl(cnt2, stride); 4070 movl(result, stride); 4071 negptr(result); 4072 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4073 movdqu(vec1, Address(str1, result, scale)); 4074 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4075 } else { 4076 pmovzxbw(vec1, Address(str1, result, scale1)); 4077 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4078 } 4079 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4080 4081 // Mismatched characters in the vectors 4082 bind(VECTOR_NOT_EQUAL); 4083 addptr(cnt1, result); 4084 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4085 subl(result, cnt2); 4086 jmpb(POP_LABEL); 4087 4088 bind(COMPARE_TAIL); // limit is zero 4089 movl(cnt2, result); 4090 // Fallthru to tail compare 4091 } 4092 // Shift str2 and str1 to the end of the arrays, negate min 4093 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4094 lea(str1, Address(str1, cnt2, scale)); 4095 lea(str2, Address(str2, cnt2, scale)); 4096 } else { 4097 lea(str1, Address(str1, cnt2, scale1)); 4098 lea(str2, Address(str2, cnt2, scale2)); 4099 } 4100 decrementl(cnt2); // first character was compared already 4101 negptr(cnt2); 4102 4103 // Compare the rest of the elements 4104 bind(WHILE_HEAD_LABEL); 4105 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4106 subl(result, cnt1); 4107 jccb(Assembler::notZero, POP_LABEL); 4108 increment(cnt2); 4109 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4110 4111 // Strings are equal up to min length. Return the length difference. 
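// For UU the difference pushed at the top of the method is still in bytes, so
// LENGTH_DIFF_LABEL shifts it arithmetically to a char count, preserving its
// sign.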
4112 bind(LENGTH_DIFF_LABEL); 4113 pop(result); 4114 if (ae == StrIntrinsicNode::UU) { 4115 // Divide diff by 2 to get number of chars 4116 sarl(result, 1); 4117 } 4118 jmpb(DONE_LABEL); 4119 4120 if (VM_Version::supports_avx512vlbw()) { 4121 4122 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4123 4124 kmovql(cnt1, mask); 4125 notq(cnt1); 4126 bsfq(cnt2, cnt1); 4127 if (ae != StrIntrinsicNode::LL) { 4128 // Divide diff by 2 to get number of chars 4129 sarl(cnt2, 1); 4130 } 4131 addq(result, cnt2); 4132 if (ae == StrIntrinsicNode::LL) { 4133 load_unsigned_byte(cnt1, Address(str2, result)); 4134 load_unsigned_byte(result, Address(str1, result)); 4135 } else if (ae == StrIntrinsicNode::UU) { 4136 load_unsigned_short(cnt1, Address(str2, result, scale)); 4137 load_unsigned_short(result, Address(str1, result, scale)); 4138 } else { 4139 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4140 load_unsigned_byte(result, Address(str1, result, scale1)); 4141 } 4142 subl(result, cnt1); 4143 jmpb(POP_LABEL); 4144 }//if (VM_Version::supports_avx512vlbw()) 4145 4146 // Discard the stored length difference 4147 bind(POP_LABEL); 4148 pop(cnt1); 4149 4150 // That's it 4151 bind(DONE_LABEL); 4152 if(ae == StrIntrinsicNode::UL) { 4153 negl(result); 4154 } 4155 4156 } 4157 4158 // Search for Non-ASCII character (Negative byte value) in a byte array, 4159 // return the index of the first such character, otherwise the length 4160 // of the array segment searched. 4161 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4162 // @IntrinsicCandidate 4163 // public static int countPositives(byte[] ba, int off, int len) { 4164 // for (int i = off; i < off + len; i++) { 4165 // if (ba[i] < 0) { 4166 // return i - off; 4167 // } 4168 // } 4169 // return len; 4170 // } 4171 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4172 Register result, Register tmp1, 4173 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4174 // rsi: byte array 4175 // rcx: len 4176 // rax: result 4177 ShortBranchVerifier sbv(this); 4178 assert_different_registers(ary1, len, result, tmp1); 4179 assert_different_registers(vec1, vec2); 4180 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4181 4182 movl(result, len); // copy 4183 // len == 0 4184 testl(len, len); 4185 jcc(Assembler::zero, DONE); 4186 4187 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4188 VM_Version::supports_avx512vlbw() && 4189 VM_Version::supports_bmi2()) { 4190 4191 Label test_64_loop, test_tail, BREAK_LOOP; 4192 movl(tmp1, len); 4193 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4194 4195 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4196 andl(len, 0xffffffc0); // vector count (in chars) 4197 jccb(Assembler::zero, test_tail); 4198 4199 lea(ary1, Address(ary1, len, Address::times_1)); 4200 negptr(len); 4201 4202 bind(test_64_loop); 4203 // Check whether our 64 elements of size byte contain negatives 4204 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4205 kortestql(mask1, mask1); 4206 jcc(Assembler::notZero, BREAK_LOOP); 4207 4208 addptr(len, 64); 4209 jccb(Assembler::notZero, test_64_loop); 4210 4211 bind(test_tail); 4212 // bail out when there is nothing to be done 4213 testl(tmp1, -1); 4214 jcc(Assembler::zero, DONE); 4215 4216 4217 // check the tail for absense of negatives 4218 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4219 { 4220 Register tmp3_aliased = len; 4221 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4222 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4223 notq(tmp3_aliased); 4224 kmovql(mask2, tmp3_aliased); 4225 } 4226 4227 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4228 ktestq(mask1, mask2); 4229 jcc(Assembler::zero, DONE); 4230 4231 // do a full check for negative registers in the tail 4232 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4233 // ary1 already pointing to the right place 4234 jmpb(TAIL_START); 4235 4236 bind(BREAK_LOOP); 4237 // At least one byte in the last 64 byte block was negative. 4238 // Set up to look at the last 64 bytes as if they were a tail 4239 lea(ary1, Address(ary1, len, Address::times_1)); 4240 addptr(result, len); 4241 // Ignore the very last byte: if all others are positive, 4242 // it must be negative, so we can skip right to the 2+1 byte 4243 // end comparison at this point 4244 orl(result, 63); 4245 movl(len, 63); 4246 // Fallthru to tail compare 4247 } else { 4248 4249 if (UseAVX >= 2) { 4250 // With AVX2, use 32-byte vector compare 4251 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4252 4253 // Compare 32-byte vectors 4254 testl(len, 0xffffffe0); // vector count (in bytes) 4255 jccb(Assembler::zero, TAIL_START); 4256 4257 andl(len, 0xffffffe0); 4258 lea(ary1, Address(ary1, len, Address::times_1)); 4259 negptr(len); 4260 4261 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4262 movdl(vec2, tmp1); 4263 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4264 4265 bind(COMPARE_WIDE_VECTORS); 4266 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4267 vptest(vec1, vec2); 4268 jccb(Assembler::notZero, BREAK_LOOP); 4269 addptr(len, 32); 4270 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4271 4272 testl(result, 0x0000001f); // any bytes remaining? 4273 jcc(Assembler::zero, DONE); 4274 4275 // Quick test using the already prepared vector mask 4276 movl(len, result); 4277 andl(len, 0x0000001f); 4278 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4279 vptest(vec1, vec2); 4280 jcc(Assembler::zero, DONE); 4281 // There are zeros, jump to the tail to determine exactly where 4282 jmpb(TAIL_START); 4283 4284 bind(BREAK_LOOP); 4285 // At least one byte in the last 32-byte vector is negative. 4286 // Set up to look at the last 32 bytes as if they were a tail 4287 lea(ary1, Address(ary1, len, Address::times_1)); 4288 addptr(result, len); 4289 // Ignore the very last byte: if all others are positive, 4290 // it must be negative, so we can skip right to the 2+1 byte 4291 // end comparison at this point 4292 orl(result, 31); 4293 movl(len, 31); 4294 // Fallthru to tail compare 4295 } else if (UseSSE42Intrinsics) { 4296 // With SSE4.2, use double quad vector compare 4297 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4298 4299 // Compare 16-byte vectors 4300 testl(len, 0xfffffff0); // vector count (in bytes) 4301 jcc(Assembler::zero, TAIL_START); 4302 4303 andl(len, 0xfffffff0); 4304 lea(ary1, Address(ary1, len, Address::times_1)); 4305 negptr(len); 4306 4307 movl(tmp1, 0x80808080); 4308 movdl(vec2, tmp1); 4309 pshufd(vec2, vec2, 0); 4310 4311 bind(COMPARE_WIDE_VECTORS); 4312 movdqu(vec1, Address(ary1, len, Address::times_1)); 4313 ptest(vec1, vec2); 4314 jccb(Assembler::notZero, BREAK_LOOP); 4315 addptr(len, 16); 4316 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4317 4318 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4319 jcc(Assembler::zero, DONE); 4320 4321 // Quick test using the already prepared vector mask 4322 movl(len, result); 4323 andl(len, 0x0000000f); // tail count (in bytes) 4324 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4325 ptest(vec1, vec2); 4326 jcc(Assembler::zero, DONE); 4327 jmpb(TAIL_START); 4328 4329 bind(BREAK_LOOP); 4330 // At least one byte in the last 16-byte vector is negative. 4331 // Set up and look at the last 16 bytes as if they were a tail 4332 lea(ary1, Address(ary1, len, Address::times_1)); 4333 addptr(result, len); 4334 // Ignore the very last byte: if all others are positive, 4335 // it must be negative, so we can skip right to the 2+1 byte 4336 // end comparison at this point 4337 orl(result, 15); 4338 movl(len, 15); 4339 // Fallthru to tail compare 4340 } 4341 } 4342 4343 bind(TAIL_START); 4344 // Compare 4-byte vectors 4345 andl(len, 0xfffffffc); // vector count (in bytes) 4346 jccb(Assembler::zero, COMPARE_CHAR); 4347 4348 lea(ary1, Address(ary1, len, Address::times_1)); 4349 negptr(len); 4350 4351 bind(COMPARE_VECTORS); 4352 movl(tmp1, Address(ary1, len, Address::times_1)); 4353 andl(tmp1, 0x80808080); 4354 jccb(Assembler::notZero, TAIL_ADJUST); 4355 addptr(len, 4); 4356 jccb(Assembler::notZero, COMPARE_VECTORS); 4357 4358 // Compare trailing char (final 2-3 bytes), if any 4359 bind(COMPARE_CHAR); 4360 4361 testl(result, 0x2); // tail char 4362 jccb(Assembler::zero, COMPARE_BYTE); 4363 load_unsigned_short(tmp1, Address(ary1, 0)); 4364 andl(tmp1, 0x00008080); 4365 jccb(Assembler::notZero, CHAR_ADJUST); 4366 lea(ary1, Address(ary1, 2)); 4367 4368 bind(COMPARE_BYTE); 4369 testl(result, 0x1); // tail byte 4370 jccb(Assembler::zero, DONE); 4371 load_unsigned_byte(tmp1, Address(ary1, 0)); 4372 testl(tmp1, 0x00000080); 4373 jccb(Assembler::zero, DONE); 4374 subptr(result, 1); 4375 jmpb(DONE); 4376 4377 bind(TAIL_ADJUST); 4378 // there are negative bits in the last 4 byte block. 4379 // Adjust result and check the next three bytes 4380 addptr(result, len); 4381 orl(result, 3); 4382 lea(ary1, Address(ary1, len, Address::times_1)); 4383 jmpb(COMPARE_CHAR); 4384 4385 bind(CHAR_ADJUST); 4386 // We are looking at a char + optional byte tail, and found that one 4387 // of the bytes in the char is negative. Adjust the result, check the 4388 // first byte and readjust if needed. 4389 andl(result, 0xfffffffc); 4390 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4391 jccb(Assembler::notZero, DONE); 4392 addptr(result, 1); 4393 4394 // That's it 4395 bind(DONE); 4396 if (UseAVX >= 2) { 4397 // clean upper bits of YMM registers 4398 vpxor(vec1, vec1); 4399 vpxor(vec2, vec2); 4400 } 4401 } 4402 4403 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4404 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4405 Register limit, Register result, Register chr, 4406 XMMRegister vec1, XMMRegister vec2, bool is_char, 4407 KRegister mask, bool expand_ary2) { 4408 // for expand_ary2, limit is the (smaller) size of the second array. 4409 ShortBranchVerifier sbv(this); 4410 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4411 4412 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4413 "Expansion only implemented for AVX2"); 4414 4415 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4416 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4417 4418 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4419 int scaleIncr = expand_ary2 ? 8 : 16; 4420 4421 if (is_array_equ) { 4422 // Check the input args 4423 cmpoop(ary1, ary2); 4424 jcc(Assembler::equal, TRUE_LABEL); 4425 4426 // Need additional checks for arrays_equals. 4427 testptr(ary1, ary1); 4428 jcc(Assembler::zero, FALSE_LABEL); 4429 testptr(ary2, ary2); 4430 jcc(Assembler::zero, FALSE_LABEL); 4431 4432 // Check the lengths 4433 movl(limit, Address(ary1, length_offset)); 4434 cmpl(limit, Address(ary2, length_offset)); 4435 jcc(Assembler::notEqual, FALSE_LABEL); 4436 } 4437 4438 // count == 0 4439 testl(limit, limit); 4440 jcc(Assembler::zero, TRUE_LABEL); 4441 4442 if (is_array_equ) { 4443 // Load array address 4444 lea(ary1, Address(ary1, base_offset)); 4445 lea(ary2, Address(ary2, base_offset)); 4446 } 4447 4448 if (is_array_equ && is_char) { 4449 // arrays_equals when used for char[]. 4450 shll(limit, 1); // byte count != 0 4451 } 4452 movl(result, limit); // copy 4453 4454 if (UseAVX >= 2) { 4455 // With AVX2, use 32-byte vector compare 4456 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4457 4458 // Compare 32-byte vectors 4459 if (expand_ary2) { 4460 andl(result, 0x0000000f); // tail count (in bytes) 4461 andl(limit, 0xfffffff0); // vector count (in bytes) 4462 jcc(Assembler::zero, COMPARE_TAIL); 4463 } else { 4464 andl(result, 0x0000001f); // tail count (in bytes) 4465 andl(limit, 0xffffffe0); // vector count (in bytes) 4466 jcc(Assembler::zero, COMPARE_TAIL_16); 4467 } 4468 4469 lea(ary1, Address(ary1, limit, scaleFactor)); 4470 lea(ary2, Address(ary2, limit, Address::times_1)); 4471 negptr(limit); 4472 4473 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4474 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4475 4476 cmpl(limit, -64); 4477 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4478 4479 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4480 4481 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4482 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4483 kortestql(mask, mask); 4484 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4485 addptr(limit, 64); // update since we already compared at this addr 4486 cmpl(limit, -64); 4487 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4488 4489 // At this point we may still need to compare -limit+result bytes. 4490 // We could execute the next two instruction and just continue via non-wide path: 4491 // cmpl(limit, 0); 4492 // jcc(Assembler::equal, COMPARE_TAIL); // true 4493 // But since we stopped at the points ary{1,2}+limit which are 4494 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4495 // (|limit| <= 32 and result < 32), 4496 // we may just compare the last 64 bytes. 
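// That last 64-byte compare may overlap bytes the loop has already verified as
// equal; the overlap is harmless for an equality check.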
4497 // 4498 addptr(result, -64); // it is safe, bc we just came from this area 4499 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4500 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4501 kortestql(mask, mask); 4502 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4503 4504 jmp(TRUE_LABEL); 4505 4506 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4507 4508 }//if (VM_Version::supports_avx512vlbw()) 4509 4510 bind(COMPARE_WIDE_VECTORS); 4511 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4512 if (expand_ary2) { 4513 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4514 } else { 4515 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4516 } 4517 vpxor(vec1, vec2); 4518 4519 vptest(vec1, vec1); 4520 jcc(Assembler::notZero, FALSE_LABEL); 4521 addptr(limit, scaleIncr * 2); 4522 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4523 4524 testl(result, result); 4525 jcc(Assembler::zero, TRUE_LABEL); 4526 4527 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4528 if (expand_ary2) { 4529 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4530 } else { 4531 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4532 } 4533 vpxor(vec1, vec2); 4534 4535 vptest(vec1, vec1); 4536 jcc(Assembler::notZero, FALSE_LABEL); 4537 jmp(TRUE_LABEL); 4538 4539 bind(COMPARE_TAIL_16); // limit is zero 4540 movl(limit, result); 4541 4542 // Compare 16-byte chunks 4543 andl(result, 0x0000000f); // tail count (in bytes) 4544 andl(limit, 0xfffffff0); // vector count (in bytes) 4545 jcc(Assembler::zero, COMPARE_TAIL); 4546 4547 lea(ary1, Address(ary1, limit, scaleFactor)); 4548 lea(ary2, Address(ary2, limit, Address::times_1)); 4549 negptr(limit); 4550 4551 bind(COMPARE_WIDE_VECTORS_16); 4552 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4553 if (expand_ary2) { 4554 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4555 } else { 4556 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4557 } 4558 pxor(vec1, vec2); 4559 4560 ptest(vec1, vec1); 4561 jcc(Assembler::notZero, FALSE_LABEL); 4562 addptr(limit, scaleIncr); 4563 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4564 4565 bind(COMPARE_TAIL); // limit is zero 4566 movl(limit, result); 4567 // Fallthru to tail compare 4568 } else if (UseSSE42Intrinsics) { 4569 // With SSE4.2, use double quad vector compare 4570 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4571 4572 // Compare 16-byte vectors 4573 andl(result, 0x0000000f); // tail count (in bytes) 4574 andl(limit, 0xfffffff0); // vector count (in bytes) 4575 jcc(Assembler::zero, COMPARE_TAIL); 4576 4577 lea(ary1, Address(ary1, limit, Address::times_1)); 4578 lea(ary2, Address(ary2, limit, Address::times_1)); 4579 negptr(limit); 4580 4581 bind(COMPARE_WIDE_VECTORS); 4582 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4583 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4584 pxor(vec1, vec2); 4585 4586 ptest(vec1, vec1); 4587 jcc(Assembler::notZero, FALSE_LABEL); 4588 addptr(limit, 16); 4589 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4590 4591 testl(result, result); 4592 jcc(Assembler::zero, TRUE_LABEL); 4593 4594 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4595 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4596 pxor(vec1, vec2); 4597 4598 ptest(vec1, vec1); 4599 jccb(Assembler::notZero, FALSE_LABEL); 4600 jmpb(TRUE_LABEL); 4601 4602 bind(COMPARE_TAIL); // limit is zero 4603 movl(limit, 
result); 4604 // Fallthru to tail compare 4605 } 4606 4607 // Compare 4-byte vectors 4608 if (expand_ary2) { 4609 testl(result, result); 4610 jccb(Assembler::zero, TRUE_LABEL); 4611 } else { 4612 andl(limit, 0xfffffffc); // vector count (in bytes) 4613 jccb(Assembler::zero, COMPARE_CHAR); 4614 } 4615 4616 lea(ary1, Address(ary1, limit, scaleFactor)); 4617 lea(ary2, Address(ary2, limit, Address::times_1)); 4618 negptr(limit); 4619 4620 bind(COMPARE_VECTORS); 4621 if (expand_ary2) { 4622 // There are no "vector" operations for bytes to shorts 4623 movzbl(chr, Address(ary2, limit, Address::times_1)); 4624 cmpw(Address(ary1, limit, Address::times_2), chr); 4625 jccb(Assembler::notEqual, FALSE_LABEL); 4626 addptr(limit, 1); 4627 jcc(Assembler::notZero, COMPARE_VECTORS); 4628 jmp(TRUE_LABEL); 4629 } else { 4630 movl(chr, Address(ary1, limit, Address::times_1)); 4631 cmpl(chr, Address(ary2, limit, Address::times_1)); 4632 jccb(Assembler::notEqual, FALSE_LABEL); 4633 addptr(limit, 4); 4634 jcc(Assembler::notZero, COMPARE_VECTORS); 4635 } 4636 4637 // Compare trailing char (final 2 bytes), if any 4638 bind(COMPARE_CHAR); 4639 testl(result, 0x2); // tail char 4640 jccb(Assembler::zero, COMPARE_BYTE); 4641 load_unsigned_short(chr, Address(ary1, 0)); 4642 load_unsigned_short(limit, Address(ary2, 0)); 4643 cmpl(chr, limit); 4644 jccb(Assembler::notEqual, FALSE_LABEL); 4645 4646 if (is_array_equ && is_char) { 4647 bind(COMPARE_BYTE); 4648 } else { 4649 lea(ary1, Address(ary1, 2)); 4650 lea(ary2, Address(ary2, 2)); 4651 4652 bind(COMPARE_BYTE); 4653 testl(result, 0x1); // tail byte 4654 jccb(Assembler::zero, TRUE_LABEL); 4655 load_unsigned_byte(chr, Address(ary1, 0)); 4656 load_unsigned_byte(limit, Address(ary2, 0)); 4657 cmpl(chr, limit); 4658 jccb(Assembler::notEqual, FALSE_LABEL); 4659 } 4660 bind(TRUE_LABEL); 4661 movl(result, 1); // return true 4662 jmpb(DONE); 4663 4664 bind(FALSE_LABEL); 4665 xorl(result, result); // return false 4666 4667 // That's it 4668 bind(DONE); 4669 if (UseAVX >= 2) { 4670 // clean upper bits of YMM registers 4671 vpxor(vec1, vec1); 4672 vpxor(vec2, vec2); 4673 } 4674 } 4675 4676 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4677 #define __ masm. 4678 Register dst = stub.data<0>(); 4679 XMMRegister src = stub.data<1>(); 4680 address target = stub.data<2>(); 4681 __ bind(stub.entry()); 4682 __ subptr(rsp, 8); 4683 __ movdbl(Address(rsp), src); 4684 __ call(RuntimeAddress(target)); 4685 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte. 
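// The fixup stub is expected to replace the saved value at the top of the stack
// with the corrected integer result, so the pop below both loads the result into
// dst and restores rsp.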
4686 __ pop(dst); 4687 __ jmp(stub.continuation()); 4688 #undef __ 4689 } 4690 4691 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4692 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4693 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4694 4695 address slowpath_target; 4696 if (dst_bt == T_INT) { 4697 if (src_bt == T_FLOAT) { 4698 cvttss2sil(dst, src); 4699 cmpl(dst, 0x80000000); 4700 slowpath_target = StubRoutines::x86::f2i_fixup(); 4701 } else { 4702 cvttsd2sil(dst, src); 4703 cmpl(dst, 0x80000000); 4704 slowpath_target = StubRoutines::x86::d2i_fixup(); 4705 } 4706 } else { 4707 if (src_bt == T_FLOAT) { 4708 cvttss2siq(dst, src); 4709 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4710 slowpath_target = StubRoutines::x86::f2l_fixup(); 4711 } else { 4712 cvttsd2siq(dst, src); 4713 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4714 slowpath_target = StubRoutines::x86::d2l_fixup(); 4715 } 4716 } 4717 4718 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte. 4719 int max_size = 23 + (UseAPX ? 1 : 0); 4720 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4721 jcc(Assembler::equal, stub->entry()); 4722 bind(stub->continuation()); 4723 } 4724 4725 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4726 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4727 switch(ideal_opc) { 4728 case Op_LShiftVS: 4729 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4730 case Op_LShiftVI: 4731 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4732 case Op_LShiftVL: 4733 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4734 case Op_RShiftVS: 4735 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4736 case Op_RShiftVI: 4737 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4738 case Op_RShiftVL: 4739 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4740 case Op_URShiftVS: 4741 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4742 case Op_URShiftVI: 4743 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4744 case Op_URShiftVL: 4745 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4746 case Op_RotateRightV: 4747 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4748 case Op_RotateLeftV: 4749 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4750 default: 4751 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4752 break; 4753 } 4754 } 4755 4756 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4757 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4758 if (is_unsigned) { 4759 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4760 } else { 4761 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4762 } 4763 } 4764 4765 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4766 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4767 switch (elem_bt) { 4768 case T_BYTE: 4769 if (ideal_opc == Op_SaturatingAddV) { 4770 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4771 } else { 4772 
assert(ideal_opc == Op_SaturatingSubV, ""); 4773 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4774 } 4775 break; 4776 case T_SHORT: 4777 if (ideal_opc == Op_SaturatingAddV) { 4778 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4779 } else { 4780 assert(ideal_opc == Op_SaturatingSubV, ""); 4781 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4782 } 4783 break; 4784 default: 4785 fatal("Unsupported type %s", type2name(elem_bt)); 4786 break; 4787 } 4788 } 4789 4790 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4791 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4792 switch (elem_bt) { 4793 case T_BYTE: 4794 if (ideal_opc == Op_SaturatingAddV) { 4795 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4796 } else { 4797 assert(ideal_opc == Op_SaturatingSubV, ""); 4798 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4799 } 4800 break; 4801 case T_SHORT: 4802 if (ideal_opc == Op_SaturatingAddV) { 4803 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4804 } else { 4805 assert(ideal_opc == Op_SaturatingSubV, ""); 4806 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4807 } 4808 break; 4809 default: 4810 fatal("Unsupported type %s", type2name(elem_bt)); 4811 break; 4812 } 4813 } 4814 4815 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4816 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4817 if (is_unsigned) { 4818 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4819 } else { 4820 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4821 } 4822 } 4823 4824 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4825 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4826 switch (elem_bt) { 4827 case T_BYTE: 4828 if (ideal_opc == Op_SaturatingAddV) { 4829 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4830 } else { 4831 assert(ideal_opc == Op_SaturatingSubV, ""); 4832 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4833 } 4834 break; 4835 case T_SHORT: 4836 if (ideal_opc == Op_SaturatingAddV) { 4837 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4838 } else { 4839 assert(ideal_opc == Op_SaturatingSubV, ""); 4840 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4841 } 4842 break; 4843 default: 4844 fatal("Unsupported type %s", type2name(elem_bt)); 4845 break; 4846 } 4847 } 4848 4849 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4850 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4851 switch (elem_bt) { 4852 case T_BYTE: 4853 if (ideal_opc == Op_SaturatingAddV) { 4854 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4855 } else { 4856 assert(ideal_opc == Op_SaturatingSubV, ""); 4857 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4858 } 4859 break; 4860 case T_SHORT: 4861 if (ideal_opc == Op_SaturatingAddV) { 4862 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4863 } else { 4864 assert(ideal_opc == Op_SaturatingSubV, ""); 4865 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4866 } 4867 break; 4868 default: 4869 fatal("Unsupported type %s", type2name(elem_bt)); 4870 break; 4871 } 4872 } 4873 4874 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4875 XMMRegister src1, XMMRegister src2, 
bool merge, int vlen_enc, 4876 bool is_varshift) { 4877 switch (ideal_opc) { 4878 case Op_AddVB: 4879 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_AddVS: 4881 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_AddVI: 4883 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_AddVL: 4885 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_AddVF: 4887 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4888 case Op_AddVD: 4889 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4890 case Op_SubVB: 4891 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4892 case Op_SubVS: 4893 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4894 case Op_SubVI: 4895 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4896 case Op_SubVL: 4897 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4898 case Op_SubVF: 4899 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4900 case Op_SubVD: 4901 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4902 case Op_MulVS: 4903 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4904 case Op_MulVI: 4905 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4906 case Op_MulVL: 4907 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4908 case Op_MulVF: 4909 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4910 case Op_MulVD: 4911 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_DivVF: 4913 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_DivVD: 4915 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4916 case Op_SqrtVF: 4917 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4918 case Op_SqrtVD: 4919 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4920 case Op_AbsVB: 4921 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4922 case Op_AbsVS: 4923 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4924 case Op_AbsVI: 4925 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4926 case Op_AbsVL: 4927 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4928 case Op_FmaVF: 4929 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4930 case Op_FmaVD: 4931 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4932 case Op_VectorRearrange: 4933 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4934 case Op_LShiftVS: 4935 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4936 case Op_LShiftVI: 4937 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4938 case Op_LShiftVL: 4939 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4940 case Op_RShiftVS: 4941 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4942 case Op_RShiftVI: 4943 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4944 case Op_RShiftVL: 4945 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4946 case Op_URShiftVS: 4947 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4948 case Op_URShiftVI: 4949 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4950 case Op_URShiftVL: 4951 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4952 case Op_RotateLeftV: 4953 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4954 case Op_RotateRightV: 4955 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4956 case Op_MaxV: 4957 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4958 case Op_MinV: 4959 evpmins(eType, dst, mask, 
src1, src2, merge, vlen_enc); break; 4960 case Op_UMinV: 4961 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4962 case Op_UMaxV: 4963 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4964 case Op_XorV: 4965 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4966 case Op_OrV: 4967 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4968 case Op_AndV: 4969 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4970 default: 4971 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4972 break; 4973 } 4974 } 4975 4976 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4977 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4978 switch (ideal_opc) { 4979 case Op_AddVB: 4980 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4981 case Op_AddVS: 4982 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4983 case Op_AddVI: 4984 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4985 case Op_AddVL: 4986 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4987 case Op_AddVF: 4988 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4989 case Op_AddVD: 4990 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4991 case Op_SubVB: 4992 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4993 case Op_SubVS: 4994 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4995 case Op_SubVI: 4996 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4997 case Op_SubVL: 4998 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4999 case Op_SubVF: 5000 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 5001 case Op_SubVD: 5002 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 5003 case Op_MulVS: 5004 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 5005 case Op_MulVI: 5006 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 5007 case Op_MulVL: 5008 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 5009 case Op_MulVF: 5010 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 5011 case Op_MulVD: 5012 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 5013 case Op_DivVF: 5014 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 5015 case Op_DivVD: 5016 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 5017 case Op_FmaVF: 5018 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 5019 case Op_FmaVD: 5020 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 5021 case Op_MaxV: 5022 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5023 case Op_MinV: 5024 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5025 case Op_UMaxV: 5026 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5027 case Op_UMinV: 5028 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5029 case Op_XorV: 5030 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5031 case Op_OrV: 5032 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5033 case Op_AndV: 5034 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5035 default: 5036 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5037 break; 5038 } 5039 } 5040 5041 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5042 KRegister src1, KRegister src2) { 5043 BasicType etype = T_ILLEGAL; 5044 switch(mask_len) { 5045 case 2: 5046 case 4: 5047 case 8: etype = T_BYTE; break; 5048 case 16: etype = T_SHORT; break; 5049 case 32: etype = T_INT; break; 5050 case 64: etype = T_LONG; break; 
5051 default: fatal("Unsupported type"); break; 5052 } 5053 assert(etype != T_ILLEGAL, ""); 5054 switch(ideal_opc) { 5055 case Op_AndVMask: 5056 kand(etype, dst, src1, src2); break; 5057 case Op_OrVMask: 5058 kor(etype, dst, src1, src2); break; 5059 case Op_XorVMask: 5060 kxor(etype, dst, src1, src2); break; 5061 default: 5062 fatal("Unsupported masked operation"); break; 5063 } 5064 } 5065 5066 /* 5067 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5068 * If src is NaN, the result is 0. 5069 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5070 * the result is equal to the value of Integer.MIN_VALUE. 5071 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5072 * the result is equal to the value of Integer.MAX_VALUE. 5073 */ 5074 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5075 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5076 Register rscratch, AddressLiteral float_sign_flip, 5077 int vec_enc) { 5078 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5079 Label done; 5080 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5081 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5082 vptest(xtmp2, xtmp2, vec_enc); 5083 jccb(Assembler::equal, done); 5084 5085 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5086 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5087 5088 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5089 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5090 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5091 5092 // Recompute the mask for remaining special value. 5093 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5094 // Extract SRC values corresponding to TRUE mask lanes. 5095 vpand(xtmp4, xtmp2, src, vec_enc); 5096 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5097 // values are set. 
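  // (vblendvps selects per lane on the sign bit of its mask operand, so only lanes whose
  // MSB ends up set will take the max-int pattern from xtmp1 in the final blend below.)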
5098 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5099 5100 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5101 bind(done); 5102 } 5103 5104 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5105 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5106 Register rscratch, AddressLiteral float_sign_flip, 5107 int vec_enc) { 5108 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5109 Label done; 5110 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5111 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5112 kortestwl(ktmp1, ktmp1); 5113 jccb(Assembler::equal, done); 5114 5115 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5116 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5117 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5118 5119 kxorwl(ktmp1, ktmp1, ktmp2); 5120 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5121 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5122 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5123 bind(done); 5124 } 5125 5126 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5127 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5128 Register rscratch, AddressLiteral double_sign_flip, 5129 int vec_enc) { 5130 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5131 5132 Label done; 5133 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5134 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5135 kortestwl(ktmp1, ktmp1); 5136 jccb(Assembler::equal, done); 5137 5138 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5139 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5140 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5141 5142 kxorwl(ktmp1, ktmp1, ktmp2); 5143 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5144 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5145 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5146 bind(done); 5147 } 5148 5149 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5150 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5151 Register rscratch, AddressLiteral float_sign_flip, 5152 int vec_enc) { 5153 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5154 Label done; 5155 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5156 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5157 kortestwl(ktmp1, ktmp1); 5158 jccb(Assembler::equal, done); 5159 5160 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5161 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5162 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5163 5164 kxorwl(ktmp1, ktmp1, ktmp2); 5165 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5166 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5167 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5168 bind(done); 5169 } 5170 5171 /* 5172 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5173 * If src is NaN, the result is 0. 5174 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5175 * the result is equal to the value of Long.MIN_VALUE. 5176 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5177 * the result is equal to the value of Long.MAX_VALUE. 
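 * For example, under Java cast semantics: (long) Double.NaN == 0L,
 * (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE, and (long) 1.0e300 == Long.MAX_VALUE.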
5178 */ 5179 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5180 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5181 Register rscratch, AddressLiteral double_sign_flip, 5182 int vec_enc) { 5183 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5184 5185 Label done; 5186 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5187 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5188 kortestwl(ktmp1, ktmp1); 5189 jccb(Assembler::equal, done); 5190 5191 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5192 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5193 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5194 5195 kxorwl(ktmp1, ktmp1, ktmp2); 5196 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5197 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5198 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5199 bind(done); 5200 } 5201 5202 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5203 XMMRegister xtmp, int index, int vec_enc) { 5204 assert(vec_enc < Assembler::AVX_512bit, ""); 5205 if (vec_enc == Assembler::AVX_256bit) { 5206 vextractf128_high(xtmp, src); 5207 vshufps(dst, src, xtmp, index, vec_enc); 5208 } else { 5209 vshufps(dst, src, zero, index, vec_enc); 5210 } 5211 } 5212 5213 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5214 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5215 AddressLiteral float_sign_flip, int src_vec_enc) { 5216 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5217 5218 Label done; 5219 // Compare the destination lanes with float_sign_flip 5220 // value to get mask for all special values. 5221 movdqu(xtmp1, float_sign_flip, rscratch); 5222 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5223 ptest(xtmp2, xtmp2); 5224 jccb(Assembler::equal, done); 5225 5226 // Flip float_sign_flip to get max integer value. 5227 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5228 pxor(xtmp1, xtmp4); 5229 5230 // Set destination lanes corresponding to unordered source lanes as zero. 5231 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5232 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5233 5234 // Shuffle mask vector and pack the lower double word from each quadword lane. 5235 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5236 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5237 5238 // Recompute the mask for remaining special value. 5239 pxor(xtmp2, xtmp3); 5240 // Extract mask corresponding to non-negative source lanes. 5241 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5242 5243 // Shuffle mask vector and pack the lower double word from each quadword lane. 5244 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5245 pand(xtmp3, xtmp2); 5246 5247 // Replace destination lanes holding the special value (0x80000000) with max int 5248 // if the corresponding source lane holds a +ve value.
5249 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5250 bind(done); 5251 } 5252 5253 5254 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5255 XMMRegister xtmp, Register rscratch, int vec_enc) { 5256 switch(to_elem_bt) { 5257 case T_SHORT: 5258 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5259 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5260 vpackusdw(dst, dst, zero, vec_enc); 5261 if (vec_enc == Assembler::AVX_256bit) { 5262 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5263 } 5264 break; 5265 case T_BYTE: 5266 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5267 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5268 vpackusdw(dst, dst, zero, vec_enc); 5269 if (vec_enc == Assembler::AVX_256bit) { 5270 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5271 } 5272 vpackuswb(dst, dst, zero, vec_enc); 5273 break; 5274 default: assert(false, "%s", type2name(to_elem_bt)); 5275 } 5276 } 5277 5278 /* 5279 * Algorithm for vector D2L and F2I conversions:- 5280 * a) Perform vector D2L/F2I cast. 5281 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5282 * It signifies that source value could be any of the special floating point 5283 * values(NaN,-Inf,Inf,Max,-Min). 5284 * c) Set destination to zero if source is NaN value. 5285 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5286 */ 5287 5288 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5289 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5290 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5291 int to_elem_sz = type2aelembytes(to_elem_bt); 5292 assert(to_elem_sz <= 4, ""); 5293 vcvttps2dq(dst, src, vec_enc); 5294 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5295 if (to_elem_sz < 4) { 5296 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5297 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5298 } 5299 } 5300 5301 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5302 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5303 Register rscratch, int vec_enc) { 5304 int to_elem_sz = type2aelembytes(to_elem_bt); 5305 assert(to_elem_sz <= 4, ""); 5306 vcvttps2dq(dst, src, vec_enc); 5307 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5308 switch(to_elem_bt) { 5309 case T_INT: 5310 break; 5311 case T_SHORT: 5312 evpmovdw(dst, dst, vec_enc); 5313 break; 5314 case T_BYTE: 5315 evpmovdb(dst, dst, vec_enc); 5316 break; 5317 default: assert(false, "%s", type2name(to_elem_bt)); 5318 } 5319 } 5320 5321 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5322 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5323 Register rscratch, int vec_enc) { 5324 evcvttps2qq(dst, src, vec_enc); 5325 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5326 } 5327 5328 // Handling for downcasting from double to integer or sub-word types on AVX2. 5329 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5330 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5331 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5332 int to_elem_sz = type2aelembytes(to_elem_bt); 5333 assert(to_elem_sz < 8, ""); 5334 vcvttpd2dq(dst, src, vec_enc); 5335 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5336 float_sign_flip, vec_enc); 5337 if (to_elem_sz < 4) { 5338 // xtmp4 holds all zero lanes. 5339 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5340 } 5341 } 5342 5343 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5344 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5345 KRegister ktmp2, AddressLiteral sign_flip, 5346 Register rscratch, int vec_enc) { 5347 if (VM_Version::supports_avx512dq()) { 5348 evcvttpd2qq(dst, src, vec_enc); 5349 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5350 switch(to_elem_bt) { 5351 case T_LONG: 5352 break; 5353 case T_INT: 5354 evpmovsqd(dst, dst, vec_enc); 5355 break; 5356 case T_SHORT: 5357 evpmovsqd(dst, dst, vec_enc); 5358 evpmovdw(dst, dst, vec_enc); 5359 break; 5360 case T_BYTE: 5361 evpmovsqd(dst, dst, vec_enc); 5362 evpmovdb(dst, dst, vec_enc); 5363 break; 5364 default: assert(false, "%s", type2name(to_elem_bt)); 5365 } 5366 } else { 5367 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5368 vcvttpd2dq(dst, src, vec_enc); 5369 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5370 switch(to_elem_bt) { 5371 case T_INT: 5372 break; 5373 case T_SHORT: 5374 evpmovdw(dst, dst, vec_enc); 5375 break; 5376 case T_BYTE: 5377 evpmovdb(dst, dst, vec_enc); 5378 break; 5379 default: assert(false, "%s", type2name(to_elem_bt)); 5380 } 5381 } 5382 } 5383 5384 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5385 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5386 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5387 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5388 // and re-instantiate original MXCSR.RC mode after that. 5389 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5390 5391 mov64(tmp, julong_cast(0.5L)); 5392 evpbroadcastq(xtmp1, tmp, vec_enc); 5393 vaddpd(xtmp1, src , xtmp1, vec_enc); 5394 evcvtpd2qq(dst, xtmp1, vec_enc); 5395 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5396 double_sign_flip, vec_enc);; 5397 5398 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5399 } 5400 5401 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5402 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5403 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5404 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5405 // and re-instantiate original MXCSR.RC mode after that. 
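  // E.g. Math.round(1.2f) == 1: 1.2f + 0.5f == 1.7f and floor(1.7f) == 1, whereas a
  // conversion under the default round-to-nearest mode would have produced 2.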
5406 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5407 5408 movl(tmp, jint_cast(0.5)); 5409 movq(xtmp1, tmp); 5410 vbroadcastss(xtmp1, xtmp1, vec_enc); 5411 vaddps(xtmp1, src , xtmp1, vec_enc); 5412 vcvtps2dq(dst, xtmp1, vec_enc); 5413 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5414 float_sign_flip, vec_enc); 5415 5416 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5417 } 5418 5419 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5420 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5421 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5422 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5423 // and re-instantiate original MXCSR.RC mode after that. 5424 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5425 5426 movl(tmp, jint_cast(0.5)); 5427 movq(xtmp1, tmp); 5428 vbroadcastss(xtmp1, xtmp1, vec_enc); 5429 vaddps(xtmp1, src , xtmp1, vec_enc); 5430 vcvtps2dq(dst, xtmp1, vec_enc); 5431 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5432 5433 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5434 } 5435 5436 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5437 BasicType from_elem_bt, BasicType to_elem_bt) { 5438 switch (from_elem_bt) { 5439 case T_BYTE: 5440 switch (to_elem_bt) { 5441 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5442 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5443 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5444 default: ShouldNotReachHere(); 5445 } 5446 break; 5447 case T_SHORT: 5448 switch (to_elem_bt) { 5449 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5450 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5451 default: ShouldNotReachHere(); 5452 } 5453 break; 5454 case T_INT: 5455 assert(to_elem_bt == T_LONG, ""); 5456 vpmovzxdq(dst, src, vlen_enc); 5457 break; 5458 default: 5459 ShouldNotReachHere(); 5460 } 5461 } 5462 5463 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5464 BasicType from_elem_bt, BasicType to_elem_bt) { 5465 switch (from_elem_bt) { 5466 case T_BYTE: 5467 switch (to_elem_bt) { 5468 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5469 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5470 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5471 default: ShouldNotReachHere(); 5472 } 5473 break; 5474 case T_SHORT: 5475 switch (to_elem_bt) { 5476 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5477 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5478 default: ShouldNotReachHere(); 5479 } 5480 break; 5481 case T_INT: 5482 assert(to_elem_bt == T_LONG, ""); 5483 vpmovsxdq(dst, src, vlen_enc); 5484 break; 5485 default: 5486 ShouldNotReachHere(); 5487 } 5488 } 5489 5490 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5491 BasicType dst_bt, BasicType src_bt, int vlen) { 5492 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5493 assert(vlen_enc != AVX_512bit, ""); 5494 5495 int dst_bt_size = type2aelembytes(dst_bt); 5496 int src_bt_size = type2aelembytes(src_bt); 5497 if (dst_bt_size > src_bt_size) { 5498 switch (dst_bt_size / src_bt_size) { 5499 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5500 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5501 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5502 default: ShouldNotReachHere(); 5503 } 5504 } else { 5505 assert(dst_bt_size < src_bt_size, ""); 5506 switch (src_bt_size / dst_bt_size) { 5507 case 2: { 5508 if (vlen_enc == AVX_128bit) { 5509 vpacksswb(dst, src, src, vlen_enc); 5510 } else { 5511 vpacksswb(dst, src, src, vlen_enc); 5512 vpermq(dst, dst, 0x08, vlen_enc); 5513 } 5514 break; 5515 } 5516 case 4: { 5517 if (vlen_enc == AVX_128bit) { 5518 vpackssdw(dst, src, src, vlen_enc); 5519 vpacksswb(dst, dst, dst, vlen_enc); 5520 } else { 5521 vpackssdw(dst, src, src, vlen_enc); 5522 vpermq(dst, dst, 0x08, vlen_enc); 5523 vpacksswb(dst, dst, dst, AVX_128bit); 5524 } 5525 break; 5526 } 5527 case 8: { 5528 if (vlen_enc == AVX_128bit) { 5529 vpshufd(dst, src, 0x08, vlen_enc); 5530 vpackssdw(dst, dst, dst, vlen_enc); 5531 vpacksswb(dst, dst, dst, vlen_enc); 5532 } else { 5533 vpshufd(dst, src, 0x08, vlen_enc); 5534 vpermq(dst, dst, 0x08, vlen_enc); 5535 vpackssdw(dst, dst, dst, AVX_128bit); 5536 vpacksswb(dst, dst, dst, AVX_128bit); 5537 } 5538 break; 5539 } 5540 default: ShouldNotReachHere(); 5541 } 5542 } 5543 } 5544 5545 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5546 bool merge, BasicType bt, int vlen_enc) { 5547 if (bt == T_INT) { 5548 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5549 } else { 5550 assert(bt == T_LONG, ""); 5551 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5552 } 5553 } 5554 5555 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5556 bool merge, BasicType bt, int vlen_enc) { 5557 if (bt == T_INT) { 5558 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5559 } else { 5560 assert(bt == T_LONG, ""); 5561 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5562 } 5563 } 5564 5565 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5566 Register rtmp2, XMMRegister xtmp, int mask_len, 5567 int vec_enc) { 5568 int index = 0; 5569 int vindex = 0; 5570 mov64(rtmp1, 0x0101010101010101L); 5571 pdepq(rtmp1, src, rtmp1); 5572 if (mask_len > 8) { 5573 movq(rtmp2, src); 5574 vpxor(xtmp, xtmp, xtmp, vec_enc); 5575 movq(xtmp, rtmp1); 5576 } 5577 movq(dst, rtmp1); 5578 5579 mask_len -= 8; 5580 while (mask_len > 0) { 5581 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5582 index++; 5583 if ((index % 2) == 0) { 5584 pxor(xtmp, xtmp); 5585 } 5586 mov64(rtmp1, 0x0101010101010101L); 5587 shrq(rtmp2, 8); 5588 pdepq(rtmp1, rtmp2, rtmp1); 5589 pinsrq(xtmp, rtmp1, index % 2); 5590 vindex = index / 2; 5591 if (vindex) { 5592 // Write entire 16 byte vector when both 64 bit 5593 // lanes are update to save redundant instructions. 
5594 if (index % 2) { 5595 vinsertf128(dst, dst, xtmp, vindex); 5596 } 5597 } else { 5598 vmovdqu(dst, xtmp); 5599 } 5600 mask_len -= 8; 5601 } 5602 } 5603 5604 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5605 switch(opc) { 5606 case Op_VectorMaskTrueCount: 5607 popcntq(dst, tmp); 5608 break; 5609 case Op_VectorMaskLastTrue: 5610 if (VM_Version::supports_lzcnt()) { 5611 lzcntq(tmp, tmp); 5612 movl(dst, 63); 5613 subl(dst, tmp); 5614 } else { 5615 movl(dst, -1); 5616 bsrq(tmp, tmp); 5617 cmov32(Assembler::notZero, dst, tmp); 5618 } 5619 break; 5620 case Op_VectorMaskFirstTrue: 5621 if (VM_Version::supports_bmi1()) { 5622 if (masklen < 32) { 5623 orl(tmp, 1 << masklen); 5624 tzcntl(dst, tmp); 5625 } else if (masklen == 32) { 5626 tzcntl(dst, tmp); 5627 } else { 5628 assert(masklen == 64, ""); 5629 tzcntq(dst, tmp); 5630 } 5631 } else { 5632 if (masklen < 32) { 5633 orl(tmp, 1 << masklen); 5634 bsfl(dst, tmp); 5635 } else { 5636 assert(masklen == 32 || masklen == 64, ""); 5637 movl(dst, masklen); 5638 if (masklen == 32) { 5639 bsfl(tmp, tmp); 5640 } else { 5641 bsfq(tmp, tmp); 5642 } 5643 cmov32(Assembler::notZero, dst, tmp); 5644 } 5645 } 5646 break; 5647 case Op_VectorMaskToLong: 5648 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5649 break; 5650 default: assert(false, "Unhandled mask operation"); 5651 } 5652 } 5653 5654 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5655 int masklen, int masksize, int vec_enc) { 5656 assert(VM_Version::supports_popcnt(), ""); 5657 5658 if(VM_Version::supports_avx512bw()) { 5659 kmovql(tmp, mask); 5660 } else { 5661 assert(masklen <= 16, ""); 5662 kmovwl(tmp, mask); 5663 } 5664 5665 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5666 // operations needs to be clipped. 5667 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5668 andq(tmp, (1 << masklen) - 1); 5669 } 5670 5671 vector_mask_operation_helper(opc, dst, tmp, masklen); 5672 } 5673 5674 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5675 Register tmp, int masklen, BasicType bt, int vec_enc) { 5676 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5677 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5678 assert(VM_Version::supports_popcnt(), ""); 5679 5680 bool need_clip = false; 5681 switch(bt) { 5682 case T_BOOLEAN: 5683 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5684 vpxor(xtmp, xtmp, xtmp, vec_enc); 5685 vpsubb(xtmp, xtmp, mask, vec_enc); 5686 vpmovmskb(tmp, xtmp, vec_enc); 5687 need_clip = masklen < 16; 5688 break; 5689 case T_BYTE: 5690 vpmovmskb(tmp, mask, vec_enc); 5691 need_clip = masklen < 16; 5692 break; 5693 case T_SHORT: 5694 vpacksswb(xtmp, mask, mask, vec_enc); 5695 if (masklen >= 16) { 5696 vpermpd(xtmp, xtmp, 8, vec_enc); 5697 } 5698 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5699 need_clip = masklen < 16; 5700 break; 5701 case T_INT: 5702 case T_FLOAT: 5703 vmovmskps(tmp, mask, vec_enc); 5704 need_clip = masklen < 4; 5705 break; 5706 case T_LONG: 5707 case T_DOUBLE: 5708 vmovmskpd(tmp, mask, vec_enc); 5709 need_clip = masklen < 2; 5710 break; 5711 default: assert(false, "Unhandled type, %s", type2name(bt)); 5712 } 5713 5714 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5715 // operations needs to be clipped. 
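  // E.g. an 8-lane byte mask still materializes 16 bits via vpmovmskb; the bits beyond
  // masklen may reflect stale lanes of the wider register, hence the (1 << masklen) - 1 clip.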
5716 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5717 // need_clip implies masklen < 32 5718 andq(tmp, (1 << masklen) - 1); 5719 } 5720 5721 vector_mask_operation_helper(opc, dst, tmp, masklen); 5722 } 5723 5724 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5725 Register rtmp2, int mask_len) { 5726 kmov(rtmp1, src); 5727 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5728 mov64(rtmp2, -1L); 5729 pextq(rtmp2, rtmp2, rtmp1); 5730 kmov(dst, rtmp2); 5731 } 5732 5733 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5734 XMMRegister mask, Register rtmp, Register rscratch, 5735 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5736 int vec_enc) { 5737 assert(type2aelembytes(bt) >= 4, ""); 5738 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5739 address compress_perm_table = nullptr; 5740 address expand_perm_table = nullptr; 5741 if (type2aelembytes(bt) == 8) { 5742 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5743 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5744 vmovmskpd(rtmp, mask, vec_enc); 5745 } else { 5746 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5747 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5748 vmovmskps(rtmp, mask, vec_enc); 5749 } 5750 shlq(rtmp, 5); // for 32 byte permute row. 5751 if (opcode == Op_CompressV) { 5752 lea(rscratch, ExternalAddress(compress_perm_table)); 5753 } else { 5754 lea(rscratch, ExternalAddress(expand_perm_table)); 5755 } 5756 addptr(rtmp, rscratch); 5757 vmovdqu(permv, Address(rtmp)); 5758 vpermps(dst, permv, src, Assembler::AVX_256bit); 5759 vpxor(xtmp, xtmp, xtmp, vec_enc); 5760 // Blend the result with zero vector using permute mask, each column entry 5761 // in a permute table row contains either a valid permute index or a -1 (default) 5762 // value, this can potentially be used as a blending mask after 5763 // compressing/expanding the source vector lanes. 
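  // For example, for a 256-bit int compress with mask 0b00000101 the selected row is
  // expected to read {0, 2, -1, -1, -1, -1, -1, -1}: vpermps packs lanes 0 and 2 into
  // positions 0 and 1, and since the -1 entries have their sign bit set, the blend below
  // zeroes the remaining positions.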
5764 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5765 } 5766 5767 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5768 bool merge, BasicType bt, int vec_enc) { 5769 if (opcode == Op_CompressV) { 5770 switch(bt) { 5771 case T_BYTE: 5772 evpcompressb(dst, mask, src, merge, vec_enc); 5773 break; 5774 case T_CHAR: 5775 case T_SHORT: 5776 evpcompressw(dst, mask, src, merge, vec_enc); 5777 break; 5778 case T_INT: 5779 evpcompressd(dst, mask, src, merge, vec_enc); 5780 break; 5781 case T_FLOAT: 5782 evcompressps(dst, mask, src, merge, vec_enc); 5783 break; 5784 case T_LONG: 5785 evpcompressq(dst, mask, src, merge, vec_enc); 5786 break; 5787 case T_DOUBLE: 5788 evcompresspd(dst, mask, src, merge, vec_enc); 5789 break; 5790 default: 5791 fatal("Unsupported type %s", type2name(bt)); 5792 break; 5793 } 5794 } else { 5795 assert(opcode == Op_ExpandV, ""); 5796 switch(bt) { 5797 case T_BYTE: 5798 evpexpandb(dst, mask, src, merge, vec_enc); 5799 break; 5800 case T_CHAR: 5801 case T_SHORT: 5802 evpexpandw(dst, mask, src, merge, vec_enc); 5803 break; 5804 case T_INT: 5805 evpexpandd(dst, mask, src, merge, vec_enc); 5806 break; 5807 case T_FLOAT: 5808 evexpandps(dst, mask, src, merge, vec_enc); 5809 break; 5810 case T_LONG: 5811 evpexpandq(dst, mask, src, merge, vec_enc); 5812 break; 5813 case T_DOUBLE: 5814 evexpandpd(dst, mask, src, merge, vec_enc); 5815 break; 5816 default: 5817 fatal("Unsupported type %s", type2name(bt)); 5818 break; 5819 } 5820 } 5821 } 5822 5823 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5824 KRegister ktmp1, int vec_enc) { 5825 if (opcode == Op_SignumVD) { 5826 vsubpd(dst, zero, one, vec_enc); 5827 // if src < 0 ? -1 : 1 5828 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5829 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5830 // if src == NaN, -0.0 or 0.0 return src. 5831 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5832 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5833 } else { 5834 assert(opcode == Op_SignumVF, ""); 5835 vsubps(dst, zero, one, vec_enc); 5836 // if src < 0 ? -1 : 1 5837 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5838 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5839 // if src == NaN, -0.0 or 0.0 return src. 5840 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5841 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5842 } 5843 } 5844 5845 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5846 XMMRegister xtmp1, int vec_enc) { 5847 if (opcode == Op_SignumVD) { 5848 vsubpd(dst, zero, one, vec_enc); 5849 // if src < 0 ? -1 : 1 5850 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5851 // if src == NaN, -0.0 or 0.0 return src. 5852 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5853 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5854 } else { 5855 assert(opcode == Op_SignumVF, ""); 5856 vsubps(dst, zero, one, vec_enc); 5857 // if src < 0 ? -1 : 1 5858 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5859 // if src == NaN, -0.0 or 0.0 return src. 
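  // (Math.signum(v) returns v itself for NaN and for +/-0.0f, so those lanes are copied
  // straight from src; the EQ_UQ predicate below is true for both the ==0.0 and the
  // unordered/NaN case.)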
5860 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5861 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5862 } 5863 } 5864 5865 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5866 if (VM_Version::supports_avx512bw()) { 5867 if (mask_len > 32) { 5868 kmovql(dst, src); 5869 } else { 5870 kmovdl(dst, src); 5871 if (mask_len != 32) { 5872 kshiftrdl(dst, dst, 32 - mask_len); 5873 } 5874 } 5875 } else { 5876 assert(mask_len <= 16, ""); 5877 kmovwl(dst, src); 5878 if (mask_len != 16) { 5879 kshiftrwl(dst, dst, 16 - mask_len); 5880 } 5881 } 5882 } 5883 5884 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5885 int lane_size = type2aelembytes(bt); 5886 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5887 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5888 movptr(rtmp, imm32); 5889 switch(lane_size) { 5890 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5891 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5892 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5893 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5894 default : fatal("Unsupported lane size %d", lane_size); 5895 break; 5896 } 5897 } else { 5898 movptr(rtmp, imm32); 5899 movq(dst, rtmp); 5900 switch(lane_size) { 5901 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5902 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5903 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5904 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5905 default : fatal("Unsupported lane size %d", lane_size); 5906 break; 5907 } 5908 } 5909 } 5910 5911 // 5912 // Following is a lookup table based popcount computation algorithm: 5913 // Index Bit set count 5914 // [ 0000 -> 0, 5915 // 0001 -> 1, 5916 // 0010 -> 1, 5917 // 0011 -> 2, 5918 // 0100 -> 1, 5919 // 0101 -> 2, 5920 // 0110 -> 2, 5921 // 0111 -> 3, 5922 // 1000 -> 1, 5923 // 1001 -> 2, 5924 // 1010 -> 2, 5925 // 1011 -> 3, 5926 // 1100 -> 2, 5927 // 1101 -> 3, 1110 -> 3, 5928 // 1111 -> 4 ] 5929 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5930 // shuffle indices for lookup table access. 5931 // b. Right shift each byte of vector lane by 4 positions. 5932 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5933 // shuffle indices for lookup table access. 5934 // d. Add the bitset count of upper and lower 4 bits of each byte. 5935 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5936 // count of all the bytes of a quadword. 5937 // f. Perform step e. for upper 128bit vector lane. 5938 // g. Pack the bitset count of quadwords back to double word. 5939 // h. Unpacking and packing operations are not needed for 64bit vector lane.
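//
// Illustrative scalar equivalent of steps a-d (documentation only, not emitted code):
//   static const uint8_t LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) { return LUT[b & 0x0F] + LUT[b >> 4]; }
// The vector code below performs both table lookups with vpshufb and the final add with
// vpaddb across all byte lanes at once; the int/long variants then apply steps e-h
// (vpsadbw based) to accumulate the per-byte counts into wider lanes.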
5940 5941 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5942 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5943 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5944 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5945 vpsrlw(dst, src, 4, vec_enc); 5946 vpand(dst, dst, xtmp1, vec_enc); 5947 vpand(xtmp1, src, xtmp1, vec_enc); 5948 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5949 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5950 vpshufb(dst, xtmp2, dst, vec_enc); 5951 vpaddb(dst, dst, xtmp1, vec_enc); 5952 } 5953 5954 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5955 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5956 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5957 // Following code is as per steps e,f,g and h of above algorithm. 5958 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5959 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5960 vpsadbw(dst, dst, xtmp2, vec_enc); 5961 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5962 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5963 vpackuswb(dst, xtmp1, dst, vec_enc); 5964 } 5965 5966 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5967 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5968 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5969 // Add the popcount of upper and lower bytes of word. 5970 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5971 vpsrlw(dst, xtmp1, 8, vec_enc); 5972 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5973 vpaddw(dst, dst, xtmp1, vec_enc); 5974 } 5975 5976 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5977 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5978 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5979 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5980 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5981 } 5982 5983 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5984 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5985 switch(bt) { 5986 case T_LONG: 5987 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5988 break; 5989 case T_INT: 5990 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5991 break; 5992 case T_CHAR: 5993 case T_SHORT: 5994 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5995 break; 5996 case T_BYTE: 5997 case T_BOOLEAN: 5998 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5999 break; 6000 default: 6001 fatal("Unsupported type %s", type2name(bt)); 6002 break; 6003 } 6004 } 6005 6006 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6007 KRegister mask, bool merge, int vec_enc) { 6008 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6009 switch(bt) { 6010 case T_LONG: 6011 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6012 evpopcntq(dst, mask, src, merge, vec_enc); 6013 break; 6014 case T_INT: 6015 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6016 evpopcntd(dst, mask, src, merge, vec_enc); 6017 break; 6018 case T_CHAR: 6019 case T_SHORT: 6020 assert(VM_Version::supports_avx512_bitalg(), ""); 6021 evpopcntw(dst, mask, src, merge, vec_enc); 6022 break; 6023 case T_BYTE: 6024 case T_BOOLEAN: 6025 assert(VM_Version::supports_avx512_bitalg(), ""); 6026 evpopcntb(dst, mask, 
src, merge, vec_enc); 6027 break; 6028 default: 6029 fatal("Unsupported type %s", type2name(bt)); 6030 break; 6031 } 6032 } 6033 6034 // Bit reversal algorithm first reverses the bits of each byte followed by 6035 // a byte level reversal for multi-byte primitive types (short/int/long). 6036 // Algorithm performs a lookup table access to get reverse bit sequence 6037 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6038 // is obtained by swapping the reverse bit sequences of upper and lower 6039 // nibble of a byte. 6040 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6041 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6042 if (VM_Version::supports_avx512vlbw()) { 6043 6044 // Get the reverse bit sequence of lower nibble of each byte. 6045 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6046 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6047 evpandq(dst, xtmp2, src, vec_enc); 6048 vpshufb(dst, xtmp1, dst, vec_enc); 6049 vpsllq(dst, dst, 4, vec_enc); 6050 6051 // Get the reverse bit sequence of upper nibble of each byte. 6052 vpandn(xtmp2, xtmp2, src, vec_enc); 6053 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6054 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6055 6056 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6057 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6058 evporq(xtmp2, dst, xtmp2, vec_enc); 6059 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6060 6061 } else if(vec_enc == Assembler::AVX_512bit) { 6062 // Shift based bit reversal. 6063 assert(bt == T_LONG || bt == T_INT, ""); 6064 6065 // Swap lower and upper nibble of each byte. 6066 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6067 6068 // Swap two least and most significant bits of each nibble. 6069 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6070 6071 // Swap adjacent pair of bits. 6072 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6073 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6074 6075 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6076 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6077 } else { 6078 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6079 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6080 6081 // Get the reverse bit sequence of lower nibble of each byte. 6082 vpand(dst, xtmp2, src, vec_enc); 6083 vpshufb(dst, xtmp1, dst, vec_enc); 6084 vpsllq(dst, dst, 4, vec_enc); 6085 6086 // Get the reverse bit sequence of upper nibble of each byte. 6087 vpandn(xtmp2, xtmp2, src, vec_enc); 6088 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6089 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6090 6091 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6092 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6093 vpor(xtmp2, dst, xtmp2, vec_enc); 6094 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6095 } 6096 } 6097 6098 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6099 XMMRegister xtmp, Register rscratch) { 6100 assert(VM_Version::supports_gfni(), ""); 6101 assert(rscratch != noreg || always_reachable(mask), "missing"); 6102 6103 // Galois field instruction based bit reversal based on following algorithm. 
6104 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6105 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6106 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6107 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6108 } 6109 6110 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6111 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6112 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6113 evpandq(dst, xtmp1, src, vec_enc); 6114 vpsllq(dst, dst, nbits, vec_enc); 6115 vpandn(xtmp1, xtmp1, src, vec_enc); 6116 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6117 evporq(dst, dst, xtmp1, vec_enc); 6118 } 6119 6120 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6121 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6122 // Shift based bit reversal. 6123 assert(VM_Version::supports_evex(), ""); 6124 switch(bt) { 6125 case T_LONG: 6126 // Swap upper and lower double word of each quad word. 6127 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6128 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6129 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6130 break; 6131 case T_INT: 6132 // Swap upper and lower word of each double word. 6133 evprord(xtmp1, k0, src, 16, true, vec_enc); 6134 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6135 break; 6136 case T_CHAR: 6137 case T_SHORT: 6138 // Swap upper and lower byte of each word. 6139 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6140 break; 6141 case T_BYTE: 6142 evmovdquq(dst, k0, src, true, vec_enc); 6143 break; 6144 default: 6145 fatal("Unsupported type %s", type2name(bt)); 6146 break; 6147 } 6148 } 6149 6150 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6151 if (bt == T_BYTE) { 6152 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6153 evmovdquq(dst, k0, src, true, vec_enc); 6154 } else { 6155 vmovdqu(dst, src); 6156 } 6157 return; 6158 } 6159 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6160 // pre-computed shuffle indices. 
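  // Conceptually, for T_INT the shuffle mask reads {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}
  // per 128-bit lane, so vpshufb swaps the four bytes of every int element in place; the
  // short and long masks follow the same pattern at their respective element widths.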
6161 switch(bt) { 6162 case T_LONG: 6163 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6164 break; 6165 case T_INT: 6166 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6167 break; 6168 case T_CHAR: 6169 case T_SHORT: 6170 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6171 break; 6172 default: 6173 fatal("Unsupported type %s", type2name(bt)); 6174 break; 6175 } 6176 vpshufb(dst, src, dst, vec_enc); 6177 } 6178 6179 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6180 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6181 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6182 assert(is_integral_type(bt), ""); 6183 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6184 assert(VM_Version::supports_avx512cd(), ""); 6185 switch(bt) { 6186 case T_LONG: 6187 evplzcntq(dst, ktmp, src, merge, vec_enc); 6188 break; 6189 case T_INT: 6190 evplzcntd(dst, ktmp, src, merge, vec_enc); 6191 break; 6192 case T_SHORT: 6193 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6194 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6195 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6196 vpunpckhwd(dst, xtmp1, src, vec_enc); 6197 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6198 vpackusdw(dst, xtmp2, dst, vec_enc); 6199 break; 6200 case T_BYTE: 6201 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6202 // accessing the lookup table. 6203 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6204 // accessing the lookup table. 6205 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6206 assert(VM_Version::supports_avx512bw(), ""); 6207 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6208 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6209 vpand(xtmp2, dst, src, vec_enc); 6210 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6211 vpsrlw(xtmp3, src, 4, vec_enc); 6212 vpand(xtmp3, dst, xtmp3, vec_enc); 6213 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6214 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6215 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6216 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6217 break; 6218 default: 6219 fatal("Unsupported type %s", type2name(bt)); 6220 break; 6221 } 6222 } 6223 6224 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6225 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6226 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6227 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6228 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6229 // accessing the lookup table. 6230 vpand(dst, xtmp2, src, vec_enc); 6231 vpshufb(dst, xtmp1, dst, vec_enc); 6232 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6233 // accessing the lookup table. 6234 vpsrlw(xtmp3, src, 4, vec_enc); 6235 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6236 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6237 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
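  // E.g. for the byte 0x0A the high nibble is zero, so the result is the table entry for
  // 0 (= 4) plus the entry for 0xA (= 0), i.e. 4 leading zeros; for 0xA0 the high-nibble
  // count alone is kept.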
6238 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6239 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6240 vpaddb(dst, dst, xtmp2, vec_enc); 6241 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6242 } 6243 6244 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6245 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6246 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6247 // Add zero counts of lower byte and upper byte of a word if 6248 // upper byte holds a zero value. 6249 vpsrlw(xtmp3, src, 8, vec_enc); 6250 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6251 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6252 vpsllw(xtmp2, dst, 8, vec_enc); 6253 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6254 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6255 vpsrlw(dst, dst, 8, vec_enc); 6256 } 6257 6258 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6259 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6260 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6261 // hence biased exponent can be used to compute leading zero count as per 6262 // following formula:- 6263 // LZCNT = 31 - (biased_exp - 127) 6264 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6265 6266 // Broadcast 0xFF 6267 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6268 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6269 6270 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6271 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6272 // contributes to the leading number of zeros. 6273 vpsrld(xtmp2, src, 1, vec_enc); 6274 vpandn(xtmp3, xtmp2, src, vec_enc); 6275 6276 // Extract biased exponent. 6277 vcvtdq2ps(dst, xtmp3, vec_enc); 6278 vpsrld(dst, dst, 23, vec_enc); 6279 vpand(dst, dst, xtmp1, vec_enc); 6280 6281 // Broadcast 127. 6282 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6283 // Exponent = biased_exp - 127 6284 vpsubd(dst, dst, xtmp1, vec_enc); 6285 6286 // Exponent_plus_one = Exponent + 1 6287 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6288 vpaddd(dst, dst, xtmp3, vec_enc); 6289 6290 // Replace -ve exponent with zero, exponent is -ve when src 6291 // lane contains a zero value. 6292 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6293 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6294 6295 // Rematerialize broadcast 32. 6296 vpslld(xtmp1, xtmp3, 5, vec_enc); 6297 // Exponent is 32 if corresponding source lane contains max_int value. 6298 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6299 // LZCNT = 32 - exponent_plus_one 6300 vpsubd(dst, xtmp1, dst, vec_enc); 6301 6302 // Replace LZCNT with a value 1 if corresponding source lane 6303 // contains max_int value. 6304 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6305 6306 // Replace biased_exp with 0 if source lane value is less than zero. 6307 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6308 vblendvps(dst, dst, xtmp2, src, vec_enc); 6309 } 6310 6311 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6312 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6313 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6314 // Add zero counts of lower word and upper word of a double word if 6315 // upper word holds a zero value. 
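  // I.e. per 32-bit half: clz32(x) = (upper 16 bits == 0) ? 16 + clz16(lower 16 bits)
  //                                                       : clz16(upper 16 bits),
  // and the same fold is then repeated once more at doubleword granularity for the full
  // 64-bit lane.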
6316 vpsrld(xtmp3, src, 16, vec_enc); 6317 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6318 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6319 vpslld(xtmp2, dst, 16, vec_enc); 6320 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6321 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6322 vpsrld(dst, dst, 16, vec_enc); 6323 // Add zero counts of lower doubleword and upper doubleword of a 6324 // quadword if upper doubleword holds a zero value. 6325 vpsrlq(xtmp3, src, 32, vec_enc); 6326 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6327 vpsllq(xtmp2, dst, 32, vec_enc); 6328 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6329 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6330 vpsrlq(dst, dst, 32, vec_enc); 6331 } 6332 6333 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6334 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6335 Register rtmp, int vec_enc) { 6336 assert(is_integral_type(bt), "unexpected type"); 6337 assert(vec_enc < Assembler::AVX_512bit, ""); 6338 switch(bt) { 6339 case T_LONG: 6340 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6341 break; 6342 case T_INT: 6343 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6344 break; 6345 case T_SHORT: 6346 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6347 break; 6348 case T_BYTE: 6349 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6350 break; 6351 default: 6352 fatal("Unsupported type %s", type2name(bt)); 6353 break; 6354 } 6355 } 6356 6357 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6358 switch(bt) { 6359 case T_BYTE: 6360 vpsubb(dst, src1, src2, vec_enc); 6361 break; 6362 case T_SHORT: 6363 vpsubw(dst, src1, src2, vec_enc); 6364 break; 6365 case T_INT: 6366 vpsubd(dst, src1, src2, vec_enc); 6367 break; 6368 case T_LONG: 6369 vpsubq(dst, src1, src2, vec_enc); 6370 break; 6371 default: 6372 fatal("Unsupported type %s", type2name(bt)); 6373 break; 6374 } 6375 } 6376 6377 // Trailing zero count computation is based on leading zero count operation as per 6378 // following equation. All AVX3 targets support AVX512CD feature which offers 6379 // direct vector instruction to compute leading zero count. 
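// The identity below can be sanity-checked on a small value, e.g. for a 4-bit x = 0b0100:
// (x - 1) & ~x = 0b0011 & 0b1011 = 0b0011, CLZ = 2, so CTZ = 4 - 2 = 2 as expected.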
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6475 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6476 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6477 movl(rdx, rax); 6478 subl(rax, divisor); 6479 if (VM_Version::supports_bmi1()) { 6480 andnl(rax, rax, rdx); 6481 } else { 6482 notl(rax); 6483 andl(rax, rdx); 6484 } 6485 movl(tmp, rax); 6486 shrl(rax, 31); // quotient 6487 sarl(tmp, 31); 6488 andl(tmp, divisor); 6489 subl(rdx, tmp); // remainder 6490 bind(done); 6491 } 6492 6493 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6494 XMMRegister xtmp2, Register rtmp) { 6495 if(VM_Version::supports_gfni()) { 6496 // Galois field instruction based bit reversal based on following algorithm. 6497 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6498 mov64(rtmp, 0x8040201008040201L); 6499 movq(xtmp1, src); 6500 movq(xtmp2, rtmp); 6501 gf2p8affineqb(xtmp1, xtmp2, 0); 6502 movq(dst, xtmp1); 6503 } else { 6504 // Swap even and odd numbered bits. 6505 movl(rtmp, src); 6506 andl(rtmp, 0x55555555); 6507 shll(rtmp, 1); 6508 movl(dst, src); 6509 andl(dst, 0xAAAAAAAA); 6510 shrl(dst, 1); 6511 orl(dst, rtmp); 6512 6513 // Swap LSB and MSB 2 bits of each nibble. 6514 movl(rtmp, dst); 6515 andl(rtmp, 0x33333333); 6516 shll(rtmp, 2); 6517 andl(dst, 0xCCCCCCCC); 6518 shrl(dst, 2); 6519 orl(dst, rtmp); 6520 6521 // Swap LSB and MSB 4 bits of each byte. 6522 movl(rtmp, dst); 6523 andl(rtmp, 0x0F0F0F0F); 6524 shll(rtmp, 4); 6525 andl(dst, 0xF0F0F0F0); 6526 shrl(dst, 4); 6527 orl(dst, rtmp); 6528 } 6529 bswapl(dst); 6530 } 6531 6532 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6533 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6534 if(VM_Version::supports_gfni()) { 6535 // Galois field instruction based bit reversal based on following algorithm. 6536 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6537 mov64(rtmp1, 0x8040201008040201L); 6538 movq(xtmp1, src); 6539 movq(xtmp2, rtmp1); 6540 gf2p8affineqb(xtmp1, xtmp2, 0); 6541 movq(dst, xtmp1); 6542 } else { 6543 // Swap even and odd numbered bits. 6544 movq(rtmp1, src); 6545 mov64(rtmp2, 0x5555555555555555L); 6546 andq(rtmp1, rtmp2); 6547 shlq(rtmp1, 1); 6548 movq(dst, src); 6549 notq(rtmp2); 6550 andq(dst, rtmp2); 6551 shrq(dst, 1); 6552 orq(dst, rtmp1); 6553 6554 // Swap LSB and MSB 2 bits of each nibble. 6555 movq(rtmp1, dst); 6556 mov64(rtmp2, 0x3333333333333333L); 6557 andq(rtmp1, rtmp2); 6558 shlq(rtmp1, 2); 6559 notq(rtmp2); 6560 andq(dst, rtmp2); 6561 shrq(dst, 2); 6562 orq(dst, rtmp1); 6563 6564 // Swap LSB and MSB 4 bits of each byte. 
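  // At this point a byte such as 0x80 has already become 0x10 (0x80 -> 0x40 -> 0x10);
  // this final swap turns it into 0x01, and the closing bswap reverses byte order
  // across the quadword to complete the 64-bit bit reversal.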
6565 movq(rtmp1, dst); 6566 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6567 andq(rtmp1, rtmp2); 6568 shlq(rtmp1, 4); 6569 notq(rtmp2); 6570 andq(dst, rtmp2); 6571 shrq(dst, 4); 6572 orq(dst, rtmp1); 6573 } 6574 bswapq(dst); 6575 } 6576 6577 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6578 Label done; 6579 Label neg_divisor_fastpath; 6580 cmpq(divisor, 0); 6581 jccb(Assembler::less, neg_divisor_fastpath); 6582 xorl(rdx, rdx); 6583 divq(divisor); 6584 jmpb(done); 6585 bind(neg_divisor_fastpath); 6586 // Fastpath for divisor < 0: 6587 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6588 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6589 movq(rdx, rax); 6590 subq(rdx, divisor); 6591 if (VM_Version::supports_bmi1()) { 6592 andnq(rax, rdx, rax); 6593 } else { 6594 notq(rdx); 6595 andq(rax, rdx); 6596 } 6597 shrq(rax, 63); 6598 bind(done); 6599 } 6600 6601 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6602 Label done; 6603 Label neg_divisor_fastpath; 6604 cmpq(divisor, 0); 6605 jccb(Assembler::less, neg_divisor_fastpath); 6606 xorq(rdx, rdx); 6607 divq(divisor); 6608 jmp(done); 6609 bind(neg_divisor_fastpath); 6610 // Fastpath when divisor < 0: 6611 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6612 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6613 movq(rdx, rax); 6614 subq(rax, divisor); 6615 if (VM_Version::supports_bmi1()) { 6616 andnq(rax, rax, rdx); 6617 } else { 6618 notq(rax); 6619 andq(rax, rdx); 6620 } 6621 sarq(rax, 63); 6622 andq(rax, divisor); 6623 subq(rdx, rax); 6624 bind(done); 6625 } 6626 6627 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6628 Label done; 6629 Label neg_divisor_fastpath; 6630 cmpq(divisor, 0); 6631 jccb(Assembler::less, neg_divisor_fastpath); 6632 xorq(rdx, rdx); 6633 divq(divisor); 6634 jmp(done); 6635 bind(neg_divisor_fastpath); 6636 // Fastpath for divisor < 0: 6637 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6638 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6639 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6640 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6641 movq(rdx, rax); 6642 subq(rax, divisor); 6643 if (VM_Version::supports_bmi1()) { 6644 andnq(rax, rax, rdx); 6645 } else { 6646 notq(rax); 6647 andq(rax, rdx); 6648 } 6649 movq(tmp, rax); 6650 shrq(rax, 63); // quotient 6651 sarq(tmp, 63); 6652 andq(tmp, divisor); 6653 subq(rdx, tmp); // remainder 6654 bind(done); 6655 } 6656 6657 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6658 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6659 int vlen_enc) { 6660 assert(VM_Version::supports_avx512bw(), ""); 6661 // Byte shuffles are inlane operations and indices are determined using 6662 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6663 // normalized to index range 0-15. This makes sure that all the multiples 6664 // of an index value are placed at same relative position in 128 bit 6665 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6666 // will be 16th element in their respective 128 bit lanes. 
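  // For example, a shuffle index of 35 selects byte 35 & 0xF = 3 from whichever source
  // lane is broadcast while handling the 32-47 index range, i.e. byte 35 of the source.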
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to the true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with lane comparison expression INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with lane comparison expression INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with lane comparison expression INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation can occur.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for saturating
// unsigned addition:
// overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6824 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6825 // 6826 6827 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6828 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6829 // Res = Signed Add INP1, INP2 6830 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6831 // Compute T1 = INP1 | INP2 6832 vpor(xtmp3, src1, src2, vlen_enc); 6833 // T1 = Minimum signed value. 6834 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6835 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6836 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6837 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6838 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6839 // Compute overflow detection mask = Res<1> <s T1 6840 if (elem_bt == T_INT) { 6841 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6842 } else { 6843 assert(elem_bt == T_LONG, ""); 6844 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6845 } 6846 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6847 } 6848 6849 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6850 int vlen_enc, bool xtmp2_hold_M1) { 6851 if (VM_Version::supports_avx512dq()) { 6852 evpmovq2m(ktmp, src, vlen_enc); 6853 } else { 6854 assert(VM_Version::supports_evex(), ""); 6855 if (!xtmp2_hold_M1) { 6856 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6857 } 6858 evpsraq(xtmp1, src, 63, vlen_enc); 6859 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6860 } 6861 } 6862 6863 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6864 int vlen_enc, bool xtmp2_hold_M1) { 6865 if (VM_Version::supports_avx512dq()) { 6866 evpmovd2m(ktmp, src, vlen_enc); 6867 } else { 6868 assert(VM_Version::supports_evex(), ""); 6869 if (!xtmp2_hold_M1) { 6870 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6871 } 6872 vpsrad(xtmp1, src, 31, vlen_enc); 6873 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6874 } 6875 } 6876 6877 6878 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6879 if (elem_bt == T_LONG) { 6880 if (VM_Version::supports_evex()) { 6881 evpsraq(dst, src, 63, vlen_enc); 6882 } else { 6883 vpsrad(dst, src, 31, vlen_enc); 6884 vpshufd(dst, dst, 0xF5, vlen_enc); 6885 } 6886 } else { 6887 assert(elem_bt == T_INT, ""); 6888 vpsrad(dst, src, 31, vlen_enc); 6889 } 6890 } 6891 6892 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6893 if (compute_allones) { 6894 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6895 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6896 } else { 6897 vpcmpeqq(allones, allones, allones, vlen_enc); 6898 } 6899 } 6900 if (elem_bt == T_LONG) { 6901 vpsrlq(dst, allones, 1, vlen_enc); 6902 } else { 6903 assert(elem_bt == T_INT, ""); 6904 vpsrld(dst, allones, 1, vlen_enc); 6905 } 6906 } 6907 6908 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6909 if (compute_allones) { 6910 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6911 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6912 } else { 6913 vpcmpeqq(allones, allones, allones, vlen_enc); 6914 } 6915 } 6916 if (elem_bt == T_LONG) { 6917 vpsllq(dst, allones, 63, vlen_enc); 6918 } else { 6919 assert(elem_bt == T_INT, ""); 6920 
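    // With allones holding -1 in every lane (0xFFFFFFFF), shifting left by 31 leaves only
    // the sign bit set, i.e. 0x80000000 == Integer.MIN_VALUE per lane.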
    vpslld(dst, allones, 31, vlen_enc);
  }
}

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
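  // In other words, lanes where src1 is negative saturate towards MIN_VALUE on overflow,
  // while the remaining lanes saturate towards MAX_VALUE.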
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result polarity does not comply with the first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
7037 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7038 } 7039 7040 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7041 switch(elem_bt) { 7042 case T_BYTE: 7043 if (ideal_opc == Op_SaturatingAddV) { 7044 vpaddsb(dst, src1, src2, vlen_enc); 7045 } else { 7046 assert(ideal_opc == Op_SaturatingSubV, ""); 7047 vpsubsb(dst, src1, src2, vlen_enc); 7048 } 7049 break; 7050 case T_SHORT: 7051 if (ideal_opc == Op_SaturatingAddV) { 7052 vpaddsw(dst, src1, src2, vlen_enc); 7053 } else { 7054 assert(ideal_opc == Op_SaturatingSubV, ""); 7055 vpsubsw(dst, src1, src2, vlen_enc); 7056 } 7057 break; 7058 default: 7059 fatal("Unsupported type %s", type2name(elem_bt)); 7060 break; 7061 } 7062 } 7063 7064 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7065 switch(elem_bt) { 7066 case T_BYTE: 7067 if (ideal_opc == Op_SaturatingAddV) { 7068 vpaddusb(dst, src1, src2, vlen_enc); 7069 } else { 7070 assert(ideal_opc == Op_SaturatingSubV, ""); 7071 vpsubusb(dst, src1, src2, vlen_enc); 7072 } 7073 break; 7074 case T_SHORT: 7075 if (ideal_opc == Op_SaturatingAddV) { 7076 vpaddusw(dst, src1, src2, vlen_enc); 7077 } else { 7078 assert(ideal_opc == Op_SaturatingSubV, ""); 7079 vpsubusw(dst, src1, src2, vlen_enc); 7080 } 7081 break; 7082 default: 7083 fatal("Unsupported type %s", type2name(elem_bt)); 7084 break; 7085 } 7086 } 7087 7088 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7089 XMMRegister src2, int vlen_enc) { 7090 switch(elem_bt) { 7091 case T_BYTE: 7092 evpermi2b(dst, src1, src2, vlen_enc); 7093 break; 7094 case T_SHORT: 7095 evpermi2w(dst, src1, src2, vlen_enc); 7096 break; 7097 case T_INT: 7098 evpermi2d(dst, src1, src2, vlen_enc); 7099 break; 7100 case T_LONG: 7101 evpermi2q(dst, src1, src2, vlen_enc); 7102 break; 7103 case T_FLOAT: 7104 evpermi2ps(dst, src1, src2, vlen_enc); 7105 break; 7106 case T_DOUBLE: 7107 evpermi2pd(dst, src1, src2, vlen_enc); 7108 break; 7109 default: 7110 fatal("Unsupported type %s", type2name(elem_bt)); 7111 break; 7112 } 7113 } 7114 7115 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7116 if (is_unsigned) { 7117 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7118 } else { 7119 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7120 } 7121 } 7122 7123 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7124 if (is_unsigned) { 7125 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7126 } else { 7127 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7128 } 7129 } 7130 7131 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 7132 switch(opcode) { 7133 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7134 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7135 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7136 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7137 default: assert(false, "%s", NodeClassNames[opcode]); break; 7138 } 7139 } 7140 7141 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
                                 XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a +ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a -ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
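// Illustrative sanity check of the max path above: for max(-0.0, +0.0) the sign-based swap
// places +0.0 in the second source, so evmaxph returns +0.0 as required; for a NaN input the
// NaN is either returned directly by evmaxph (when it lands in the second source) or forwarded
// by the final masked move (when it lands in xtmp1), so a NaN always propagates to the result.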