1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 53 if (C->clinit_barrier_on_entry()) { 54 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 56 57 Label L_skip_barrier; 58 Register klass = rscratch1; 59 60 mov_metadata(klass, C->method()->holder()->constant_encoding()); 61 clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 62 63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 64 65 bind(L_skip_barrier); 66 } 67 68 int framesize = C->output()->frame_size_in_bytes(); 69 int bangsize = C->output()->bang_size_in_bytes(); 70 bool fp_mode_24b = false; 71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 72 73 // WARNING: Initial instruction MUST be 5 bytes or longer so that 74 // NativeJump::patch_verified_entry will be able to patch out the entry 75 // code safely. The push to verify stack depth is ok at 5 bytes, 76 // the frame allocation can be either 3 or 6 bytes. So if we don't do 77 // stack bang then we must use the 6 byte frame allocation even if 78 // we have no frame. :-( 79 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 80 81 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 82 // Remove word for return addr 83 framesize -= wordSize; 84 stack_bang_size -= wordSize; 85 86 // Calls to C2R adapters often do not accept exceptional returns. 
87 // We require that their callers must bang for them. But be careful, because 88 // some VM calls (such as call site linkage) can use several kilobytes of 89 // stack. But the stack safety zone should account for that. 90 // See bugs 4446381, 4468289, 4497237. 91 if (stack_bang_size > 0) { 92 generate_stack_overflow_check(stack_bang_size); 93 94 // We always push rbp, so that on return to interpreter rbp, will be 95 // restored correctly and we can correct the stack. 96 push(rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 mov(rbp, rsp); 100 } 101 // Remove word for ebp 102 framesize -= wordSize; 103 104 // Create frame 105 if (framesize) { 106 subptr(rsp, framesize); 107 } 108 } else { 109 // Create frame (force generation of a 4 byte immediate value) 110 subptr_imm32(rsp, framesize); 111 112 // Save RBP register now. 113 framesize -= wordSize; 114 movptr(Address(rsp, framesize), rbp); 115 // Save caller's stack pointer into RBP if the frame pointer is preserved. 116 if (PreserveFramePointer) { 117 movptr(rbp, rsp); 118 if (framesize > 0) { 119 addptr(rbp, framesize); 120 } 121 } 122 } 123 124 if (C->needs_stack_repair()) { 125 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 126 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 127 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 128 } 129 130 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 131 framesize -= wordSize; 132 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 133 } 134 135 #ifndef _LP64 136 // If method sets FPU control word do it now 137 if (fp_mode_24b) { 138 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 139 } 140 if (UseSSE >= 2 && VerifyFPU) { 141 verify_FPU(0, "FPU stack must be clean on entry"); 142 } 143 #endif 144 145 #ifdef ASSERT 146 if (VerifyStackAtCalls) { 147 Label L; 148 push(rax); 149 mov(rax, rsp); 150 andptr(rax, StackAlignmentInBytes-1); 151 cmpptr(rax, StackAlignmentInBytes-wordSize); 152 pop(rax); 153 jcc(Assembler::equal, L); 154 STOP("Stack is not properly aligned!"); 155 bind(L); 156 } 157 #endif 158 } 159 160 void C2_MacroAssembler::entry_barrier() { 161 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 162 #ifdef _LP64 163 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 164 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 165 Label dummy_slow_path; 166 Label dummy_continuation; 167 Label* slow_path = &dummy_slow_path; 168 Label* continuation = &dummy_continuation; 169 if (!Compile::current()->output()->in_scratch_emit_size()) { 170 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 171 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 172 Compile::current()->output()->add_stub(stub); 173 slow_path = &stub->entry(); 174 continuation = &stub->continuation(); 175 } 176 bs->nmethod_entry_barrier(this, slow_path, continuation); 177 } 178 #else 179 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
249 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 250 // 251 // * Intrinsify notify() and notifyAll() for the common cases where the 252 // object is locked by the calling thread but the waitlist is empty. 253 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 254 // 255 // * use jccb and jmpb instead of jcc and jmp to improve code density. 256 // But beware of excessive branch density on AMD Opterons. 257 // 258 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 259 // or failure of the fast path. If the fast path fails then we pass 260 // control to the slow path, typically in C. In fast_lock and 261 // fast_unlock we often branch to DONE_LABEL, just to find that C2 262 // will emit a conditional branch immediately after the node. 263 // So we have branches to branches and lots of ICC.ZF games. 264 // Instead, it might be better to have C2 pass a "FailureLabel" 265 // into fast_lock and fast_unlock. In the case of success, control 266 // will drop through the node. ICC.ZF is undefined at exit. 267 // In the case of failure, the node will branch directly to the 268 // FailureLabel 269 270 271 // obj: object to lock 272 // box: on-stack box address (displaced header location) - KILLED 273 // rax,: tmp -- KILLED 274 // scr: tmp -- KILLED 275 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 276 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 277 Metadata* method_data) { 278 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 279 // Ensure the register assignments are disjoint 280 assert(tmpReg == rax, ""); 281 assert(cx1Reg == noreg, ""); 282 assert(cx2Reg == noreg, ""); 283 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 284 285 // Possible cases that we'll encounter in fast_lock 286 // ------------------------------------------------ 287 // * Inflated 288 // -- unlocked 289 // -- Locked 290 // = by self 291 // = by other 292 // * neutral 293 // * stack-locked 294 // -- by self 295 // = sp-proximity test hits 296 // = sp-proximity test generates false-negative 297 // -- by other 298 // 299 300 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 301 302 if (DiagnoseSyncOnValueBasedClasses != 0) { 303 load_klass(tmpReg, objReg, scrReg); 304 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 305 jcc(Assembler::notZero, DONE_LABEL); 306 } 307 308 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 309 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 310 jcc(Assembler::notZero, IsInflated); 311 312 if (LockingMode == LM_MONITOR) { 313 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 314 testptr(objReg, objReg); 315 } else { 316 assert(LockingMode == LM_LEGACY, "must be"); 317 // Attempt stack-locking ... 318 orptr (tmpReg, markWord::unlocked_value); 319 if (EnableValhalla) { 320 // Mask inline_type bit such that we go to the slow path if object is an inline type 321 andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place)); 322 } 323 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 324 lock(); 325 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 326 jcc(Assembler::equal, COUNT); // Success 327 328 // Recursive locking. 329 // The object is stack-locked: markword contains stack pointer to BasicLock. 
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
#else
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                  // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
#ifdef _LP64
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
#endif
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
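//
// To make I1/I2 concrete, a small illustrative Java-level sketch (not part of this
// file; the names A, B and O are just the ones used in the example above):
//
//   void A() {              // provably balanced: compiled, uses fast_lock/fast_unlock
//     synchronized (O) {    // C2 emits fast_lock(O) here
//       B();                // B() is not provably balanced, so it runs in the interpreter;
//                           // any locks it takes sit on the interpreter's per-frame lock
//                           // list and are released at return (property I1)
//     }                     // C2 emits fast_unlock(O) here; by I1 and I2, O is still
//                           // locked by A() at this point
//   }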
408 // 409 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 410 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 411 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 412 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 413 // Arguably given that the spec legislates the JNI case as undefined our implementation 414 // could reasonably *avoid* checking owner in fast_unlock(). 415 // In the interest of performance we elide m->Owner==Self check in unlock. 416 // A perfectly viable alternative is to elide the owner check except when 417 // Xcheck:jni is enabled. 418 419 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 420 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 421 assert(boxReg == rax, ""); 422 assert_different_registers(objReg, boxReg, tmpReg); 423 424 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 425 426 if (LockingMode == LM_LEGACY) { 427 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 428 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 429 } 430 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 431 if (LockingMode != LM_MONITOR) { 432 testptr(tmpReg, markWord::monitor_value); // Inflated? 433 jcc(Assembler::zero, Stacked); 434 } 435 436 // It's inflated. 437 438 #ifndef _LP64 439 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 440 orl(boxReg, 1); // set ICC.ZF=0 to indicate failure 441 jmpb(DONE_LABEL); 442 #else 443 // Despite our balanced locking property we still check that m->_owner == Self 444 // as java routines or native JNI code called by this thread might 445 // have released the lock. 446 // Refer to the comments in synchronizer.cpp for how we might encode extra 447 // state in _succ so we can avoid fetching EntryList|cxq. 448 // 449 // If there's no contention try a 1-0 exit. That is, exit without 450 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 451 // we detect and recover from the race that the 1-0 exit admits. 452 // 453 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 454 // before it STs null into _owner, releasing the lock. Updates 455 // to data protected by the critical section must be visible before 456 // we drop the lock (and thus before any other thread could acquire 457 // the lock and observe the fields protected by the lock). 458 // IA32's memory-model is SPO, so STs are ordered with respect to 459 // each other and there's no need for an explicit barrier (fence). 460 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 461 Label LSuccess, LNotRecursive; 462 463 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 464 jccb(Assembler::equal, LNotRecursive); 465 466 // Recursive inflated unlock 467 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 468 jmpb(LSuccess); 469 470 bind(LNotRecursive); 471 472 // Set owner to null. 473 // Release to satisfy the JMM 474 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 475 // We need a full fence after clearing owner to avoid stranding. 476 // StoreLoad achieves this. 477 membar(StoreLoad); 478 479 // Check if the entry lists are empty (EntryList first - by convention). 
480 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 481 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 482 jccb(Assembler::zero, LSuccess); // If so we are done. 483 484 // Check if there is a successor. 485 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 486 jccb(Assembler::notZero, LSuccess); // If so we are done. 487 488 // Save the monitor pointer in the current thread, so we can try to 489 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 490 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 491 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 492 493 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 494 jmpb (DONE_LABEL); 495 496 bind (LSuccess); 497 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 498 jmpb (DONE_LABEL); 499 #endif // _LP64 500 501 if (LockingMode == LM_LEGACY) { 502 bind (Stacked); 503 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 504 lock(); 505 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 506 // Intentional fall-thru into DONE_LABEL 507 } 508 509 bind(DONE_LABEL); 510 511 // ZFlag == 1 count in fast path 512 // ZFlag == 0 count in slow path 513 jccb(Assembler::notZero, NO_COUNT); 514 515 bind(COUNT); 516 517 if (LockingMode == LM_LEGACY) { 518 // Count monitors in fast path 519 #ifdef _LP64 520 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 521 #endif 522 } 523 524 xorl(tmpReg, tmpReg); // Set ZF == 1 525 526 bind(NO_COUNT); 527 } 528 529 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 530 Register t, Register thread) { 531 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 532 assert(rax_reg == rax, "Used for CAS"); 533 assert_different_registers(obj, box, rax_reg, t, thread); 534 535 // Handle inflated monitor. 536 Label inflated; 537 // Finish fast lock successfully. ZF value is irrelevant. 538 Label locked; 539 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 540 Label slow_path; 541 542 if (UseObjectMonitorTable) { 543 // Clear cache in case fast locking succeeds. 544 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 545 } 546 547 if (DiagnoseSyncOnValueBasedClasses != 0) { 548 load_klass(rax_reg, obj, t); 549 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 550 jcc(Assembler::notZero, slow_path); 551 } 552 553 const Register mark = t; 554 555 { // Lightweight Lock 556 557 Label push; 558 559 const Register top = UseObjectMonitorTable ? rax_reg : box; 560 561 // Load the mark. 562 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 563 564 // Prefetch top. 565 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 566 567 // Check for monitor (0b10). 568 testptr(mark, markWord::monitor_value); 569 jcc(Assembler::notZero, inflated); 570 571 // Check if lock-stack is full. 572 cmpl(top, LockStack::end_offset() - 1); 573 jcc(Assembler::greater, slow_path); 574 575 // Check if recursive. 576 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 577 jccb(Assembler::equal, push); 578 579 // Try to lock. 
Transition lock bits 0b01 => 0b00 580 movptr(rax_reg, mark); 581 orptr(rax_reg, markWord::unlocked_value); 582 andptr(mark, ~(int32_t)markWord::unlocked_value); 583 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 584 jcc(Assembler::notEqual, slow_path); 585 586 if (UseObjectMonitorTable) { 587 // Need to reload top, clobbered by CAS. 588 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 589 } 590 bind(push); 591 // After successful lock, push object on lock-stack. 592 movptr(Address(thread, top), obj); 593 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 594 jmpb(locked); 595 } 596 597 { // Handle inflated monitor. 598 bind(inflated); 599 600 #ifndef _LP64 601 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 602 orl(box, 1); // set ICC.ZF=0 to indicate failure 603 jmpb(slow_path); 604 #else 605 const Register monitor = t; 606 607 if (!UseObjectMonitorTable) { 608 assert(mark == monitor, "should be the same here"); 609 } else { 610 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 611 // Fetch ObjectMonitor* from the cache or take the slow-path. 612 Label monitor_found; 613 614 // Load cache address 615 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 616 617 const int num_unrolled = 2; 618 for (int i = 0; i < num_unrolled; i++) { 619 cmpptr(obj, Address(t)); 620 jccb(Assembler::equal, monitor_found); 621 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 622 } 623 624 Label loop; 625 626 // Search for obj in cache. 627 bind(loop); 628 629 // Check for match. 630 cmpptr(obj, Address(t)); 631 jccb(Assembler::equal, monitor_found); 632 633 // Search until null encountered, guaranteed _null_sentinel at end. 634 cmpptr(Address(t), 1); 635 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 636 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 637 jmpb(loop); 638 639 // Cache hit. 640 bind(monitor_found); 641 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 642 } 643 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 644 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 645 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 646 647 Label monitor_locked; 648 // Lock the monitor. 649 650 if (UseObjectMonitorTable) { 651 // Cache the monitor for unlock before trashing box. On failure to acquire 652 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 653 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 654 } 655 656 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 657 xorptr(rax_reg, rax_reg); 658 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 659 lock(); cmpxchgptr(box, owner_address); 660 jccb(Assembler::equal, monitor_locked); 661 662 // Check if recursive. 663 cmpptr(box, rax_reg); 664 jccb(Assembler::notEqual, slow_path); 665 666 // Recursive. 667 increment(recursions_address); 668 669 bind(monitor_locked); 670 #endif // _LP64 671 } 672 673 bind(locked); 674 // Set ZF = 1 675 xorl(rax_reg, rax_reg); 676 677 #ifdef ASSERT 678 // Check that locked label is reached with ZF set. 
679 Label zf_correct; 680 Label zf_bad_zero; 681 jcc(Assembler::zero, zf_correct); 682 jmp(zf_bad_zero); 683 #endif 684 685 bind(slow_path); 686 #ifdef ASSERT 687 // Check that slow_path label is reached with ZF not set. 688 jcc(Assembler::notZero, zf_correct); 689 stop("Fast Lock ZF != 0"); 690 bind(zf_bad_zero); 691 stop("Fast Lock ZF != 1"); 692 bind(zf_correct); 693 #endif 694 // C2 uses the value of ZF to determine the continuation. 695 } 696 697 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 698 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 699 assert(reg_rax == rax, "Used for CAS"); 700 assert_different_registers(obj, reg_rax, t); 701 702 // Handle inflated monitor. 703 Label inflated, inflated_check_lock_stack; 704 // Finish fast unlock successfully. MUST jump with ZF == 1 705 Label unlocked, slow_path; 706 707 const Register mark = t; 708 const Register monitor = t; 709 const Register top = UseObjectMonitorTable ? t : reg_rax; 710 const Register box = reg_rax; 711 712 Label dummy; 713 C2FastUnlockLightweightStub* stub = nullptr; 714 715 if (!Compile::current()->output()->in_scratch_emit_size()) { 716 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 717 Compile::current()->output()->add_stub(stub); 718 } 719 720 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 721 722 { // Lightweight Unlock 723 724 // Load top. 725 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 726 727 if (!UseObjectMonitorTable) { 728 // Prefetch mark. 729 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 730 } 731 732 // Check if obj is top of lock-stack. 733 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 734 // Top of lock stack was not obj. Must be monitor. 735 jcc(Assembler::notEqual, inflated_check_lock_stack); 736 737 // Pop lock-stack. 738 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 739 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 740 741 // Check if recursive. 742 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 743 jcc(Assembler::equal, unlocked); 744 745 // We elide the monitor check, let the CAS fail instead. 746 747 if (UseObjectMonitorTable) { 748 // Load mark. 749 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 750 } 751 752 // Try to unlock. Transition lock bits 0b00 => 0b01 753 movptr(reg_rax, mark); 754 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 755 orptr(mark, markWord::unlocked_value); 756 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 757 jcc(Assembler::notEqual, push_and_slow_path); 758 jmp(unlocked); 759 } 760 761 762 { // Handle inflated monitor. 763 bind(inflated_check_lock_stack); 764 #ifdef ASSERT 765 Label check_done; 766 subl(top, oopSize); 767 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 768 jcc(Assembler::below, check_done); 769 cmpptr(obj, Address(thread, top)); 770 jccb(Assembler::notEqual, inflated_check_lock_stack); 771 stop("Fast Unlock lock on stack"); 772 bind(check_done); 773 if (UseObjectMonitorTable) { 774 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 775 } 776 testptr(mark, markWord::monitor_value); 777 jccb(Assembler::notZero, inflated); 778 stop("Fast Unlock not monitor"); 779 #endif 780 781 bind(inflated); 782 783 #ifndef _LP64 784 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 
785 orl(t, 1); // set ICC.ZF=0 to indicate failure 786 jmpb(slow_path); 787 #else 788 if (!UseObjectMonitorTable) { 789 assert(mark == monitor, "should be the same here"); 790 } else { 791 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 792 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 793 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 794 cmpptr(monitor, alignof(ObjectMonitor*)); 795 jcc(Assembler::below, slow_path); 796 } 797 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 798 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 799 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 800 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 801 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 802 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 803 804 Label recursive; 805 806 // Check if recursive. 807 cmpptr(recursions_address, 0); 808 jccb(Assembler::notZero, recursive); 809 810 // Set owner to null. 811 // Release to satisfy the JMM 812 movptr(owner_address, NULL_WORD); 813 // We need a full fence after clearing owner to avoid stranding. 814 // StoreLoad achieves this. 815 membar(StoreLoad); 816 817 // Check if the entry lists are empty (EntryList first - by convention). 818 movptr(reg_rax, EntryList_address); 819 orptr(reg_rax, cxq_address); 820 jccb(Assembler::zero, unlocked); // If so we are done. 821 822 // Check if there is a successor. 823 cmpptr(succ_address, NULL_WORD); 824 jccb(Assembler::notZero, unlocked); // If so we are done. 825 826 // Save the monitor pointer in the current thread, so we can try to 827 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 828 if (!UseObjectMonitorTable) { 829 andptr(monitor, ~(int32_t)markWord::monitor_value); 830 } 831 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 832 833 orl(t, 1); // Fast Unlock ZF = 0 834 jmpb(slow_path); 835 836 // Recursive unlock. 837 bind(recursive); 838 decrement(recursions_address); 839 #endif // _LP64 840 } 841 842 bind(unlocked); 843 xorl(t, t); // Fast Unlock ZF = 1 844 845 #ifdef ASSERT 846 // Check that unlocked label is reached with ZF set. 847 Label zf_correct; 848 Label zf_bad_zero; 849 jcc(Assembler::zero, zf_correct); 850 jmp(zf_bad_zero); 851 #endif 852 853 bind(slow_path); 854 if (stub != nullptr) { 855 bind(stub->slow_path_continuation()); 856 } 857 #ifdef ASSERT 858 // Check that stub->continuation() label is reached with ZF not set. 859 jcc(Assembler::notZero, zf_correct); 860 stop("Fast Unlock ZF != 0"); 861 bind(zf_bad_zero); 862 stop("Fast Unlock ZF != 1"); 863 bind(zf_correct); 864 #endif 865 // C2 uses the value of ZF to determine the continuation. 
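  // Illustrative sketch only (assumed shape of the consuming match rule; see the
  // cmpFastLock/cmpFastUnlock note near the top of this file). C2 materializes the
  // ZF result of this node and then emits roughly:
  //
  //   fast_unlock_lightweight(obj, rax, t, thread)  // leaves ZF = 1 on success, ZF = 0 on failure
  //   jne   slow_path_stub                          // ZF == 0 -> call the runtime monitorexit helper
  //   ...                                           // ZF == 1 -> fall through in compiled code
  //
  // "slow_path_stub" is a placeholder name for whatever continuation C2 wires up;
  // the only contract that matters here is the ZF value set above.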
866 } 867 868 //------------------------------------------------------------------------------------------- 869 // Generic instructions support for use in .ad files C2 code generation 870 871 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 872 if (dst != src) { 873 movdqu(dst, src); 874 } 875 if (opcode == Op_AbsVD) { 876 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 877 } else { 878 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 879 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 880 } 881 } 882 883 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 884 if (opcode == Op_AbsVD) { 885 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 886 } else { 887 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 888 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 889 } 890 } 891 892 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 893 if (dst != src) { 894 movdqu(dst, src); 895 } 896 if (opcode == Op_AbsVF) { 897 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 898 } else { 899 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 900 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 901 } 902 } 903 904 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 905 if (opcode == Op_AbsVF) { 906 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 907 } else { 908 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 909 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 910 } 911 } 912 913 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 914 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 915 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 916 917 if (opcode == Op_MinV) { 918 if (elem_bt == T_BYTE) { 919 pminsb(dst, src); 920 } else if (elem_bt == T_SHORT) { 921 pminsw(dst, src); 922 } else if (elem_bt == T_INT) { 923 pminsd(dst, src); 924 } else { 925 assert(elem_bt == T_LONG, "required"); 926 assert(tmp == xmm0, "required"); 927 assert_different_registers(dst, src, tmp); 928 movdqu(xmm0, dst); 929 pcmpgtq(xmm0, src); 930 blendvpd(dst, src); // xmm0 as mask 931 } 932 } else { // opcode == Op_MaxV 933 if (elem_bt == T_BYTE) { 934 pmaxsb(dst, src); 935 } else if (elem_bt == T_SHORT) { 936 pmaxsw(dst, src); 937 } else if (elem_bt == T_INT) { 938 pmaxsd(dst, src); 939 } else { 940 assert(elem_bt == T_LONG, "required"); 941 assert(tmp == xmm0, "required"); 942 assert_different_registers(dst, src, tmp); 943 movdqu(xmm0, src); 944 pcmpgtq(xmm0, dst); 945 blendvpd(dst, src); // xmm0 as mask 946 } 947 } 948 } 949 950 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 951 XMMRegister src1, Address src2, int vlen_enc) { 952 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 953 if (opcode == Op_UMinV) { 954 switch(elem_bt) { 955 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 956 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 957 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 958 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 959 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 960 } 961 } else { 962 assert(opcode == Op_UMaxV, "required"); 963 switch(elem_bt) { 964 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 965 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 966 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 967 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 968 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 969 } 970 } 971 } 972 973 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 974 // For optimality, leverage a full vector width of 512 bits 975 // for operations over smaller vector sizes on AVX512 targets. 976 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 977 if (opcode == Op_UMaxV) { 978 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 979 } else { 980 assert(opcode == Op_UMinV, "required"); 981 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 982 } 983 } else { 984 // T1 = -1 985 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 986 // T1 = -1 << 63 987 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 988 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 989 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 990 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 991 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 992 // Mask = T2 > T1 993 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 994 if (opcode == Op_UMaxV) { 995 // Res = Mask ? Src2 : Src1 996 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 997 } else { 998 // Res = Mask ? Src1 : Src2 999 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 1000 } 1001 } 1002 } 1003 1004 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 1005 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1006 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 1007 if (opcode == Op_UMinV) { 1008 switch(elem_bt) { 1009 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 1010 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 1011 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 1012 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 1013 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1014 } 1015 } else { 1016 assert(opcode == Op_UMaxV, "required"); 1017 switch(elem_bt) { 1018 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 1019 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 1020 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 1021 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 1022 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1023 } 1024 } 1025 } 1026 1027 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1028 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1029 int vlen_enc) { 1030 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1031 1032 if (opcode == Op_MinV) { 1033 if (elem_bt == T_BYTE) { 1034 vpminsb(dst, src1, src2, vlen_enc); 1035 } else if (elem_bt == T_SHORT) { 1036 vpminsw(dst, src1, src2, vlen_enc); 1037 } else if (elem_bt == T_INT) { 1038 vpminsd(dst, src1, src2, vlen_enc); 1039 } else { 1040 assert(elem_bt == T_LONG, "required"); 1041 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1042 vpminsq(dst, src1, src2, vlen_enc); 1043 } else { 1044 assert_different_registers(dst, src1, src2); 1045 vpcmpgtq(dst, src1, src2, vlen_enc); 1046 vblendvpd(dst, src1, src2, dst, 
                   vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo-code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ?
atmp : Tmp 1102 */ 1103 1104 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1105 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1106 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1107 XMMRegister mask; 1108 1109 if (!is_double_word && is_min) { 1110 mask = a; 1111 vblend = &MacroAssembler::vblendvps; 1112 vmaxmin = &MacroAssembler::vminps; 1113 vcmp = &MacroAssembler::vcmpps; 1114 } else if (!is_double_word && !is_min) { 1115 mask = b; 1116 vblend = &MacroAssembler::vblendvps; 1117 vmaxmin = &MacroAssembler::vmaxps; 1118 vcmp = &MacroAssembler::vcmpps; 1119 } else if (is_double_word && is_min) { 1120 mask = a; 1121 vblend = &MacroAssembler::vblendvpd; 1122 vmaxmin = &MacroAssembler::vminpd; 1123 vcmp = &MacroAssembler::vcmppd; 1124 } else { 1125 assert(is_double_word && !is_min, "sanity"); 1126 mask = b; 1127 vblend = &MacroAssembler::vblendvpd; 1128 vmaxmin = &MacroAssembler::vmaxpd; 1129 vcmp = &MacroAssembler::vcmppd; 1130 } 1131 1132 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1133 XMMRegister maxmin, scratch; 1134 if (dst == btmp) { 1135 maxmin = btmp; 1136 scratch = tmp; 1137 } else { 1138 maxmin = tmp; 1139 scratch = btmp; 1140 } 1141 1142 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1143 if (precompute_mask && !is_double_word) { 1144 vpsrad(tmp, mask, 32, vlen_enc); 1145 mask = tmp; 1146 } else if (precompute_mask && is_double_word) { 1147 vpxor(tmp, tmp, tmp, vlen_enc); 1148 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1149 mask = tmp; 1150 } 1151 1152 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1153 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1154 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1155 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1156 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1157 } 1158 1159 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1160 XMMRegister dst, XMMRegister a, XMMRegister b, 1161 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1162 int vlen_enc) { 1163 assert(UseAVX > 2, "required"); 1164 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1165 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1166 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1167 assert_different_registers(dst, a, atmp, btmp); 1168 assert_different_registers(dst, b, atmp, btmp); 1169 1170 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1171 bool is_double_word = is_double_word_type(elem_bt); 1172 bool merge = true; 1173 1174 if (!is_double_word && is_min) { 1175 evpmovd2m(ktmp, a, vlen_enc); 1176 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1177 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1178 vminps(dst, atmp, btmp, vlen_enc); 1179 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1180 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1181 } else if (!is_double_word && !is_min) { 1182 evpmovd2m(ktmp, b, vlen_enc); 1183 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1184 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1185 vmaxps(dst, atmp, btmp, vlen_enc); 1186 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1187 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1188 } else if (is_double_word && is_min) { 1189 evpmovq2m(ktmp, a, vlen_enc); 1190 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1191 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1192 
vminpd(dst, atmp, btmp, vlen_enc); 1193 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1194 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1195 } else { 1196 assert(is_double_word && !is_min, "sanity"); 1197 evpmovq2m(ktmp, b, vlen_enc); 1198 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1199 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1200 vmaxpd(dst, atmp, btmp, vlen_enc); 1201 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1202 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1203 } 1204 } 1205 1206 // Float/Double signum 1207 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1208 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1209 1210 Label DONE_LABEL; 1211 1212 if (opcode == Op_SignumF) { 1213 assert(UseSSE > 0, "required"); 1214 ucomiss(dst, zero); 1215 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1216 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1217 movflt(dst, one); 1218 jcc(Assembler::above, DONE_LABEL); 1219 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1220 } else if (opcode == Op_SignumD) { 1221 assert(UseSSE > 1, "required"); 1222 ucomisd(dst, zero); 1223 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1224 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1225 movdbl(dst, one); 1226 jcc(Assembler::above, DONE_LABEL); 1227 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1228 } 1229 1230 bind(DONE_LABEL); 1231 } 1232 1233 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1234 if (sign) { 1235 pmovsxbw(dst, src); 1236 } else { 1237 pmovzxbw(dst, src); 1238 } 1239 } 1240 1241 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1242 if (sign) { 1243 vpmovsxbw(dst, src, vector_len); 1244 } else { 1245 vpmovzxbw(dst, src, vector_len); 1246 } 1247 } 1248 1249 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1250 if (sign) { 1251 vpmovsxbd(dst, src, vector_len); 1252 } else { 1253 vpmovzxbd(dst, src, vector_len); 1254 } 1255 } 1256 1257 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1258 if (sign) { 1259 vpmovsxwd(dst, src, vector_len); 1260 } else { 1261 vpmovzxwd(dst, src, vector_len); 1262 } 1263 } 1264 1265 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1266 int shift, int vector_len) { 1267 if (opcode == Op_RotateLeftV) { 1268 if (etype == T_INT) { 1269 evprold(dst, src, shift, vector_len); 1270 } else { 1271 assert(etype == T_LONG, "expected type T_LONG"); 1272 evprolq(dst, src, shift, vector_len); 1273 } 1274 } else { 1275 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1276 if (etype == T_INT) { 1277 evprord(dst, src, shift, vector_len); 1278 } else { 1279 assert(etype == T_LONG, "expected type T_LONG"); 1280 evprorq(dst, src, shift, vector_len); 1281 } 1282 } 1283 } 1284 1285 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1286 XMMRegister shift, int vector_len) { 1287 if (opcode == Op_RotateLeftV) { 1288 if (etype == T_INT) { 1289 evprolvd(dst, src, shift, vector_len); 1290 } else { 1291 assert(etype == 
T_LONG, "expected type T_LONG"); 1292 evprolvq(dst, src, shift, vector_len); 1293 } 1294 } else { 1295 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1296 if (etype == T_INT) { 1297 evprorvd(dst, src, shift, vector_len); 1298 } else { 1299 assert(etype == T_LONG, "expected type T_LONG"); 1300 evprorvq(dst, src, shift, vector_len); 1301 } 1302 } 1303 } 1304 1305 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1306 if (opcode == Op_RShiftVI) { 1307 psrad(dst, shift); 1308 } else if (opcode == Op_LShiftVI) { 1309 pslld(dst, shift); 1310 } else { 1311 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1312 psrld(dst, shift); 1313 } 1314 } 1315 1316 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1317 switch (opcode) { 1318 case Op_RShiftVI: psrad(dst, shift); break; 1319 case Op_LShiftVI: pslld(dst, shift); break; 1320 case Op_URShiftVI: psrld(dst, shift); break; 1321 1322 default: assert(false, "%s", NodeClassNames[opcode]); 1323 } 1324 } 1325 1326 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1327 if (opcode == Op_RShiftVI) { 1328 vpsrad(dst, nds, shift, vector_len); 1329 } else if (opcode == Op_LShiftVI) { 1330 vpslld(dst, nds, shift, vector_len); 1331 } else { 1332 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1333 vpsrld(dst, nds, shift, vector_len); 1334 } 1335 } 1336 1337 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1338 switch (opcode) { 1339 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1340 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1341 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1342 1343 default: assert(false, "%s", NodeClassNames[opcode]); 1344 } 1345 } 1346 1347 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1348 switch (opcode) { 1349 case Op_RShiftVB: // fall-through 1350 case Op_RShiftVS: psraw(dst, shift); break; 1351 1352 case Op_LShiftVB: // fall-through 1353 case Op_LShiftVS: psllw(dst, shift); break; 1354 1355 case Op_URShiftVS: // fall-through 1356 case Op_URShiftVB: psrlw(dst, shift); break; 1357 1358 default: assert(false, "%s", NodeClassNames[opcode]); 1359 } 1360 } 1361 1362 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1363 switch (opcode) { 1364 case Op_RShiftVB: // fall-through 1365 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1366 1367 case Op_LShiftVB: // fall-through 1368 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1369 1370 case Op_URShiftVS: // fall-through 1371 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1372 1373 default: assert(false, "%s", NodeClassNames[opcode]); 1374 } 1375 } 1376 1377 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1378 switch (opcode) { 1379 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1380 case Op_LShiftVL: psllq(dst, shift); break; 1381 case Op_URShiftVL: psrlq(dst, shift); break; 1382 1383 default: assert(false, "%s", NodeClassNames[opcode]); 1384 } 1385 } 1386 1387 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1388 if (opcode == Op_RShiftVL) { 1389 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1390 } else if (opcode == Op_LShiftVL) { 1391 
psllq(dst, shift); 1392 } else { 1393 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1394 psrlq(dst, shift); 1395 } 1396 } 1397 1398 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1399 switch (opcode) { 1400 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1401 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1402 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1403 1404 default: assert(false, "%s", NodeClassNames[opcode]); 1405 } 1406 } 1407 1408 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1409 if (opcode == Op_RShiftVL) { 1410 evpsraq(dst, nds, shift, vector_len); 1411 } else if (opcode == Op_LShiftVL) { 1412 vpsllq(dst, nds, shift, vector_len); 1413 } else { 1414 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1415 vpsrlq(dst, nds, shift, vector_len); 1416 } 1417 } 1418 1419 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1420 switch (opcode) { 1421 case Op_RShiftVB: // fall-through 1422 case Op_RShiftVS: // fall-through 1423 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1424 1425 case Op_LShiftVB: // fall-through 1426 case Op_LShiftVS: // fall-through 1427 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1428 1429 case Op_URShiftVB: // fall-through 1430 case Op_URShiftVS: // fall-through 1431 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1432 1433 default: assert(false, "%s", NodeClassNames[opcode]); 1434 } 1435 } 1436 1437 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1438 switch (opcode) { 1439 case Op_RShiftVB: // fall-through 1440 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1441 1442 case Op_LShiftVB: // fall-through 1443 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1444 1445 case Op_URShiftVB: // fall-through 1446 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1447 1448 default: assert(false, "%s", NodeClassNames[opcode]); 1449 } 1450 } 1451 1452 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1453 assert(UseAVX >= 2, "required"); 1454 switch (opcode) { 1455 case Op_RShiftVL: { 1456 if (UseAVX > 2) { 1457 assert(tmp == xnoreg, "not used"); 1458 if (!VM_Version::supports_avx512vl()) { 1459 vlen_enc = Assembler::AVX_512bit; 1460 } 1461 evpsravq(dst, src, shift, vlen_enc); 1462 } else { 1463 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1464 vpsrlvq(dst, src, shift, vlen_enc); 1465 vpsrlvq(tmp, tmp, shift, vlen_enc); 1466 vpxor(dst, dst, tmp, vlen_enc); 1467 vpsubq(dst, dst, tmp, vlen_enc); 1468 } 1469 break; 1470 } 1471 case Op_LShiftVL: { 1472 assert(tmp == xnoreg, "not used"); 1473 vpsllvq(dst, src, shift, vlen_enc); 1474 break; 1475 } 1476 case Op_URShiftVL: { 1477 assert(tmp == xnoreg, "not used"); 1478 vpsrlvq(dst, src, shift, vlen_enc); 1479 break; 1480 } 1481 default: assert(false, "%s", NodeClassNames[opcode]); 1482 } 1483 } 1484 1485 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1486 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1487 assert(opcode == Op_LShiftVB || 1488 opcode == Op_RShiftVB || 1489 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1490 bool sign = (opcode != Op_URShiftVB); 1491 assert(vector_len == 0, "required"); 1492 vextendbd(sign, dst, src, 1); 1493 vpmovzxbd(vtmp, shift, 1); 1494 varshiftd(opcode, dst, dst, vtmp, 1); 1495 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1496 vextracti128_high(vtmp, dst); 1497 vpackusdw(dst, dst, vtmp, 0); 1498 } 1499 1500 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1501 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1502 assert(opcode == Op_LShiftVB || 1503 opcode == Op_RShiftVB || 1504 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1505 bool sign = (opcode != Op_URShiftVB); 1506 int ext_vector_len = vector_len + 1; 1507 vextendbw(sign, dst, src, ext_vector_len); 1508 vpmovzxbw(vtmp, shift, ext_vector_len); 1509 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1510 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1511 if (vector_len == 0) { 1512 vextracti128_high(vtmp, dst); 1513 vpackuswb(dst, dst, vtmp, vector_len); 1514 } else { 1515 vextracti64x4_high(vtmp, dst); 1516 vpackuswb(dst, dst, vtmp, vector_len); 1517 vpermq(dst, dst, 0xD8, vector_len); 1518 } 1519 } 1520 1521 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1522 switch(typ) { 1523 case T_BYTE: 1524 pinsrb(dst, val, idx); 1525 break; 1526 case T_SHORT: 1527 pinsrw(dst, val, idx); 1528 break; 1529 case T_INT: 1530 pinsrd(dst, val, idx); 1531 break; 1532 case T_LONG: 1533 pinsrq(dst, val, idx); 1534 break; 1535 default: 1536 assert(false,"Should not reach here."); 1537 break; 1538 } 1539 } 1540 1541 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1542 switch(typ) { 1543 case T_BYTE: 1544 vpinsrb(dst, src, val, idx); 1545 break; 1546 case T_SHORT: 1547 vpinsrw(dst, src, val, idx); 1548 break; 1549 case T_INT: 1550 vpinsrd(dst, src, val, idx); 1551 break; 1552 case T_LONG: 1553 vpinsrq(dst, src, val, idx); 1554 break; 1555 default: 1556 assert(false,"Should not reach here."); 1557 break; 1558 } 1559 } 1560 1561 #ifdef _LP64 1562 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1563 XMMRegister dst, Register base, 1564 Register idx_base, 1565 Register offset, Register mask, 1566 Register mask_idx, Register rtmp, 1567 int vlen_enc) { 1568 vpxor(dst, dst, dst, vlen_enc); 1569 if (elem_bt == T_SHORT) { 1570 for (int i = 0; i < 4; i++) { 1571 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1572 Label skip_load; 1573 btq(mask, mask_idx); 1574 jccb(Assembler::carryClear, skip_load); 1575 movl(rtmp, Address(idx_base, i * 4)); 1576 if (offset != noreg) { 1577 addl(rtmp, offset); 1578 } 1579 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1580 bind(skip_load); 1581 incq(mask_idx); 1582 } 1583 } else { 1584 assert(elem_bt == T_BYTE, ""); 1585 for (int i = 0; i < 8; i++) { 1586 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1587 Label skip_load; 1588 btq(mask, mask_idx); 1589 jccb(Assembler::carryClear, skip_load); 1590 movl(rtmp, Address(idx_base, i * 4)); 1591 if (offset != noreg) { 1592 addl(rtmp, offset); 1593 } 1594 pinsrb(dst, Address(base, rtmp), i); 1595 bind(skip_load); 1596 incq(mask_idx); 1597 } 1598 } 1599 } 1600 #endif // _LP64 1601 1602 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1603 Register base, Register idx_base, 1604 Register offset, Register rtmp, 1605 int vlen_enc) { 1606 vpxor(dst, dst, dst, vlen_enc); 1607 if (elem_bt == T_SHORT) { 1608 for (int i = 0; i < 4; i++) { 1609 // dst[i] = src[offset + idx_base[i]] 1610 movl(rtmp, Address(idx_base, i * 4)); 1611 if (offset != noreg) { 1612 addl(rtmp, offset); 1613 } 1614 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1615 } 1616 } else { 1617 assert(elem_bt == T_BYTE, ""); 1618 for (int i = 0; i < 8; i++) { 1619 // dst[i] = src[offset + idx_base[i]] 1620 movl(rtmp, Address(idx_base, i * 4)); 1621 if (offset != noreg) { 1622 addl(rtmp, offset); 1623 } 1624 pinsrb(dst, Address(base, rtmp), i); 1625 } 1626 } 1627 } 1628 1629 /* 1630 * Gather using a hybrid algorithm: first partially unroll a scalar loop 1631 * to accumulate values from the gather indices into a quad-word (64-bit) slice. 1632 * A slice may hold 8 byte or 4 short values. This is followed by a vector 1633 * permutation to place the slice into the appropriate vector lane 1634 * locations in the destination vector. The following pseudo code describes the 1635 * algorithm in detail: 1636 * 1637 * DST_VEC = ZERO_VEC 1638 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1639 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1640 * FOREACH_ITER: 1641 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1642 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1643 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1644 * PERM_INDEX = PERM_INDEX - TWO_VEC 1645 * 1646 * With each iteration, the doubleword permute indices (0,1) corresponding 1647 * to the gathered quadword get shifted right by two lane positions. 1648 * 1649 */ 1650 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1651 Register base, Register idx_base, 1652 Register offset, Register mask, 1653 XMMRegister xtmp1, XMMRegister xtmp2, 1654 XMMRegister temp_dst, Register rtmp, 1655 Register mask_idx, Register length, 1656 int vector_len, int vlen_enc) { 1657 Label GATHER8_LOOP; 1658 assert(is_subword_type(elem_ty), ""); 1659 movl(length, vector_len); 1660 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1661 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1662 vallones(xtmp2, vlen_enc); 1663 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1664 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1665 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1666 1667 bind(GATHER8_LOOP); 1668 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1669 if (mask == noreg) { 1670 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1671 } else { 1672 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1673 } 1674 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1675 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ?
vlen_enc : Assembler::AVX_256bit); 1676 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1677 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1678 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1679 vpor(dst, dst, temp_dst, vlen_enc); 1680 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1681 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1682 jcc(Assembler::notEqual, GATHER8_LOOP); 1683 } 1684 1685 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1686 switch(typ) { 1687 case T_INT: 1688 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1689 break; 1690 case T_FLOAT: 1691 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1692 break; 1693 case T_LONG: 1694 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1695 break; 1696 case T_DOUBLE: 1697 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1698 break; 1699 default: 1700 assert(false,"Should not reach here."); 1701 break; 1702 } 1703 } 1704 1705 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1706 switch(typ) { 1707 case T_INT: 1708 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1709 break; 1710 case T_FLOAT: 1711 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1712 break; 1713 case T_LONG: 1714 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1715 break; 1716 case T_DOUBLE: 1717 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1718 break; 1719 default: 1720 assert(false,"Should not reach here."); 1721 break; 1722 } 1723 } 1724 1725 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1726 switch(typ) { 1727 case T_INT: 1728 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1729 break; 1730 case T_FLOAT: 1731 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1732 break; 1733 case T_LONG: 1734 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1735 break; 1736 case T_DOUBLE: 1737 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1738 break; 1739 default: 1740 assert(false,"Should not reach here."); 1741 break; 1742 } 1743 } 1744 1745 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1746 if (vlen_in_bytes <= 16) { 1747 pxor (dst, dst); 1748 psubb(dst, src); 1749 switch (elem_bt) { 1750 case T_BYTE: /* nothing to do */ break; 1751 case T_SHORT: pmovsxbw(dst, dst); break; 1752 case T_INT: pmovsxbd(dst, dst); break; 1753 case T_FLOAT: pmovsxbd(dst, dst); break; 1754 case T_LONG: pmovsxbq(dst, dst); break; 1755 case T_DOUBLE: pmovsxbq(dst, dst); break; 1756 1757 default: assert(false, "%s", type2name(elem_bt)); 1758 } 1759 } else { 1760 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1761 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1762 1763 vpxor (dst, dst, dst, vlen_enc); 1764 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1765 1766 switch (elem_bt) { 1767 case T_BYTE: /* nothing to do */ break; 1768 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1769 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1770 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1771 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1772 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1773 1774 default: assert(false, "%s", type2name(elem_bt)); 1775 } 1776 } 1777 } 1778 1779 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1780 if (novlbwdq) { 1781 vpmovsxbd(xtmp, src, vlen_enc); 1782 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1783 Assembler::eq, true, vlen_enc, noreg); 1784 } else { 1785 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1786 vpsubb(xtmp, xtmp, src, vlen_enc); 1787 evpmovb2m(dst, xtmp, vlen_enc); 1788 } 1789 } 1790 1791 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1792 if (is_integral_type(bt)) { 1793 switch (vlen_in_bytes) { 1794 case 4: movdl(dst, src); break; 1795 case 8: movq(dst, src); break; 1796 case 16: movdqu(dst, src); break; 1797 case 32: vmovdqu(dst, src); break; 1798 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1799 default: ShouldNotReachHere(); 1800 } 1801 } else { 1802 switch (vlen_in_bytes) { 1803 case 4: movflt(dst, src); break; 1804 case 8: movdbl(dst, src); break; 1805 case 16: movups(dst, src); break; 1806 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1807 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1808 default: ShouldNotReachHere(); 1809 } 1810 } 1811 } 1812 1813 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1814 assert(rscratch != noreg || always_reachable(src), "missing"); 1815 1816 if (reachable(src)) { 1817 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1818 } else { 1819 lea(rscratch, src); 1820 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1821 } 1822 } 1823 1824 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1825 int vlen_enc = vector_length_encoding(vlen); 1826 if (VM_Version::supports_avx()) { 1827 if (bt == T_LONG) { 1828 if (VM_Version::supports_avx2()) { 1829 vpbroadcastq(dst, src, vlen_enc); 1830 } else { 1831 vmovddup(dst, src, vlen_enc); 1832 } 1833 } else if (bt == T_DOUBLE) { 1834 if (vlen_enc != Assembler::AVX_128bit) { 1835 vbroadcastsd(dst, src, vlen_enc, noreg); 1836 } else { 1837 vmovddup(dst, src, vlen_enc); 1838 } 1839 } else { 1840 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1841 vpbroadcastd(dst, src, vlen_enc); 1842 } else { 1843 vbroadcastss(dst, src, vlen_enc); 1844 } 1845 } 1846 } else if (VM_Version::supports_sse3()) { 1847 movddup(dst, src); 1848 } else { 1849 load_vector(bt, dst, src, vlen); 1850 } 1851 } 1852 1853 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1854 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1855 int offset = exact_log2(type2aelembytes(bt)) << 6; 1856 if (is_floating_point_type(bt)) { 1857 offset += 128; 1858 } 1859 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1860 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1861 } 1862 1863 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
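// Note on the structure of the reduction helpers below: reduce_operation_128/256 perform the
// element-wise combine (add/mul/min/max/and/or/xor) on 128-/256-bit operands, while reduceB/S/I/L
// and their per-width variants repeatedly fold the upper half of the vector onto the lower half
// until one element remains and then combine it with the scalar input (src1). The ordered FP
// reductions (reduceF/reduceD) accumulate element by element to preserve evaluation order; the
// unorderedReduce* variants fold halves instead. Roughly, for the integer case:
//   acc = src2;
//   while (lanes(acc) > 1) acc = combine(low_half(acc), high_half(acc));
//   dst = combine(acc[0], src1);
// (Descriptive sketch only; the actual instruction selection depends on element type and AVX level.)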
1864 1865 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1866 int vector_len = Assembler::AVX_128bit; 1867 1868 switch (opcode) { 1869 case Op_AndReductionV: pand(dst, src); break; 1870 case Op_OrReductionV: por (dst, src); break; 1871 case Op_XorReductionV: pxor(dst, src); break; 1872 case Op_MinReductionV: 1873 switch (typ) { 1874 case T_BYTE: pminsb(dst, src); break; 1875 case T_SHORT: pminsw(dst, src); break; 1876 case T_INT: pminsd(dst, src); break; 1877 case T_LONG: assert(UseAVX > 2, "required"); 1878 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1879 default: assert(false, "wrong type"); 1880 } 1881 break; 1882 case Op_MaxReductionV: 1883 switch (typ) { 1884 case T_BYTE: pmaxsb(dst, src); break; 1885 case T_SHORT: pmaxsw(dst, src); break; 1886 case T_INT: pmaxsd(dst, src); break; 1887 case T_LONG: assert(UseAVX > 2, "required"); 1888 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1889 default: assert(false, "wrong type"); 1890 } 1891 break; 1892 case Op_AddReductionVF: addss(dst, src); break; 1893 case Op_AddReductionVD: addsd(dst, src); break; 1894 case Op_AddReductionVI: 1895 switch (typ) { 1896 case T_BYTE: paddb(dst, src); break; 1897 case T_SHORT: paddw(dst, src); break; 1898 case T_INT: paddd(dst, src); break; 1899 default: assert(false, "wrong type"); 1900 } 1901 break; 1902 case Op_AddReductionVL: paddq(dst, src); break; 1903 case Op_MulReductionVF: mulss(dst, src); break; 1904 case Op_MulReductionVD: mulsd(dst, src); break; 1905 case Op_MulReductionVI: 1906 switch (typ) { 1907 case T_SHORT: pmullw(dst, src); break; 1908 case T_INT: pmulld(dst, src); break; 1909 default: assert(false, "wrong type"); 1910 } 1911 break; 1912 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1913 evpmullq(dst, dst, src, vector_len); break; 1914 default: assert(false, "wrong opcode"); 1915 } 1916 } 1917 1918 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1919 switch (opcode) { 1920 case Op_AddReductionVF: addps(dst, src); break; 1921 case Op_AddReductionVD: addpd(dst, src); break; 1922 case Op_MulReductionVF: mulps(dst, src); break; 1923 case Op_MulReductionVD: mulpd(dst, src); break; 1924 default: assert(false, "%s", NodeClassNames[opcode]); 1925 } 1926 } 1927 1928 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1929 int vector_len = Assembler::AVX_256bit; 1930 1931 switch (opcode) { 1932 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1933 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1934 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1935 case Op_MinReductionV: 1936 switch (typ) { 1937 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1938 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1939 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1940 case T_LONG: assert(UseAVX > 2, "required"); 1941 vpminsq(dst, src1, src2, vector_len); break; 1942 default: assert(false, "wrong type"); 1943 } 1944 break; 1945 case Op_MaxReductionV: 1946 switch (typ) { 1947 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1948 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1949 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1950 case T_LONG: assert(UseAVX > 2, "required"); 1951 vpmaxsq(dst, src1, src2, vector_len); break; 1952 default: assert(false, "wrong type"); 1953 } 
1954 break; 1955 case Op_AddReductionVI: 1956 switch (typ) { 1957 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1958 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1959 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1960 default: assert(false, "wrong type"); 1961 } 1962 break; 1963 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1964 case Op_MulReductionVI: 1965 switch (typ) { 1966 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1967 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1968 default: assert(false, "wrong type"); 1969 } 1970 break; 1971 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1972 default: assert(false, "wrong opcode"); 1973 } 1974 } 1975 1976 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1977 int vector_len = Assembler::AVX_256bit; 1978 1979 switch (opcode) { 1980 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1981 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1982 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1983 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1984 default: assert(false, "%s", NodeClassNames[opcode]); 1985 } 1986 } 1987 1988 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1989 XMMRegister dst, XMMRegister src, 1990 XMMRegister vtmp1, XMMRegister vtmp2) { 1991 switch (opcode) { 1992 case Op_AddReductionVF: 1993 case Op_MulReductionVF: 1994 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1995 break; 1996 1997 case Op_AddReductionVD: 1998 case Op_MulReductionVD: 1999 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2000 break; 2001 2002 default: assert(false, "wrong opcode"); 2003 } 2004 } 2005 2006 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2007 XMMRegister dst, XMMRegister src, 2008 XMMRegister vtmp1, XMMRegister vtmp2) { 2009 switch (opcode) { 2010 case Op_AddReductionVF: 2011 case Op_MulReductionVF: 2012 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2013 break; 2014 2015 case Op_AddReductionVD: 2016 case Op_MulReductionVD: 2017 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2018 break; 2019 2020 default: assert(false, "%s", NodeClassNames[opcode]); 2021 } 2022 } 2023 2024 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2025 Register dst, Register src1, XMMRegister src2, 2026 XMMRegister vtmp1, XMMRegister vtmp2) { 2027 switch (vlen) { 2028 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2029 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2030 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2031 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2032 2033 default: assert(false, "wrong vector length"); 2034 } 2035 } 2036 2037 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2038 Register dst, Register src1, XMMRegister src2, 2039 XMMRegister vtmp1, XMMRegister vtmp2) { 2040 switch (vlen) { 2041 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2042 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2043 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2044 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2045 2046 default: assert(false, "wrong vector length"); 2047 } 2048 } 2049 2050 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2051 Register dst, Register src1, XMMRegister src2, 
2052 XMMRegister vtmp1, XMMRegister vtmp2) { 2053 switch (vlen) { 2054 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2055 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2056 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2057 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2058 2059 default: assert(false, "wrong vector length"); 2060 } 2061 } 2062 2063 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2064 Register dst, Register src1, XMMRegister src2, 2065 XMMRegister vtmp1, XMMRegister vtmp2) { 2066 switch (vlen) { 2067 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2068 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2069 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2070 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2071 2072 default: assert(false, "wrong vector length"); 2073 } 2074 } 2075 2076 #ifdef _LP64 2077 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2078 Register dst, Register src1, XMMRegister src2, 2079 XMMRegister vtmp1, XMMRegister vtmp2) { 2080 switch (vlen) { 2081 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2082 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2083 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2084 2085 default: assert(false, "wrong vector length"); 2086 } 2087 } 2088 #endif // _LP64 2089 2090 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2091 switch (vlen) { 2092 case 2: 2093 assert(vtmp2 == xnoreg, ""); 2094 reduce2F(opcode, dst, src, vtmp1); 2095 break; 2096 case 4: 2097 assert(vtmp2 == xnoreg, ""); 2098 reduce4F(opcode, dst, src, vtmp1); 2099 break; 2100 case 8: 2101 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2102 break; 2103 case 16: 2104 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2105 break; 2106 default: assert(false, "wrong vector length"); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 switch (vlen) { 2112 case 2: 2113 assert(vtmp2 == xnoreg, ""); 2114 reduce2D(opcode, dst, src, vtmp1); 2115 break; 2116 case 4: 2117 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2118 break; 2119 case 8: 2120 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2121 break; 2122 default: assert(false, "wrong vector length"); 2123 } 2124 } 2125 2126 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2127 switch (vlen) { 2128 case 2: 2129 assert(vtmp1 == xnoreg, ""); 2130 assert(vtmp2 == xnoreg, ""); 2131 unorderedReduce2F(opcode, dst, src); 2132 break; 2133 case 4: 2134 assert(vtmp2 == xnoreg, ""); 2135 unorderedReduce4F(opcode, dst, src, vtmp1); 2136 break; 2137 case 8: 2138 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2139 break; 2140 case 16: 2141 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2142 break; 2143 default: assert(false, "wrong vector length"); 2144 } 2145 } 2146 2147 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2148 switch (vlen) { 2149 case 2: 2150 assert(vtmp1 == xnoreg, ""); 2151 assert(vtmp2 == xnoreg, ""); 2152 unorderedReduce2D(opcode, dst, src); 2153 break; 2154 case 4: 2155 assert(vtmp2 == xnoreg, ""); 2156 unorderedReduce4D(opcode, dst, src, vtmp1); 2157 break; 2158 case 8: 
2159 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2160 break; 2161 default: assert(false, "wrong vector length"); 2162 } 2163 } 2164 2165 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2166 if (opcode == Op_AddReductionVI) { 2167 if (vtmp1 != src2) { 2168 movdqu(vtmp1, src2); 2169 } 2170 phaddd(vtmp1, vtmp1); 2171 } else { 2172 pshufd(vtmp1, src2, 0x1); 2173 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2174 } 2175 movdl(vtmp2, src1); 2176 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2177 movdl(dst, vtmp1); 2178 } 2179 2180 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2181 if (opcode == Op_AddReductionVI) { 2182 if (vtmp1 != src2) { 2183 movdqu(vtmp1, src2); 2184 } 2185 phaddd(vtmp1, src2); 2186 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2187 } else { 2188 pshufd(vtmp2, src2, 0xE); 2189 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2190 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2191 } 2192 } 2193 2194 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2195 if (opcode == Op_AddReductionVI) { 2196 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2197 vextracti128_high(vtmp2, vtmp1); 2198 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2199 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2200 } else { 2201 vextracti128_high(vtmp1, src2); 2202 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2203 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2204 } 2205 } 2206 2207 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2208 vextracti64x4_high(vtmp2, src2); 2209 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2210 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2211 } 2212 2213 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2214 pshufd(vtmp2, src2, 0x1); 2215 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2216 movdqu(vtmp1, vtmp2); 2217 psrldq(vtmp1, 2); 2218 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2219 movdqu(vtmp2, vtmp1); 2220 psrldq(vtmp2, 1); 2221 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2222 movdl(vtmp2, src1); 2223 pmovsxbd(vtmp1, vtmp1); 2224 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2225 pextrb(dst, vtmp1, 0x0); 2226 movsbl(dst, dst); 2227 } 2228 2229 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2230 pshufd(vtmp1, src2, 0xE); 2231 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2232 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2233 } 2234 2235 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2236 vextracti128_high(vtmp2, src2); 2237 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2238 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2239 } 2240 2241 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2242 vextracti64x4_high(vtmp1, src2); 2243 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2244 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2245 } 2246 2247 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2248 pmovsxbw(vtmp2, src2); 2249 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2250 } 2251 2252 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2253 if (UseAVX > 1) { 2254 int vector_len = Assembler::AVX_256bit; 2255 vpmovsxbw(vtmp1, src2, vector_len); 2256 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2257 } else { 2258 pmovsxbw(vtmp2, src2); 2259 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2260 pshufd(vtmp2, src2, 0x1); 2261 pmovsxbw(vtmp2, src2); 2262 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2263 } 2264 } 2265 2266 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2267 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2268 int vector_len = Assembler::AVX_512bit; 2269 vpmovsxbw(vtmp1, src2, vector_len); 2270 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2271 } else { 2272 assert(UseAVX >= 2,"Should not reach here."); 2273 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2274 vextracti128_high(vtmp2, src2); 2275 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2276 } 2277 } 2278 2279 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2280 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2281 vextracti64x4_high(vtmp2, src2); 2282 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2283 } 2284 2285 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2286 if (opcode == Op_AddReductionVI) { 2287 if (vtmp1 != src2) { 2288 movdqu(vtmp1, src2); 2289 } 2290 phaddw(vtmp1, vtmp1); 2291 phaddw(vtmp1, vtmp1); 2292 } else { 2293 pshufd(vtmp2, src2, 0x1); 2294 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2295 movdqu(vtmp1, vtmp2); 2296 psrldq(vtmp1, 2); 2297 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2298 } 2299 movdl(vtmp2, src1); 2300 pmovsxwd(vtmp1, vtmp1); 2301 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2302 pextrw(dst, vtmp1, 0x0); 2303 movswl(dst, dst); 2304 } 2305 2306 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2307 if (opcode == Op_AddReductionVI) { 2308 if (vtmp1 != src2) { 2309 movdqu(vtmp1, src2); 2310 } 2311 phaddw(vtmp1, src2); 2312 } else { 2313 pshufd(vtmp1, src2, 0xE); 2314 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2315 } 2316 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2317 } 2318 2319 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 if (opcode == Op_AddReductionVI) { 2321 int vector_len = Assembler::AVX_256bit; 2322 vphaddw(vtmp2, src2, src2, vector_len); 2323 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2324 } else { 2325 vextracti128_high(vtmp2, src2); 2326 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2327 } 2328 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2329 } 2330 2331 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2332 int vector_len = Assembler::AVX_256bit; 2333 vextracti64x4_high(vtmp1, src2); 2334 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2335 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2336 } 2337 2338 #ifdef _LP64 2339 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2340 pshufd(vtmp2, src2, 0xE); 2341 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2342 movdq(vtmp1, src1); 2343 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2344 movdq(dst, vtmp1); 2345 } 2346 2347 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2348 vextracti128_high(vtmp1, src2); 2349 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2350 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2351 } 2352 2353 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2354 vextracti64x4_high(vtmp2, src2); 2355 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2356 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2357 } 2358 2359 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2360 mov64(temp, -1L); 2361 bzhiq(temp, temp, len); 2362 kmovql(dst, temp); 2363 } 2364 #endif // _LP64 2365 2366 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2367 reduce_operation_128(T_FLOAT, opcode, dst, src); 2368 pshufd(vtmp, src, 0x1); 2369 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2370 } 2371 2372 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2373 reduce2F(opcode, dst, src, vtmp); 2374 pshufd(vtmp, src, 0x2); 2375 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2376 pshufd(vtmp, src, 0x3); 2377 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2378 } 2379 2380 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2381 reduce4F(opcode, dst, src, vtmp2); 2382 vextractf128_high(vtmp2, src); 2383 reduce4F(opcode, dst, vtmp2, vtmp1); 2384 } 2385 2386 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2387 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2388 vextracti64x4_high(vtmp1, src); 2389 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2390 } 2391 2392 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2393 pshufd(dst, src, 0x1); 2394 reduce_operation_128(T_FLOAT, opcode, dst, src); 2395 } 2396 2397 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2398 pshufd(vtmp, src, 0xE); 2399 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2400 unorderedReduce2F(opcode, dst, vtmp); 2401 } 2402 2403 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2404 vextractf128_high(vtmp1, src); 2405 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2406 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2407 } 2408 2409 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2410 vextractf64x4_high(vtmp2, src); 2411 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2412 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2413 } 2414 2415 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2416 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2417 pshufd(vtmp, src, 0xE); 2418 
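// pshufd with 0xE places the upper double (src[1]) in the low 64 bits of vtmp, so the combine
// below folds src[1] into the running result in dst.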
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2419 } 2420 2421 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2422 reduce2D(opcode, dst, src, vtmp2); 2423 vextractf128_high(vtmp2, src); 2424 reduce2D(opcode, dst, vtmp2, vtmp1); 2425 } 2426 2427 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2428 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2429 vextracti64x4_high(vtmp1, src); 2430 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2431 } 2432 2433 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2434 pshufd(dst, src, 0xE); 2435 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2436 } 2437 2438 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2439 vextractf128_high(vtmp, src); 2440 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2441 unorderedReduce2D(opcode, dst, vtmp); 2442 } 2443 2444 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2445 vextractf64x4_high(vtmp2, src); 2446 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2447 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2448 } 2449 2450 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2451 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2452 } 2453 2454 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2455 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2456 } 2457 2458 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2459 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2460 } 2461 2462 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2463 int vec_enc) { 2464 switch(elem_bt) { 2465 case T_INT: 2466 case T_FLOAT: 2467 vmaskmovps(dst, src, mask, vec_enc); 2468 break; 2469 case T_LONG: 2470 case T_DOUBLE: 2471 vmaskmovpd(dst, src, mask, vec_enc); 2472 break; 2473 default: 2474 fatal("Unsupported type %s", type2name(elem_bt)); 2475 break; 2476 } 2477 } 2478 2479 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2480 int vec_enc) { 2481 switch(elem_bt) { 2482 case T_INT: 2483 case T_FLOAT: 2484 vmaskmovps(dst, src, mask, vec_enc); 2485 break; 2486 case T_LONG: 2487 case T_DOUBLE: 2488 vmaskmovpd(dst, src, mask, vec_enc); 2489 break; 2490 default: 2491 fatal("Unsupported type %s", type2name(elem_bt)); 2492 break; 2493 } 2494 } 2495 2496 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2497 XMMRegister dst, XMMRegister src, 2498 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2499 XMMRegister xmm_0, XMMRegister xmm_1) { 2500 const int permconst[] = {1, 14}; 2501 XMMRegister wsrc = src; 2502 XMMRegister wdst = xmm_0; 2503 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2504 2505 int vlen_enc = Assembler::AVX_128bit; 2506 if (vlen == 16) { 2507 vlen_enc = Assembler::AVX_256bit; 2508 } 2509 2510 for (int i = log2(vlen) - 1; i >=0; i--) { 2511 if (i == 0 && !is_dst_valid) { 2512 wdst = dst; 2513 } 2514 if (i == 3) { 2515 vextracti64x4_high(wtmp, wsrc); 2516 } else if (i == 2) { 2517 vextracti128_high(wtmp, wsrc); 2518 } else { // i = [0,1] 2519 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2520 } 2521 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2522 wsrc = wdst; 2523 vlen_enc = Assembler::AVX_128bit; 2524 } 2525 if (is_dst_valid) { 2526 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2527 } 2528 } 2529 2530 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2531 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2532 XMMRegister xmm_0, XMMRegister xmm_1) { 2533 XMMRegister wsrc = src; 2534 XMMRegister wdst = xmm_0; 2535 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2536 int vlen_enc = Assembler::AVX_128bit; 2537 if (vlen == 8) { 2538 vlen_enc = Assembler::AVX_256bit; 2539 } 2540 for (int i = log2(vlen) - 1; i >=0; i--) { 2541 if (i == 0 && !is_dst_valid) { 2542 wdst = dst; 2543 } 2544 if (i == 1) { 2545 vextracti128_high(wtmp, wsrc); 2546 } else if (i == 2) { 2547 vextracti64x4_high(wtmp, wsrc); 2548 } else { 2549 assert(i == 0, "%d", i); 2550 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2551 } 2552 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2553 wsrc = wdst; 2554 vlen_enc = Assembler::AVX_128bit; 2555 } 2556 if (is_dst_valid) { 2557 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2558 } 2559 } 2560 2561 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2562 switch (bt) { 2563 case T_BYTE: pextrb(dst, src, idx); break; 2564 case T_SHORT: pextrw(dst, src, idx); break; 2565 case T_INT: pextrd(dst, src, idx); break; 2566 case T_LONG: pextrq(dst, src, idx); break; 2567 2568 default: 2569 assert(false,"Should not reach here."); 2570 break; 2571 } 2572 } 2573 2574 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2575 int esize = type2aelembytes(typ); 2576 int elem_per_lane = 16/esize; 2577 int lane = elemindex / elem_per_lane; 2578 int eindex = elemindex % elem_per_lane; 2579 2580 if (lane >= 2) { 2581 assert(UseAVX > 2, "required"); 2582 vextractf32x4(dst, src, lane & 3); 2583 return dst; 2584 } else if (lane > 0) { 2585 assert(UseAVX > 0, "required"); 2586 vextractf128(dst, src, lane); 2587 return dst; 2588 } else { 2589 return src; 2590 } 2591 } 2592 2593 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2594 if (typ == T_BYTE) { 2595 movsbl(dst, dst); 2596 } else if (typ == T_SHORT) { 2597 movswl(dst, dst); 2598 } 2599 } 2600 2601 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2602 int esize = type2aelembytes(typ); 2603 int elem_per_lane = 16/esize; 2604 int eindex = elemindex % elem_per_lane; 2605 assert(is_integral_type(typ),"required"); 2606 2607 if (eindex == 0) { 2608 if (typ == T_LONG) { 2609 movq(dst, src); 2610 } else { 2611 movdl(dst, src); 2612 movsxl(typ, dst); 2613 } 2614 } else { 2615 extract(typ, dst, src, eindex); 2616 movsxl(typ, dst); 2617 } 2618 } 2619 2620 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
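// Places the T_FLOAT/T_DOUBLE element at 'elemindex' into the low bits of dst. Only the index
// within a 128-bit lane is handled here; selecting the lane itself is presumably done by the
// caller via get_lane() above. For T_FLOAT the upper bits of dst are cleared explicitly at the end.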
2621 int esize = type2aelembytes(typ); 2622 int elem_per_lane = 16/esize; 2623 int eindex = elemindex % elem_per_lane; 2624 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2625 2626 if (eindex == 0) { 2627 movq(dst, src); 2628 } else { 2629 if (typ == T_FLOAT) { 2630 if (UseAVX == 0) { 2631 movdqu(dst, src); 2632 shufps(dst, dst, eindex); 2633 } else { 2634 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2635 } 2636 } else { 2637 if (UseAVX == 0) { 2638 movdqu(dst, src); 2639 psrldq(dst, eindex*esize); 2640 } else { 2641 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2642 } 2643 movq(dst, dst); 2644 } 2645 } 2646 // Zero upper bits 2647 if (typ == T_FLOAT) { 2648 if (UseAVX == 0) { 2649 assert(vtmp != xnoreg, "required."); 2650 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2651 pand(dst, vtmp); 2652 } else { 2653 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2654 } 2655 } 2656 } 2657 2658 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2659 switch(typ) { 2660 case T_BYTE: 2661 case T_BOOLEAN: 2662 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2663 break; 2664 case T_SHORT: 2665 case T_CHAR: 2666 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2667 break; 2668 case T_INT: 2669 case T_FLOAT: 2670 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2671 break; 2672 case T_LONG: 2673 case T_DOUBLE: 2674 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2675 break; 2676 default: 2677 assert(false,"Should not reach here."); 2678 break; 2679 } 2680 } 2681 2682 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2683 assert(rscratch != noreg || always_reachable(src2), "missing"); 2684 2685 switch(typ) { 2686 case T_BOOLEAN: 2687 case T_BYTE: 2688 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2689 break; 2690 case T_CHAR: 2691 case T_SHORT: 2692 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2693 break; 2694 case T_INT: 2695 case T_FLOAT: 2696 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2697 break; 2698 case T_LONG: 2699 case T_DOUBLE: 2700 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2701 break; 2702 default: 2703 assert(false,"Should not reach here."); 2704 break; 2705 } 2706 } 2707 2708 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2709 switch(typ) { 2710 case T_BYTE: 2711 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2712 break; 2713 case T_SHORT: 2714 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2715 break; 2716 case T_INT: 2717 case T_FLOAT: 2718 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2719 break; 2720 case T_LONG: 2721 case T_DOUBLE: 2722 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2723 break; 2724 default: 2725 assert(false,"Should not reach here."); 2726 break; 2727 } 2728 } 2729 2730 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2731 
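// Sets the condition flags for a VectorTest by testing src1 against src2 with ptest/vptest
// (or vtestps when the element size is at least 4 bytes and AVX is available). For vectors
// shorter than 16 bytes the relevant low part of src1 is first replicated into vtmp so that the
// unused upper bytes of the XMM register cannot affect the result.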
assert(vlen_in_bytes <= 32, ""); 2732 int esize = type2aelembytes(bt); 2733 if (vlen_in_bytes == 32) { 2734 assert(vtmp == xnoreg, "required."); 2735 if (esize >= 4) { 2736 vtestps(src1, src2, AVX_256bit); 2737 } else { 2738 vptest(src1, src2, AVX_256bit); 2739 } 2740 return; 2741 } 2742 if (vlen_in_bytes < 16) { 2743 // Duplicate the lower part to fill the whole register, 2744 // Don't need to do so for src2 2745 assert(vtmp != xnoreg, "required"); 2746 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2747 pshufd(vtmp, src1, shuffle_imm); 2748 } else { 2749 assert(vtmp == xnoreg, "required"); 2750 vtmp = src1; 2751 } 2752 if (esize >= 4 && VM_Version::supports_avx()) { 2753 vtestps(vtmp, src2, AVX_128bit); 2754 } else { 2755 ptest(vtmp, src2); 2756 } 2757 } 2758 2759 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2760 #ifdef ASSERT 2761 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2762 bool is_bw_supported = VM_Version::supports_avx512bw(); 2763 if (is_bw && !is_bw_supported) { 2764 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2765 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2766 "XMM register should be 0-15"); 2767 } 2768 #endif // ASSERT 2769 switch (elem_bt) { 2770 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2771 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2772 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2773 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2774 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2775 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2776 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2777 } 2778 } 2779 2780 #ifdef _LP64 2781 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2782 assert(UseAVX >= 2, "required"); 2783 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2784 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2785 if ((UseAVX > 2) && 2786 (!is_bw || VM_Version::supports_avx512bw()) && 2787 (!is_vl || VM_Version::supports_avx512vl())) { 2788 switch (elem_bt) { 2789 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2790 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2791 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2792 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2793 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2794 } 2795 } else { 2796 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2797 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2798 switch (elem_bt) { 2799 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2800 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2801 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2802 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2803 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2804 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2805 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2806 } 2807 } 2808 } 2809 #endif 2810 2811 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2812 switch (to_elem_bt) { 2813 case T_SHORT: 2814 vpmovsxbw(dst, src, vlen_enc); 2815 break; 2816 case T_INT: 2817 
vpmovsxbd(dst, src, vlen_enc); 2818 break; 2819 case T_FLOAT: 2820 vpmovsxbd(dst, src, vlen_enc); 2821 vcvtdq2ps(dst, dst, vlen_enc); 2822 break; 2823 case T_LONG: 2824 vpmovsxbq(dst, src, vlen_enc); 2825 break; 2826 case T_DOUBLE: { 2827 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2828 vpmovsxbd(dst, src, mid_vlen_enc); 2829 vcvtdq2pd(dst, dst, vlen_enc); 2830 break; 2831 } 2832 default: 2833 fatal("Unsupported type %s", type2name(to_elem_bt)); 2834 break; 2835 } 2836 } 2837 2838 //------------------------------------------------------------------------------------------- 2839 2840 // IndexOf for constant substrings with size >= 8 chars 2841 // which don't need to be loaded through stack. 2842 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2843 Register cnt1, Register cnt2, 2844 int int_cnt2, Register result, 2845 XMMRegister vec, Register tmp, 2846 int ae) { 2847 ShortBranchVerifier sbv(this); 2848 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2849 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2850 2851 // This method uses the pcmpestri instruction with bound registers 2852 // inputs: 2853 // xmm - substring 2854 // rax - substring length (elements count) 2855 // mem - scanned string 2856 // rdx - string length (elements count) 2857 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2858 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2859 // outputs: 2860 // rcx - matched index in string 2861 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2862 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2863 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2864 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2865 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2866 2867 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2868 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2869 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2870 2871 // Note, inline_string_indexOf() generates checks: 2872 // if (substr.count > string.count) return -1; 2873 // if (substr.count == 0) return 0; 2874 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2875 2876 // Load substring. 2877 if (ae == StrIntrinsicNode::UL) { 2878 pmovzxbw(vec, Address(str2, 0)); 2879 } else { 2880 movdqu(vec, Address(str2, 0)); 2881 } 2882 movl(cnt2, int_cnt2); 2883 movptr(result, str1); // string addr 2884 2885 if (int_cnt2 > stride) { 2886 jmpb(SCAN_TO_SUBSTR); 2887 2888 // Reload substr for rescan; this code 2889 // is executed only for large substrings (> 8 chars) 2890 bind(RELOAD_SUBSTR); 2891 if (ae == StrIntrinsicNode::UL) { 2892 pmovzxbw(vec, Address(str2, 0)); 2893 } else { 2894 movdqu(vec, Address(str2, 0)); 2895 } 2896 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2897 2898 bind(RELOAD_STR); 2899 // We came here after the beginning of the substring was 2900 // matched but the rest of it was not, so we need to search 2901 // again. Start from the next element after the previous match. 2902 2903 // cnt2 is the number of remaining substring elements and 2904 // cnt1 is the number of remaining string elements when cmp failed.
2905 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2906 subl(cnt1, cnt2); 2907 addl(cnt1, int_cnt2); 2908 movl(cnt2, int_cnt2); // Now restore cnt2 2909 2910 decrementl(cnt1); // Shift to next element 2911 cmpl(cnt1, cnt2); 2912 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2913 2914 addptr(result, (1<<scale1)); 2915 2916 } // (int_cnt2 > 8) 2917 2918 // Scan string for start of substr in 16-byte vectors 2919 bind(SCAN_TO_SUBSTR); 2920 pcmpestri(vec, Address(result, 0), mode); 2921 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2922 subl(cnt1, stride); 2923 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2924 cmpl(cnt1, cnt2); 2925 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2926 addptr(result, 16); 2927 jmpb(SCAN_TO_SUBSTR); 2928 2929 // Found a potential substr 2930 bind(FOUND_CANDIDATE); 2931 // Matched whole vector if first element matched (tmp(rcx) == 0). 2932 if (int_cnt2 == stride) { 2933 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2934 } else { // int_cnt2 > 8 2935 jccb(Assembler::overflow, FOUND_SUBSTR); 2936 } 2937 // After pcmpestri tmp(rcx) contains matched element index 2938 // Compute start addr of substr 2939 lea(result, Address(result, tmp, scale1)); 2940 2941 // Make sure string is still long enough 2942 subl(cnt1, tmp); 2943 cmpl(cnt1, cnt2); 2944 if (int_cnt2 == stride) { 2945 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2946 } else { // int_cnt2 > 8 2947 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2948 } 2949 // Left less than substring. 2950 2951 bind(RET_NOT_FOUND); 2952 movl(result, -1); 2953 jmp(EXIT); 2954 2955 if (int_cnt2 > stride) { 2956 // This code is optimized for the case when the whole substring 2957 // is matched if its head is matched. 2958 bind(MATCH_SUBSTR_HEAD); 2959 pcmpestri(vec, Address(result, 0), mode); 2960 // Reload only the string if it does not match 2961 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2962 2963 Label CONT_SCAN_SUBSTR; 2964 // Compare the rest of substring (> 8 chars). 2965 bind(FOUND_SUBSTR); 2966 // First 8 chars are already matched.
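// From here cnt2 serves as a negative element offset from the end of the substring (and of the
// candidate match in the string): it starts at -(int_cnt2 - stride) and is advanced by 'stride'
// per SCAN_SUBSTR iteration until it becomes non-negative.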
2967 negptr(cnt2); 2968 addptr(cnt2, stride); 2969 2970 bind(SCAN_SUBSTR); 2971 subl(cnt1, stride); 2972 cmpl(cnt2, -stride); // Do not read beyond substring 2973 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2974 // Back-up strings to avoid reading beyond substring: 2975 // cnt1 = cnt1 - cnt2 + 8 2976 addl(cnt1, cnt2); // cnt2 is negative 2977 addl(cnt1, stride); 2978 movl(cnt2, stride); negptr(cnt2); 2979 bind(CONT_SCAN_SUBSTR); 2980 if (int_cnt2 < (int)G) { 2981 int tail_off1 = int_cnt2<<scale1; 2982 int tail_off2 = int_cnt2<<scale2; 2983 if (ae == StrIntrinsicNode::UL) { 2984 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2985 } else { 2986 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2987 } 2988 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2989 } else { 2990 // calculate index in register to avoid integer overflow (int_cnt2*2) 2991 movl(tmp, int_cnt2); 2992 addptr(tmp, cnt2); 2993 if (ae == StrIntrinsicNode::UL) { 2994 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2995 } else { 2996 movdqu(vec, Address(str2, tmp, scale2, 0)); 2997 } 2998 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2999 } 3000 // Need to reload strings pointers if not matched whole vector 3001 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3002 addptr(cnt2, stride); 3003 jcc(Assembler::negative, SCAN_SUBSTR); 3004 // Fall through if found full substring 3005 3006 } // (int_cnt2 > 8) 3007 3008 bind(RET_FOUND); 3009 // Found result if we matched full small substring. 3010 // Compute substr offset 3011 subptr(result, str1); 3012 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3013 shrl(result, 1); // index 3014 } 3015 bind(EXIT); 3016 3017 } // string_indexofC8 3018 3019 // Small strings are loaded through stack if they cross page boundary. 3020 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3021 Register cnt1, Register cnt2, 3022 int int_cnt2, Register result, 3023 XMMRegister vec, Register tmp, 3024 int ae) { 3025 ShortBranchVerifier sbv(this); 3026 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3027 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3028 3029 // 3030 // int_cnt2 is length of small (< 8 chars) constant substring 3031 // or (-1) for non constant substring in which case its length 3032 // is in cnt2 register. 3033 // 3034 // Note, inline_string_indexOf() generates checks: 3035 // if (substr.count > string.count) return -1; 3036 // if (substr.count == 0) return 0; 3037 // 3038 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3039 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3040 // This method uses the pcmpestri instruction with bound registers 3041 // inputs: 3042 // xmm - substring 3043 // rax - substring length (elements count) 3044 // mem - scanned string 3045 // rdx - string length (elements count) 3046 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3047 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3048 // outputs: 3049 // rcx - matched index in string 3050 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3051 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3052 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3053 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3054 3055 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3056 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3057 FOUND_CANDIDATE; 3058 3059 { //======================================================== 3060 // We don't know where these strings are located 3061 // and we can't read beyond them. Load them through stack. 3062 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3063 3064 movptr(tmp, rsp); // save old SP 3065 3066 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3067 if (int_cnt2 == (1>>scale2)) { // One byte 3068 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3069 load_unsigned_byte(result, Address(str2, 0)); 3070 movdl(vec, result); // move 32 bits 3071 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3072 // Not enough header space in 32-bit VM: 12+3 = 15. 3073 movl(result, Address(str2, -1)); 3074 shrl(result, 8); 3075 movdl(vec, result); // move 32 bits 3076 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3077 load_unsigned_short(result, Address(str2, 0)); 3078 movdl(vec, result); // move 32 bits 3079 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3080 movdl(vec, Address(str2, 0)); // move 32 bits 3081 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3082 movq(vec, Address(str2, 0)); // move 64 bits 3083 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3084 // Array header size is 12 bytes in 32-bit VM 3085 // + 6 bytes for 3 chars == 18 bytes, 3086 // enough space to load vec and shift. 3087 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3088 if (ae == StrIntrinsicNode::UL) { 3089 int tail_off = int_cnt2-8; 3090 pmovzxbw(vec, Address(str2, tail_off)); 3091 psrldq(vec, -2*tail_off); 3092 } 3093 else { 3094 int tail_off = int_cnt2*(1<<scale2); 3095 movdqu(vec, Address(str2, tail_off-16)); 3096 psrldq(vec, 16-tail_off); 3097 } 3098 } 3099 } else { // not constant substring 3100 cmpl(cnt2, stride); 3101 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3102 3103 // We can read beyond string if srt+16 does not cross page boundary 3104 // since heaps are aligned and mapped by pages. 3105 assert(os::vm_page_size() < (int)G, "default page should be small"); 3106 movl(result, str2); // We need only low 32 bits 3107 andl(result, ((int)os::vm_page_size()-1)); 3108 cmpl(result, ((int)os::vm_page_size()-16)); 3109 jccb(Assembler::belowEqual, CHECK_STR); 3110 3111 // Move small strings to stack to allow load 16 bytes into vec. 3112 subptr(rsp, 16); 3113 int stk_offset = wordSize-(1<<scale2); 3114 push(cnt2); 3115 3116 bind(COPY_SUBSTR); 3117 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3118 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3119 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3120 } else if (ae == StrIntrinsicNode::UU) { 3121 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3122 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3123 } 3124 decrement(cnt2); 3125 jccb(Assembler::notZero, COPY_SUBSTR); 3126 3127 pop(cnt2); 3128 movptr(str2, rsp); // New substring address 3129 } // non constant 3130 3131 bind(CHECK_STR); 3132 cmpl(cnt1, stride); 3133 jccb(Assembler::aboveEqual, BIG_STRINGS); 3134 3135 // Check cross page boundary. 
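// As with str2 above: if str1 starts within the last 15 bytes of a page, a
// 16-byte load from it could touch the next (possibly unmapped) page, so the
// short string is copied to the stack below.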
3136 movl(result, str1); // We need only low 32 bits 3137 andl(result, ((int)os::vm_page_size()-1)); 3138 cmpl(result, ((int)os::vm_page_size()-16)); 3139 jccb(Assembler::belowEqual, BIG_STRINGS); 3140 3141 subptr(rsp, 16); 3142 int stk_offset = -(1<<scale1); 3143 if (int_cnt2 < 0) { // not constant 3144 push(cnt2); 3145 stk_offset += wordSize; 3146 } 3147 movl(cnt2, cnt1); 3148 3149 bind(COPY_STR); 3150 if (ae == StrIntrinsicNode::LL) { 3151 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3152 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3153 } else { 3154 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3155 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3156 } 3157 decrement(cnt2); 3158 jccb(Assembler::notZero, COPY_STR); 3159 3160 if (int_cnt2 < 0) { // not constant 3161 pop(cnt2); 3162 } 3163 movptr(str1, rsp); // New string address 3164 3165 bind(BIG_STRINGS); 3166 // Load substring. 3167 if (int_cnt2 < 0) { // -1 3168 if (ae == StrIntrinsicNode::UL) { 3169 pmovzxbw(vec, Address(str2, 0)); 3170 } else { 3171 movdqu(vec, Address(str2, 0)); 3172 } 3173 push(cnt2); // substr count 3174 push(str2); // substr addr 3175 push(str1); // string addr 3176 } else { 3177 // Small (< 8 chars) constant substrings are loaded already. 3178 movl(cnt2, int_cnt2); 3179 } 3180 push(tmp); // original SP 3181 3182 } // Finished loading 3183 3184 //======================================================== 3185 // Start search 3186 // 3187 3188 movptr(result, str1); // string addr 3189 3190 if (int_cnt2 < 0) { // Only for non constant substring 3191 jmpb(SCAN_TO_SUBSTR); 3192 3193 // SP saved at sp+0 3194 // String saved at sp+1*wordSize 3195 // Substr saved at sp+2*wordSize 3196 // Substr count saved at sp+3*wordSize 3197 3198 // Reload substr for rescan, this code 3199 // is executed only for large substrings (> 8 chars) 3200 bind(RELOAD_SUBSTR); 3201 movptr(str2, Address(rsp, 2*wordSize)); 3202 movl(cnt2, Address(rsp, 3*wordSize)); 3203 if (ae == StrIntrinsicNode::UL) { 3204 pmovzxbw(vec, Address(str2, 0)); 3205 } else { 3206 movdqu(vec, Address(str2, 0)); 3207 } 3208 // We came here after the beginning of the substring was 3209 // matched but the rest of it was not so we need to search 3210 // again. Start from the next element after the previous match. 3211 subptr(str1, result); // Restore counter 3212 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3213 shrl(str1, 1); 3214 } 3215 addl(cnt1, str1); 3216 decrementl(cnt1); // Shift to next element 3217 cmpl(cnt1, cnt2); 3218 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3219 3220 addptr(result, (1<<scale1)); 3221 } // non constant 3222 3223 // Scan string for start of substr in 16-byte vectors 3224 bind(SCAN_TO_SUBSTR); 3225 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3226 pcmpestri(vec, Address(result, 0), mode); 3227 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3228 subl(cnt1, stride); 3229 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3230 cmpl(cnt1, cnt2); 3231 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3232 addptr(result, 16); 3233 3234 bind(ADJUST_STR); 3235 cmpl(cnt1, stride); // Do not read beyond string 3236 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3237 // Back-up string to avoid reading beyond string. 
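// Fewer than a full stride of elements remain: step result back so that the
// final 16-byte load ends exactly at the end of the string, then rescan a full
// stride from there.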
3238 lea(result, Address(result, cnt1, scale1, -16)); 3239 movl(cnt1, stride); 3240 jmpb(SCAN_TO_SUBSTR); 3241 3242 // Found a potential substr 3243 bind(FOUND_CANDIDATE); 3244 // After pcmpestri tmp(rcx) contains matched element index 3245 3246 // Make sure string is still long enough 3247 subl(cnt1, tmp); 3248 cmpl(cnt1, cnt2); 3249 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3250 // Left less then substring. 3251 3252 bind(RET_NOT_FOUND); 3253 movl(result, -1); 3254 jmp(CLEANUP); 3255 3256 bind(FOUND_SUBSTR); 3257 // Compute start addr of substr 3258 lea(result, Address(result, tmp, scale1)); 3259 if (int_cnt2 > 0) { // Constant substring 3260 // Repeat search for small substring (< 8 chars) 3261 // from new point without reloading substring. 3262 // Have to check that we don't read beyond string. 3263 cmpl(tmp, stride-int_cnt2); 3264 jccb(Assembler::greater, ADJUST_STR); 3265 // Fall through if matched whole substring. 3266 } else { // non constant 3267 assert(int_cnt2 == -1, "should be != 0"); 3268 3269 addl(tmp, cnt2); 3270 // Found result if we matched whole substring. 3271 cmpl(tmp, stride); 3272 jcc(Assembler::lessEqual, RET_FOUND); 3273 3274 // Repeat search for small substring (<= 8 chars) 3275 // from new point 'str1' without reloading substring. 3276 cmpl(cnt2, stride); 3277 // Have to check that we don't read beyond string. 3278 jccb(Assembler::lessEqual, ADJUST_STR); 3279 3280 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3281 // Compare the rest of substring (> 8 chars). 3282 movptr(str1, result); 3283 3284 cmpl(tmp, cnt2); 3285 // First 8 chars are already matched. 3286 jccb(Assembler::equal, CHECK_NEXT); 3287 3288 bind(SCAN_SUBSTR); 3289 pcmpestri(vec, Address(str1, 0), mode); 3290 // Need to reload strings pointers if not matched whole vector 3291 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3292 3293 bind(CHECK_NEXT); 3294 subl(cnt2, stride); 3295 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3296 addptr(str1, 16); 3297 if (ae == StrIntrinsicNode::UL) { 3298 addptr(str2, 8); 3299 } else { 3300 addptr(str2, 16); 3301 } 3302 subl(cnt1, stride); 3303 cmpl(cnt2, stride); // Do not read beyond substring 3304 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3305 // Back-up strings to avoid reading beyond substring. 
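// Fewer than a full stride of substring elements remain: step str1 and str2
// back so the final vector loads end exactly at the end of the substring, and
// adjust the counters to match (cnt1 += stride - cnt2; cnt2 = stride).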
3306 3307 if (ae == StrIntrinsicNode::UL) { 3308 lea(str2, Address(str2, cnt2, scale2, -8)); 3309 lea(str1, Address(str1, cnt2, scale1, -16)); 3310 } else { 3311 lea(str2, Address(str2, cnt2, scale2, -16)); 3312 lea(str1, Address(str1, cnt2, scale1, -16)); 3313 } 3314 subl(cnt1, cnt2); 3315 movl(cnt2, stride); 3316 addl(cnt1, stride); 3317 bind(CONT_SCAN_SUBSTR); 3318 if (ae == StrIntrinsicNode::UL) { 3319 pmovzxbw(vec, Address(str2, 0)); 3320 } else { 3321 movdqu(vec, Address(str2, 0)); 3322 } 3323 jmp(SCAN_SUBSTR); 3324 3325 bind(RET_FOUND_LONG); 3326 movptr(str1, Address(rsp, wordSize)); 3327 } // non constant 3328 3329 bind(RET_FOUND); 3330 // Compute substr offset 3331 subptr(result, str1); 3332 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3333 shrl(result, 1); // index 3334 } 3335 bind(CLEANUP); 3336 pop(rsp); // restore SP 3337 3338 } // string_indexof 3339 3340 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3341 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3342 ShortBranchVerifier sbv(this); 3343 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3344 3345 int stride = 8; 3346 3347 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3348 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3349 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3350 FOUND_SEQ_CHAR, DONE_LABEL; 3351 3352 movptr(result, str1); 3353 if (UseAVX >= 2) { 3354 cmpl(cnt1, stride); 3355 jcc(Assembler::less, SCAN_TO_CHAR); 3356 cmpl(cnt1, 2*stride); 3357 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3358 movdl(vec1, ch); 3359 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3360 vpxor(vec2, vec2); 3361 movl(tmp, cnt1); 3362 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3363 andl(cnt1,0x0000000F); //tail count (in chars) 3364 3365 bind(SCAN_TO_16_CHAR_LOOP); 3366 vmovdqu(vec3, Address(result, 0)); 3367 vpcmpeqw(vec3, vec3, vec1, 1); 3368 vptest(vec2, vec3); 3369 jcc(Assembler::carryClear, FOUND_CHAR); 3370 addptr(result, 32); 3371 subl(tmp, 2*stride); 3372 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3373 jmp(SCAN_TO_8_CHAR); 3374 bind(SCAN_TO_8_CHAR_INIT); 3375 movdl(vec1, ch); 3376 pshuflw(vec1, vec1, 0x00); 3377 pshufd(vec1, vec1, 0); 3378 pxor(vec2, vec2); 3379 } 3380 bind(SCAN_TO_8_CHAR); 3381 cmpl(cnt1, stride); 3382 jcc(Assembler::less, SCAN_TO_CHAR); 3383 if (UseAVX < 2) { 3384 movdl(vec1, ch); 3385 pshuflw(vec1, vec1, 0x00); 3386 pshufd(vec1, vec1, 0); 3387 pxor(vec2, vec2); 3388 } 3389 movl(tmp, cnt1); 3390 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3391 andl(cnt1,0x00000007); //tail count (in chars) 3392 3393 bind(SCAN_TO_8_CHAR_LOOP); 3394 movdqu(vec3, Address(result, 0)); 3395 pcmpeqw(vec3, vec1); 3396 ptest(vec2, vec3); 3397 jcc(Assembler::carryClear, FOUND_CHAR); 3398 addptr(result, 16); 3399 subl(tmp, stride); 3400 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3401 bind(SCAN_TO_CHAR); 3402 testl(cnt1, cnt1); 3403 jcc(Assembler::zero, RET_NOT_FOUND); 3404 bind(SCAN_TO_CHAR_LOOP); 3405 load_unsigned_short(tmp, Address(result, 0)); 3406 cmpl(ch, tmp); 3407 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3408 addptr(result, 2); 3409 subl(cnt1, 1); 3410 jccb(Assembler::zero, RET_NOT_FOUND); 3411 jmp(SCAN_TO_CHAR_LOOP); 3412 3413 bind(RET_NOT_FOUND); 3414 movl(result, -1); 3415 jmpb(DONE_LABEL); 3416 3417 bind(FOUND_CHAR); 3418 if (UseAVX >= 2) { 3419 vpmovmskb(tmp, vec3); 3420 } else { 3421 pmovmskb(tmp, vec3); 3422 } 3423 bsfl(ch, tmp); 3424 addptr(result, ch); 3425 3426 bind(FOUND_SEQ_CHAR); 3427 
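// result points at the matching char; the subtraction and shift below convert
// that byte offset from the start of the string into a char index.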
subptr(result, str1); 3428 shrl(result, 1); 3429 3430 bind(DONE_LABEL); 3431 } // string_indexof_char 3432 3433 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3434 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3435 ShortBranchVerifier sbv(this); 3436 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3437 3438 int stride = 16; 3439 3440 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3441 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3442 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3443 FOUND_SEQ_CHAR, DONE_LABEL; 3444 3445 movptr(result, str1); 3446 if (UseAVX >= 2) { 3447 cmpl(cnt1, stride); 3448 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3449 cmpl(cnt1, stride*2); 3450 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3451 movdl(vec1, ch); 3452 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3453 vpxor(vec2, vec2); 3454 movl(tmp, cnt1); 3455 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3456 andl(cnt1,0x0000001F); //tail count (in chars) 3457 3458 bind(SCAN_TO_32_CHAR_LOOP); 3459 vmovdqu(vec3, Address(result, 0)); 3460 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3461 vptest(vec2, vec3); 3462 jcc(Assembler::carryClear, FOUND_CHAR); 3463 addptr(result, 32); 3464 subl(tmp, stride*2); 3465 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3466 jmp(SCAN_TO_16_CHAR); 3467 3468 bind(SCAN_TO_16_CHAR_INIT); 3469 movdl(vec1, ch); 3470 pxor(vec2, vec2); 3471 pshufb(vec1, vec2); 3472 } 3473 3474 bind(SCAN_TO_16_CHAR); 3475 cmpl(cnt1, stride); 3476 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3477 if (UseAVX < 2) { 3478 movdl(vec1, ch); 3479 pxor(vec2, vec2); 3480 pshufb(vec1, vec2); 3481 } 3482 movl(tmp, cnt1); 3483 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3484 andl(cnt1,0x0000000F); //tail count (in bytes) 3485 3486 bind(SCAN_TO_16_CHAR_LOOP); 3487 movdqu(vec3, Address(result, 0)); 3488 pcmpeqb(vec3, vec1); 3489 ptest(vec2, vec3); 3490 jcc(Assembler::carryClear, FOUND_CHAR); 3491 addptr(result, 16); 3492 subl(tmp, stride); 3493 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
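// Scalar tail: fewer than 16 bytes remain; examine them one at a time.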
3494 3495 bind(SCAN_TO_CHAR_INIT); 3496 testl(cnt1, cnt1); 3497 jcc(Assembler::zero, RET_NOT_FOUND); 3498 bind(SCAN_TO_CHAR_LOOP); 3499 load_unsigned_byte(tmp, Address(result, 0)); 3500 cmpl(ch, tmp); 3501 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3502 addptr(result, 1); 3503 subl(cnt1, 1); 3504 jccb(Assembler::zero, RET_NOT_FOUND); 3505 jmp(SCAN_TO_CHAR_LOOP); 3506 3507 bind(RET_NOT_FOUND); 3508 movl(result, -1); 3509 jmpb(DONE_LABEL); 3510 3511 bind(FOUND_CHAR); 3512 if (UseAVX >= 2) { 3513 vpmovmskb(tmp, vec3); 3514 } else { 3515 pmovmskb(tmp, vec3); 3516 } 3517 bsfl(ch, tmp); 3518 addptr(result, ch); 3519 3520 bind(FOUND_SEQ_CHAR); 3521 subptr(result, str1); 3522 3523 bind(DONE_LABEL); 3524 } // stringL_indexof_char 3525 3526 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3527 switch (eltype) { 3528 case T_BOOLEAN: return sizeof(jboolean); 3529 case T_BYTE: return sizeof(jbyte); 3530 case T_SHORT: return sizeof(jshort); 3531 case T_CHAR: return sizeof(jchar); 3532 case T_INT: return sizeof(jint); 3533 default: 3534 ShouldNotReachHere(); 3535 return -1; 3536 } 3537 } 3538 3539 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3540 switch (eltype) { 3541 // T_BOOLEAN used as surrogate for unsigned byte 3542 case T_BOOLEAN: movzbl(dst, src); break; 3543 case T_BYTE: movsbl(dst, src); break; 3544 case T_SHORT: movswl(dst, src); break; 3545 case T_CHAR: movzwl(dst, src); break; 3546 case T_INT: movl(dst, src); break; 3547 default: 3548 ShouldNotReachHere(); 3549 } 3550 } 3551 3552 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3553 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3554 } 3555 3556 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3557 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3558 } 3559 3560 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3561 const int vlen = Assembler::AVX_256bit; 3562 switch (eltype) { 3563 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3564 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3565 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3566 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3567 case T_INT: 3568 // do nothing 3569 break; 3570 default: 3571 ShouldNotReachHere(); 3572 } 3573 } 3574 3575 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3576 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3577 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3578 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3579 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3580 BasicType eltype) { 3581 ShortBranchVerifier sbv(this); 3582 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3583 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3584 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3585 3586 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3587 SHORT_UNROLLED_LOOP_EXIT, 3588 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3589 UNROLLED_VECTOR_LOOP_BEGIN, 3590 END; 3591 switch (eltype) { 3592 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3593 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3594 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3595 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3596 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3597 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3598 } 3599 3600 // For "renaming" for readibility of the code 3601 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3602 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3603 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3604 3605 const int elsize = arrays_hashcode_elsize(eltype); 3606 3607 /* 3608 if (cnt1 >= 2) { 3609 if (cnt1 >= 32) { 3610 UNROLLED VECTOR LOOP 3611 } 3612 UNROLLED SCALAR LOOP 3613 } 3614 SINGLE SCALAR 3615 */ 3616 3617 cmpl(cnt1, 32); 3618 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3619 3620 // cnt1 >= 32 && generate_vectorized_loop 3621 xorl(index, index); 3622 3623 // vresult = IntVector.zero(I256); 3624 for (int idx = 0; idx < 4; idx++) { 3625 vpxor(vresult[idx], vresult[idx]); 3626 } 3627 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3628 Register bound = tmp2; 3629 Register next = tmp3; 3630 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3631 movl(next, Address(tmp2, 0)); 3632 movdl(vnext, next); 3633 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3634 3635 // index = 0; 3636 // bound = cnt1 & ~(32 - 1); 3637 movl(bound, cnt1); 3638 andl(bound, ~(32 - 1)); 3639 // for (; index < bound; index += 32) { 3640 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3641 // result *= next; 3642 imull(result, next); 3643 // loop fission to upfront the cost of fetching from memory, OOO execution 3644 // can then hopefully do a better job of prefetching 3645 for (int idx = 0; idx < 4; idx++) { 3646 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3647 } 3648 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3649 for (int idx = 0; idx < 4; idx++) { 3650 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3651 arrays_hashcode_elvcast(vtmp[idx], eltype); 3652 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3653 } 3654 // index += 32; 3655 addl(index, 32); 3656 // index < bound; 3657 cmpl(index, bound); 3658 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3659 // } 3660 3661 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3662 subl(cnt1, bound); 3663 // release bound 3664 3665 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3666 for (int idx = 0; idx < 4; idx++) { 3667 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3668 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3669 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3670 } 3671 // result += vresult.reduceLanes(ADD); 3672 for (int idx = 0; idx < 4; idx++) { 3673 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3674 } 3675 3676 // } else if (cnt1 < 32) { 3677 3678 bind(SHORT_UNROLLED_BEGIN); 3679 // int i = 1; 3680 movl(index, 1); 3681 cmpl(index, cnt1); 3682 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3683 3684 // for (; i < cnt1 ; i += 2) { 3685 bind(SHORT_UNROLLED_LOOP_BEGIN); 3686 movl(tmp3, 961); 3687 imull(result, tmp3); 3688 
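// The imull by 961 above multiplies result by 31*31, and the (x << 5) - x
// sequence below computes 31*x, so each pass of this two-at-a-time loop is
// equivalent to the following sketch (i advances by 2):
//
//   result = 31*31*result + 31*a[i-1] + a[i];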
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3689 movl(tmp3, tmp2); 3690 shll(tmp3, 5); 3691 subl(tmp3, tmp2); 3692 addl(result, tmp3); 3693 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3694 addl(result, tmp3); 3695 addl(index, 2); 3696 cmpl(index, cnt1); 3697 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3698 3699 // } 3700 // if (i >= cnt1) { 3701 bind(SHORT_UNROLLED_LOOP_EXIT); 3702 jccb(Assembler::greater, END); 3703 movl(tmp2, result); 3704 shll(result, 5); 3705 subl(result, tmp2); 3706 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3707 addl(result, tmp3); 3708 // } 3709 bind(END); 3710 3711 BLOCK_COMMENT("} // arrays_hashcode"); 3712 3713 } // arrays_hashcode 3714 3715 // helper function for string_compare 3716 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3717 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3718 Address::ScaleFactor scale2, Register index, int ae) { 3719 if (ae == StrIntrinsicNode::LL) { 3720 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3721 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3722 } else if (ae == StrIntrinsicNode::UU) { 3723 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3724 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3725 } else { 3726 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3727 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3728 } 3729 } 3730 3731 // Compare strings, used for char[] and byte[]. 3732 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3733 Register cnt1, Register cnt2, Register result, 3734 XMMRegister vec1, int ae, KRegister mask) { 3735 ShortBranchVerifier sbv(this); 3736 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3737 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3738 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3739 int stride2x2 = 0x40; 3740 Address::ScaleFactor scale = Address::no_scale; 3741 Address::ScaleFactor scale1 = Address::no_scale; 3742 Address::ScaleFactor scale2 = Address::no_scale; 3743 3744 if (ae != StrIntrinsicNode::LL) { 3745 stride2x2 = 0x20; 3746 } 3747 3748 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3749 shrl(cnt2, 1); 3750 } 3751 // Compute the minimum of the string lengths and the 3752 // difference of the string lengths (stack). 3753 // Do the conditional move stuff 3754 movl(result, cnt1); 3755 subl(cnt1, cnt2); 3756 push(cnt1); 3757 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3758 3759 // Is the minimum length zero? 
3760 testl(cnt2, cnt2); 3761 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3762 if (ae == StrIntrinsicNode::LL) { 3763 // Load first bytes 3764 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3765 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3766 } else if (ae == StrIntrinsicNode::UU) { 3767 // Load first characters 3768 load_unsigned_short(result, Address(str1, 0)); 3769 load_unsigned_short(cnt1, Address(str2, 0)); 3770 } else { 3771 load_unsigned_byte(result, Address(str1, 0)); 3772 load_unsigned_short(cnt1, Address(str2, 0)); 3773 } 3774 subl(result, cnt1); 3775 jcc(Assembler::notZero, POP_LABEL); 3776 3777 if (ae == StrIntrinsicNode::UU) { 3778 // Divide length by 2 to get number of chars 3779 shrl(cnt2, 1); 3780 } 3781 cmpl(cnt2, 1); 3782 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3783 3784 // Check if the strings start at the same location and setup scale and stride 3785 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3786 cmpptr(str1, str2); 3787 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3788 if (ae == StrIntrinsicNode::LL) { 3789 scale = Address::times_1; 3790 stride = 16; 3791 } else { 3792 scale = Address::times_2; 3793 stride = 8; 3794 } 3795 } else { 3796 scale1 = Address::times_1; 3797 scale2 = Address::times_2; 3798 // scale not used 3799 stride = 8; 3800 } 3801 3802 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3803 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3804 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3805 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3806 Label COMPARE_TAIL_LONG; 3807 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3808 3809 int pcmpmask = 0x19; 3810 if (ae == StrIntrinsicNode::LL) { 3811 pcmpmask &= ~0x01; 3812 } 3813 3814 // Setup to compare 16-chars (32-bytes) vectors, 3815 // start from first character again because it has aligned address. 3816 if (ae == StrIntrinsicNode::LL) { 3817 stride2 = 32; 3818 } else { 3819 stride2 = 16; 3820 } 3821 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3822 adr_stride = stride << scale; 3823 } else { 3824 adr_stride1 = 8; //stride << scale1; 3825 adr_stride2 = 16; //stride << scale2; 3826 } 3827 3828 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3829 // rax and rdx are used by pcmpestri as elements counters 3830 movl(result, cnt2); 3831 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3832 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3833 3834 // fast path : compare first 2 8-char vectors. 
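// (a mismatch inside these first two vectors is resolved at COMPARE_INDEX_CHAR
// without entering the wide vector loop)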
3835 bind(COMPARE_16_CHARS); 3836 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3837 movdqu(vec1, Address(str1, 0)); 3838 } else { 3839 pmovzxbw(vec1, Address(str1, 0)); 3840 } 3841 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3842 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3843 3844 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3845 movdqu(vec1, Address(str1, adr_stride)); 3846 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3847 } else { 3848 pmovzxbw(vec1, Address(str1, adr_stride1)); 3849 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3850 } 3851 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3852 addl(cnt1, stride); 3853 3854 // Compare the characters at index in cnt1 3855 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3856 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3857 subl(result, cnt2); 3858 jmp(POP_LABEL); 3859 3860 // Setup the registers to start vector comparison loop 3861 bind(COMPARE_WIDE_VECTORS); 3862 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3863 lea(str1, Address(str1, result, scale)); 3864 lea(str2, Address(str2, result, scale)); 3865 } else { 3866 lea(str1, Address(str1, result, scale1)); 3867 lea(str2, Address(str2, result, scale2)); 3868 } 3869 subl(result, stride2); 3870 subl(cnt2, stride2); 3871 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3872 negptr(result); 3873 3874 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3875 bind(COMPARE_WIDE_VECTORS_LOOP); 3876 3877 #ifdef _LP64 3878 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3879 cmpl(cnt2, stride2x2); 3880 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3881 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3882 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3883 3884 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3885 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3886 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3887 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3888 } else { 3889 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3890 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3891 } 3892 kortestql(mask, mask); 3893 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3894 addptr(result, stride2x2); // update since we already compared at this addr 3895 subl(cnt2, stride2x2); // and sub the size too 3896 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3897 3898 vpxor(vec1, vec1); 3899 jmpb(COMPARE_WIDE_TAIL); 3900 }//if (VM_Version::supports_avx512vlbw()) 3901 #endif // _LP64 3902 3903 3904 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3905 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3906 vmovdqu(vec1, Address(str1, result, scale)); 3907 vpxor(vec1, Address(str2, result, scale)); 3908 } else { 3909 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3910 vpxor(vec1, Address(str2, result, scale2)); 3911 } 3912 vptest(vec1, vec1); 3913 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3914 addptr(result, stride2); 3915 subl(cnt2, stride2); 3916 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3917 // clean upper bits of YMM registers 
3918 vpxor(vec1, vec1); 3919 3920 // compare wide vectors tail 3921 bind(COMPARE_WIDE_TAIL); 3922 testptr(result, result); 3923 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3924 3925 movl(result, stride2); 3926 movl(cnt2, result); 3927 negptr(result); 3928 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3929 3930 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3931 bind(VECTOR_NOT_EQUAL); 3932 // clean upper bits of YMM registers 3933 vpxor(vec1, vec1); 3934 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3935 lea(str1, Address(str1, result, scale)); 3936 lea(str2, Address(str2, result, scale)); 3937 } else { 3938 lea(str1, Address(str1, result, scale1)); 3939 lea(str2, Address(str2, result, scale2)); 3940 } 3941 jmp(COMPARE_16_CHARS); 3942 3943 // Compare tail chars, length between 1 and 15 chars 3944 bind(COMPARE_TAIL_LONG); 3945 movl(cnt2, result); 3946 cmpl(cnt2, stride); 3947 jcc(Assembler::less, COMPARE_SMALL_STR); 3948 3949 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3950 movdqu(vec1, Address(str1, 0)); 3951 } else { 3952 pmovzxbw(vec1, Address(str1, 0)); 3953 } 3954 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3955 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3956 subptr(cnt2, stride); 3957 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3958 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3959 lea(str1, Address(str1, result, scale)); 3960 lea(str2, Address(str2, result, scale)); 3961 } else { 3962 lea(str1, Address(str1, result, scale1)); 3963 lea(str2, Address(str2, result, scale2)); 3964 } 3965 negptr(cnt2); 3966 jmpb(WHILE_HEAD_LABEL); 3967 3968 bind(COMPARE_SMALL_STR); 3969 } else if (UseSSE42Intrinsics) { 3970 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3971 int pcmpmask = 0x19; 3972 // Setup to compare 8-char (16-byte) vectors, 3973 // start from first character again because it has an aligned address.
3974 movl(result, cnt2); 3975 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3976 if (ae == StrIntrinsicNode::LL) { 3977 pcmpmask &= ~0x01; 3978 } 3979 jcc(Assembler::zero, COMPARE_TAIL); 3980 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3981 lea(str1, Address(str1, result, scale)); 3982 lea(str2, Address(str2, result, scale)); 3983 } else { 3984 lea(str1, Address(str1, result, scale1)); 3985 lea(str2, Address(str2, result, scale2)); 3986 } 3987 negptr(result); 3988 3989 // pcmpestri 3990 // inputs: 3991 // vec1- substring 3992 // rax - negative string length (elements count) 3993 // mem - scanned string 3994 // rdx - string length (elements count) 3995 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3996 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3997 // outputs: 3998 // rcx - first mismatched element index 3999 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4000 4001 bind(COMPARE_WIDE_VECTORS); 4002 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4003 movdqu(vec1, Address(str1, result, scale)); 4004 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4005 } else { 4006 pmovzxbw(vec1, Address(str1, result, scale1)); 4007 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4008 } 4009 // After pcmpestri cnt1(rcx) contains mismatched element index 4010 4011 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4012 addptr(result, stride); 4013 subptr(cnt2, stride); 4014 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4015 4016 // compare wide vectors tail 4017 testptr(result, result); 4018 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4019 4020 movl(cnt2, stride); 4021 movl(result, stride); 4022 negptr(result); 4023 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4024 movdqu(vec1, Address(str1, result, scale)); 4025 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4026 } else { 4027 pmovzxbw(vec1, Address(str1, result, scale1)); 4028 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4029 } 4030 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4031 4032 // Mismatched characters in the vectors 4033 bind(VECTOR_NOT_EQUAL); 4034 addptr(cnt1, result); 4035 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4036 subl(result, cnt2); 4037 jmpb(POP_LABEL); 4038 4039 bind(COMPARE_TAIL); // limit is zero 4040 movl(cnt2, result); 4041 // Fallthru to tail compare 4042 } 4043 // Shift str2 and str1 to the end of the arrays, negate min 4044 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4045 lea(str1, Address(str1, cnt2, scale)); 4046 lea(str2, Address(str2, cnt2, scale)); 4047 } else { 4048 lea(str1, Address(str1, cnt2, scale1)); 4049 lea(str2, Address(str2, cnt2, scale2)); 4050 } 4051 decrementl(cnt2); // first character was compared already 4052 negptr(cnt2); 4053 4054 // Compare the rest of the elements 4055 bind(WHILE_HEAD_LABEL); 4056 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4057 subl(result, cnt1); 4058 jccb(Assembler::notZero, POP_LABEL); 4059 increment(cnt2); 4060 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4061 4062 // Strings are equal up to min length. Return the length difference. 
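// The length difference (cnt1 - cnt2, pushed in the prologue above) is popped
// into result here.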
4063 bind(LENGTH_DIFF_LABEL); 4064 pop(result); 4065 if (ae == StrIntrinsicNode::UU) { 4066 // Divide diff by 2 to get number of chars 4067 sarl(result, 1); 4068 } 4069 jmpb(DONE_LABEL); 4070 4071 #ifdef _LP64 4072 if (VM_Version::supports_avx512vlbw()) { 4073 4074 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4075 4076 kmovql(cnt1, mask); 4077 notq(cnt1); 4078 bsfq(cnt2, cnt1); 4079 if (ae != StrIntrinsicNode::LL) { 4080 // Divide diff by 2 to get number of chars 4081 sarl(cnt2, 1); 4082 } 4083 addq(result, cnt2); 4084 if (ae == StrIntrinsicNode::LL) { 4085 load_unsigned_byte(cnt1, Address(str2, result)); 4086 load_unsigned_byte(result, Address(str1, result)); 4087 } else if (ae == StrIntrinsicNode::UU) { 4088 load_unsigned_short(cnt1, Address(str2, result, scale)); 4089 load_unsigned_short(result, Address(str1, result, scale)); 4090 } else { 4091 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4092 load_unsigned_byte(result, Address(str1, result, scale1)); 4093 } 4094 subl(result, cnt1); 4095 jmpb(POP_LABEL); 4096 }//if (VM_Version::supports_avx512vlbw()) 4097 #endif // _LP64 4098 4099 // Discard the stored length difference 4100 bind(POP_LABEL); 4101 pop(cnt1); 4102 4103 // That's it 4104 bind(DONE_LABEL); 4105 if(ae == StrIntrinsicNode::UL) { 4106 negl(result); 4107 } 4108 4109 } 4110 4111 // Search for Non-ASCII character (Negative byte value) in a byte array, 4112 // return the index of the first such character, otherwise the length 4113 // of the array segment searched. 4114 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4115 // @IntrinsicCandidate 4116 // public static int countPositives(byte[] ba, int off, int len) { 4117 // for (int i = off; i < off + len; i++) { 4118 // if (ba[i] < 0) { 4119 // return i - off; 4120 // } 4121 // } 4122 // return len; 4123 // } 4124 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4125 Register result, Register tmp1, 4126 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4127 // rsi: byte array 4128 // rcx: len 4129 // rax: result 4130 ShortBranchVerifier sbv(this); 4131 assert_different_registers(ary1, len, result, tmp1); 4132 assert_different_registers(vec1, vec2); 4133 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4134 4135 movl(result, len); // copy 4136 // len == 0 4137 testl(len, len); 4138 jcc(Assembler::zero, DONE); 4139 4140 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4141 VM_Version::supports_avx512vlbw() && 4142 VM_Version::supports_bmi2()) { 4143 4144 Label test_64_loop, test_tail, BREAK_LOOP; 4145 movl(tmp1, len); 4146 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4147 4148 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4149 andl(len, 0xffffffc0); // vector count (in chars) 4150 jccb(Assembler::zero, test_tail); 4151 4152 lea(ary1, Address(ary1, len, Address::times_1)); 4153 negptr(len); 4154 4155 bind(test_64_loop); 4156 // Check whether our 64 elements of size byte contain negatives 4157 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4158 kortestql(mask1, mask1); 4159 jcc(Assembler::notZero, BREAK_LOOP); 4160 4161 addptr(len, 64); 4162 jccb(Assembler::notZero, test_64_loop); 4163 4164 bind(test_tail); 4165 // bail out when there is nothing to be done 4166 testl(tmp1, -1); 4167 jcc(Assembler::zero, DONE); 4168 4169 4170 // check the tail for absense of negatives 4171 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4172 #ifdef _LP64 4173 { 4174 
Register tmp3_aliased = len; 4175 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4176 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4177 notq(tmp3_aliased); 4178 kmovql(mask2, tmp3_aliased); 4179 } 4180 #else 4181 Label k_init; 4182 jmp(k_init); 4183 4184 // We cannot read 64 bits from a general-purpose register, thus we move the 4185 // data required to compose 64 1's into the instruction stream. 4186 // We emit a 64-byte-wide series of elements from 0..63 which is later used 4187 // as a compare target against the tail count contained in the tmp1 register. 4188 // The result is a k register having tmp1 consecutive 1's, 4189 // counting from the least significant bit. 4190 address tmp = pc(); 4191 emit_int64(0x0706050403020100); 4192 emit_int64(0x0F0E0D0C0B0A0908); 4193 emit_int64(0x1716151413121110); 4194 emit_int64(0x1F1E1D1C1B1A1918); 4195 emit_int64(0x2726252423222120); 4196 emit_int64(0x2F2E2D2C2B2A2928); 4197 emit_int64(0x3736353433323130); 4198 emit_int64(0x3F3E3D3C3B3A3938); 4199 4200 bind(k_init); 4201 lea(len, InternalAddress(tmp)); 4202 // create mask to test for negative byte inside a vector 4203 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4204 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4205 4206 #endif 4207 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4208 ktestq(mask1, mask2); 4209 jcc(Assembler::zero, DONE); 4210 4211 // do a full check for negative bytes in the tail 4212 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len 4213 // ary1 already pointing to the right place 4214 jmpb(TAIL_START); 4215 4216 bind(BREAK_LOOP); 4217 // At least one byte in the last 64-byte block was negative. 4218 // Set up to look at the last 64 bytes as if they were a tail 4219 lea(ary1, Address(ary1, len, Address::times_1)); 4220 addptr(result, len); 4221 // Ignore the very last byte: if all others are positive, 4222 // it must be negative, so we can skip right to the 2+1 byte 4223 // end comparison at this point 4224 orl(result, 63); 4225 movl(len, 63); 4226 // Fallthru to tail compare 4227 } else { 4228 4229 if (UseAVX >= 2 && UseSSE >= 2) { 4230 // With AVX2, use 32-byte vector compare 4231 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4232 4233 // Compare 32-byte vectors 4234 testl(len, 0xffffffe0); // vector count (in bytes) 4235 jccb(Assembler::zero, TAIL_START); 4236 4237 andl(len, 0xffffffe0); 4238 lea(ary1, Address(ary1, len, Address::times_1)); 4239 negptr(len); 4240 4241 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector 4242 movdl(vec2, tmp1); 4243 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4244 4245 bind(COMPARE_WIDE_VECTORS); 4246 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4247 vptest(vec1, vec2); 4248 jccb(Assembler::notZero, BREAK_LOOP); 4249 addptr(len, 32); 4250 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4251 4252 testl(result, 0x0000001f); // any bytes remaining? 4253 jcc(Assembler::zero, DONE); 4254 4255 // Quick test using the already prepared vector mask 4256 movl(len, result); 4257 andl(len, 0x0000001f); 4258 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4259 vptest(vec1, vec2); 4260 jcc(Assembler::zero, DONE); 4261 // There are negative bytes; jump to the tail to determine exactly where 4262 jmpb(TAIL_START); 4263 4264 bind(BREAK_LOOP); 4265 // At least one byte in the last 32-byte vector is negative.
4266 // Set up to look at the last 32 bytes as if they were a tail 4267 lea(ary1, Address(ary1, len, Address::times_1)); 4268 addptr(result, len); 4269 // Ignore the very last byte: if all others are positive, 4270 // it must be negative, so we can skip right to the 2+1 byte 4271 // end comparison at this point 4272 orl(result, 31); 4273 movl(len, 31); 4274 // Fallthru to tail compare 4275 } else if (UseSSE42Intrinsics) { 4276 // With SSE4.2, use double quad vector compare 4277 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4278 4279 // Compare 16-byte vectors 4280 testl(len, 0xfffffff0); // vector count (in bytes) 4281 jcc(Assembler::zero, TAIL_START); 4282 4283 andl(len, 0xfffffff0); 4284 lea(ary1, Address(ary1, len, Address::times_1)); 4285 negptr(len); 4286 4287 movl(tmp1, 0x80808080); 4288 movdl(vec2, tmp1); 4289 pshufd(vec2, vec2, 0); 4290 4291 bind(COMPARE_WIDE_VECTORS); 4292 movdqu(vec1, Address(ary1, len, Address::times_1)); 4293 ptest(vec1, vec2); 4294 jccb(Assembler::notZero, BREAK_LOOP); 4295 addptr(len, 16); 4296 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4297 4298 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4299 jcc(Assembler::zero, DONE); 4300 4301 // Quick test using the already prepared vector mask 4302 movl(len, result); 4303 andl(len, 0x0000000f); // tail count (in bytes) 4304 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4305 ptest(vec1, vec2); 4306 jcc(Assembler::zero, DONE); 4307 jmpb(TAIL_START); 4308 4309 bind(BREAK_LOOP); 4310 // At least one byte in the last 16-byte vector is negative. 4311 // Set up and look at the last 16 bytes as if they were a tail 4312 lea(ary1, Address(ary1, len, Address::times_1)); 4313 addptr(result, len); 4314 // Ignore the very last byte: if all others are positive, 4315 // it must be negative, so we can skip right to the 2+1 byte 4316 // end comparison at this point 4317 orl(result, 15); 4318 movl(len, 15); 4319 // Fallthru to tail compare 4320 } 4321 } 4322 4323 bind(TAIL_START); 4324 // Compare 4-byte vectors 4325 andl(len, 0xfffffffc); // vector count (in bytes) 4326 jccb(Assembler::zero, COMPARE_CHAR); 4327 4328 lea(ary1, Address(ary1, len, Address::times_1)); 4329 negptr(len); 4330 4331 bind(COMPARE_VECTORS); 4332 movl(tmp1, Address(ary1, len, Address::times_1)); 4333 andl(tmp1, 0x80808080); 4334 jccb(Assembler::notZero, TAIL_ADJUST); 4335 addptr(len, 4); 4336 jccb(Assembler::notZero, COMPARE_VECTORS); 4337 4338 // Compare trailing char (final 2-3 bytes), if any 4339 bind(COMPARE_CHAR); 4340 4341 testl(result, 0x2); // tail char 4342 jccb(Assembler::zero, COMPARE_BYTE); 4343 load_unsigned_short(tmp1, Address(ary1, 0)); 4344 andl(tmp1, 0x00008080); 4345 jccb(Assembler::notZero, CHAR_ADJUST); 4346 lea(ary1, Address(ary1, 2)); 4347 4348 bind(COMPARE_BYTE); 4349 testl(result, 0x1); // tail byte 4350 jccb(Assembler::zero, DONE); 4351 load_unsigned_byte(tmp1, Address(ary1, 0)); 4352 testl(tmp1, 0x00000080); 4353 jccb(Assembler::zero, DONE); 4354 subptr(result, 1); 4355 jmpb(DONE); 4356 4357 bind(TAIL_ADJUST); 4358 // there are negative bits in the last 4 byte block. 4359 // Adjust result and check the next three bytes 4360 addptr(result, len); 4361 orl(result, 3); 4362 lea(ary1, Address(ary1, len, Address::times_1)); 4363 jmpb(COMPARE_CHAR); 4364 4365 bind(CHAR_ADJUST); 4366 // We are looking at a char + optional byte tail, and found that one 4367 // of the bytes in the char is negative. Adjust the result, check the 4368 // first byte and readjust if needed. 
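// result & ~3 is the index of the first byte of the trailing char; if that
// (lowest-addressed) byte is the negative one we are done, otherwise the
// negative byte is the second one and the index is bumped by one.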
4369 andl(result, 0xfffffffc); 4370 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4371 jccb(Assembler::notZero, DONE); 4372 addptr(result, 1); 4373 4374 // That's it 4375 bind(DONE); 4376 if (UseAVX >= 2 && UseSSE >= 2) { 4377 // clean upper bits of YMM registers 4378 vpxor(vec1, vec1); 4379 vpxor(vec2, vec2); 4380 } 4381 } 4382 4383 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4384 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4385 Register limit, Register result, Register chr, 4386 XMMRegister vec1, XMMRegister vec2, bool is_char, 4387 KRegister mask, bool expand_ary2) { 4388 // for expand_ary2, limit is the (smaller) size of the second array. 4389 ShortBranchVerifier sbv(this); 4390 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4391 4392 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4393 "Expansion only implemented for AVX2"); 4394 4395 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4396 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4397 4398 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4399 int scaleIncr = expand_ary2 ? 8 : 16; 4400 4401 if (is_array_equ) { 4402 // Check the input args 4403 cmpoop(ary1, ary2); 4404 jcc(Assembler::equal, TRUE_LABEL); 4405 4406 // Need additional checks for arrays_equals. 4407 testptr(ary1, ary1); 4408 jcc(Assembler::zero, FALSE_LABEL); 4409 testptr(ary2, ary2); 4410 jcc(Assembler::zero, FALSE_LABEL); 4411 4412 // Check the lengths 4413 movl(limit, Address(ary1, length_offset)); 4414 cmpl(limit, Address(ary2, length_offset)); 4415 jcc(Assembler::notEqual, FALSE_LABEL); 4416 } 4417 4418 // count == 0 4419 testl(limit, limit); 4420 jcc(Assembler::zero, TRUE_LABEL); 4421 4422 if (is_array_equ) { 4423 // Load array address 4424 lea(ary1, Address(ary1, base_offset)); 4425 lea(ary2, Address(ary2, base_offset)); 4426 } 4427 4428 if (is_array_equ && is_char) { 4429 // arrays_equals when used for char[]. 
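// Convert the element count to a byte count (2 bytes per char) so the
// byte-oriented compare code below serves both byte[] and char[].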
4430 shll(limit, 1); // byte count != 0 4431 } 4432 movl(result, limit); // copy 4433 4434 if (UseAVX >= 2) { 4435 // With AVX2, use 32-byte vector compare 4436 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4437 4438 // Compare 32-byte vectors 4439 if (expand_ary2) { 4440 andl(result, 0x0000000f); // tail count (in bytes) 4441 andl(limit, 0xfffffff0); // vector count (in bytes) 4442 jcc(Assembler::zero, COMPARE_TAIL); 4443 } else { 4444 andl(result, 0x0000001f); // tail count (in bytes) 4445 andl(limit, 0xffffffe0); // vector count (in bytes) 4446 jcc(Assembler::zero, COMPARE_TAIL_16); 4447 } 4448 4449 lea(ary1, Address(ary1, limit, scaleFactor)); 4450 lea(ary2, Address(ary2, limit, Address::times_1)); 4451 negptr(limit); 4452 4453 #ifdef _LP64 4454 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4455 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4456 4457 cmpl(limit, -64); 4458 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4459 4460 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4461 4462 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4463 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4464 kortestql(mask, mask); 4465 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4466 addptr(limit, 64); // update since we already compared at this addr 4467 cmpl(limit, -64); 4468 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4469 4470 // At this point we may still need to compare -limit+result bytes. 4471 // We could execute the next two instruction and just continue via non-wide path: 4472 // cmpl(limit, 0); 4473 // jcc(Assembler::equal, COMPARE_TAIL); // true 4474 // But since we stopped at the points ary{1,2}+limit which are 4475 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4476 // (|limit| <= 32 and result < 32), 4477 // we may just compare the last 64 bytes. 
4478 // 4479 addptr(result, -64); // it is safe, bc we just came from this area 4480 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4481 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4482 kortestql(mask, mask); 4483 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4484 4485 jmp(TRUE_LABEL); 4486 4487 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4488 4489 }//if (VM_Version::supports_avx512vlbw()) 4490 #endif //_LP64 4491 bind(COMPARE_WIDE_VECTORS); 4492 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4493 if (expand_ary2) { 4494 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4495 } else { 4496 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4497 } 4498 vpxor(vec1, vec2); 4499 4500 vptest(vec1, vec1); 4501 jcc(Assembler::notZero, FALSE_LABEL); 4502 addptr(limit, scaleIncr * 2); 4503 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4504 4505 testl(result, result); 4506 jcc(Assembler::zero, TRUE_LABEL); 4507 4508 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4509 if (expand_ary2) { 4510 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4511 } else { 4512 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4513 } 4514 vpxor(vec1, vec2); 4515 4516 vptest(vec1, vec1); 4517 jcc(Assembler::notZero, FALSE_LABEL); 4518 jmp(TRUE_LABEL); 4519 4520 bind(COMPARE_TAIL_16); // limit is zero 4521 movl(limit, result); 4522 4523 // Compare 16-byte chunks 4524 andl(result, 0x0000000f); // tail count (in bytes) 4525 andl(limit, 0xfffffff0); // vector count (in bytes) 4526 jcc(Assembler::zero, COMPARE_TAIL); 4527 4528 lea(ary1, Address(ary1, limit, scaleFactor)); 4529 lea(ary2, Address(ary2, limit, Address::times_1)); 4530 negptr(limit); 4531 4532 bind(COMPARE_WIDE_VECTORS_16); 4533 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4534 if (expand_ary2) { 4535 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4536 } else { 4537 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4538 } 4539 pxor(vec1, vec2); 4540 4541 ptest(vec1, vec1); 4542 jcc(Assembler::notZero, FALSE_LABEL); 4543 addptr(limit, scaleIncr); 4544 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4545 4546 bind(COMPARE_TAIL); // limit is zero 4547 movl(limit, result); 4548 // Fallthru to tail compare 4549 } else if (UseSSE42Intrinsics) { 4550 // With SSE4.2, use double quad vector compare 4551 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4552 4553 // Compare 16-byte vectors 4554 andl(result, 0x0000000f); // tail count (in bytes) 4555 andl(limit, 0xfffffff0); // vector count (in bytes) 4556 jcc(Assembler::zero, COMPARE_TAIL); 4557 4558 lea(ary1, Address(ary1, limit, Address::times_1)); 4559 lea(ary2, Address(ary2, limit, Address::times_1)); 4560 negptr(limit); 4561 4562 bind(COMPARE_WIDE_VECTORS); 4563 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4564 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4565 pxor(vec1, vec2); 4566 4567 ptest(vec1, vec1); 4568 jcc(Assembler::notZero, FALSE_LABEL); 4569 addptr(limit, 16); 4570 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4571 4572 testl(result, result); 4573 jcc(Assembler::zero, TRUE_LABEL); 4574 4575 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4576 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4577 pxor(vec1, vec2); 4578 4579 ptest(vec1, vec1); 4580 jccb(Assembler::notZero, FALSE_LABEL); 4581 jmpb(TRUE_LABEL); 4582 4583 bind(COMPARE_TAIL); // limit is zero 4584 
movl(limit, result); 4585 // Fallthru to tail compare 4586 } 4587 4588 // Compare 4-byte vectors 4589 if (expand_ary2) { 4590 testl(result, result); 4591 jccb(Assembler::zero, TRUE_LABEL); 4592 } else { 4593 andl(limit, 0xfffffffc); // vector count (in bytes) 4594 jccb(Assembler::zero, COMPARE_CHAR); 4595 } 4596 4597 lea(ary1, Address(ary1, limit, scaleFactor)); 4598 lea(ary2, Address(ary2, limit, Address::times_1)); 4599 negptr(limit); 4600 4601 bind(COMPARE_VECTORS); 4602 if (expand_ary2) { 4603 // There are no "vector" operations for bytes to shorts 4604 movzbl(chr, Address(ary2, limit, Address::times_1)); 4605 cmpw(Address(ary1, limit, Address::times_2), chr); 4606 jccb(Assembler::notEqual, FALSE_LABEL); 4607 addptr(limit, 1); 4608 jcc(Assembler::notZero, COMPARE_VECTORS); 4609 jmp(TRUE_LABEL); 4610 } else { 4611 movl(chr, Address(ary1, limit, Address::times_1)); 4612 cmpl(chr, Address(ary2, limit, Address::times_1)); 4613 jccb(Assembler::notEqual, FALSE_LABEL); 4614 addptr(limit, 4); 4615 jcc(Assembler::notZero, COMPARE_VECTORS); 4616 } 4617 4618 // Compare trailing char (final 2 bytes), if any 4619 bind(COMPARE_CHAR); 4620 testl(result, 0x2); // tail char 4621 jccb(Assembler::zero, COMPARE_BYTE); 4622 load_unsigned_short(chr, Address(ary1, 0)); 4623 load_unsigned_short(limit, Address(ary2, 0)); 4624 cmpl(chr, limit); 4625 jccb(Assembler::notEqual, FALSE_LABEL); 4626 4627 if (is_array_equ && is_char) { 4628 bind(COMPARE_BYTE); 4629 } else { 4630 lea(ary1, Address(ary1, 2)); 4631 lea(ary2, Address(ary2, 2)); 4632 4633 bind(COMPARE_BYTE); 4634 testl(result, 0x1); // tail byte 4635 jccb(Assembler::zero, TRUE_LABEL); 4636 load_unsigned_byte(chr, Address(ary1, 0)); 4637 load_unsigned_byte(limit, Address(ary2, 0)); 4638 cmpl(chr, limit); 4639 jccb(Assembler::notEqual, FALSE_LABEL); 4640 } 4641 bind(TRUE_LABEL); 4642 movl(result, 1); // return true 4643 jmpb(DONE); 4644 4645 bind(FALSE_LABEL); 4646 xorl(result, result); // return false 4647 4648 // That's it 4649 bind(DONE); 4650 if (UseAVX >= 2) { 4651 // clean upper bits of YMM registers 4652 vpxor(vec1, vec1); 4653 vpxor(vec2, vec2); 4654 } 4655 } 4656 4657 #ifdef _LP64 4658 4659 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4660 #define __ masm. 
4661 Register dst = stub.data<0>(); 4662 XMMRegister src = stub.data<1>(); 4663 address target = stub.data<2>(); 4664 __ bind(stub.entry()); 4665 __ subptr(rsp, 8); 4666 __ movdbl(Address(rsp), src); 4667 __ call(RuntimeAddress(target)); 4668 __ pop(dst); 4669 __ jmp(stub.continuation()); 4670 #undef __ 4671 } 4672 4673 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4674 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4675 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4676 4677 address slowpath_target; 4678 if (dst_bt == T_INT) { 4679 if (src_bt == T_FLOAT) { 4680 cvttss2sil(dst, src); 4681 cmpl(dst, 0x80000000); 4682 slowpath_target = StubRoutines::x86::f2i_fixup(); 4683 } else { 4684 cvttsd2sil(dst, src); 4685 cmpl(dst, 0x80000000); 4686 slowpath_target = StubRoutines::x86::d2i_fixup(); 4687 } 4688 } else { 4689 if (src_bt == T_FLOAT) { 4690 cvttss2siq(dst, src); 4691 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4692 slowpath_target = StubRoutines::x86::f2l_fixup(); 4693 } else { 4694 cvttsd2siq(dst, src); 4695 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4696 slowpath_target = StubRoutines::x86::d2l_fixup(); 4697 } 4698 } 4699 4700 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4701 jcc(Assembler::equal, stub->entry()); 4702 bind(stub->continuation()); 4703 } 4704 4705 #endif // _LP64 4706 4707 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4708 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4709 switch(ideal_opc) { 4710 case Op_LShiftVS: 4711 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4712 case Op_LShiftVI: 4713 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4714 case Op_LShiftVL: 4715 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4716 case Op_RShiftVS: 4717 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4718 case Op_RShiftVI: 4719 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4720 case Op_RShiftVL: 4721 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4722 case Op_URShiftVS: 4723 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4724 case Op_URShiftVI: 4725 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4726 case Op_URShiftVL: 4727 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4728 case Op_RotateRightV: 4729 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4730 case Op_RotateLeftV: 4731 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4732 default: 4733 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4734 break; 4735 } 4736 } 4737 4738 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4739 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4740 if (is_unsigned) { 4741 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4742 } else { 4743 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4744 } 4745 } 4746 4747 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4748 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4749 switch (elem_bt) { 4750 case T_BYTE: 4751 if (ideal_opc 
== Op_SaturatingAddV) { 4752 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4753 } else { 4754 assert(ideal_opc == Op_SaturatingSubV, ""); 4755 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4756 } 4757 break; 4758 case T_SHORT: 4759 if (ideal_opc == Op_SaturatingAddV) { 4760 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4761 } else { 4762 assert(ideal_opc == Op_SaturatingSubV, ""); 4763 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4764 } 4765 break; 4766 default: 4767 fatal("Unsupported type %s", type2name(elem_bt)); 4768 break; 4769 } 4770 } 4771 4772 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4773 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4774 switch (elem_bt) { 4775 case T_BYTE: 4776 if (ideal_opc == Op_SaturatingAddV) { 4777 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4778 } else { 4779 assert(ideal_opc == Op_SaturatingSubV, ""); 4780 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4781 } 4782 break; 4783 case T_SHORT: 4784 if (ideal_opc == Op_SaturatingAddV) { 4785 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4786 } else { 4787 assert(ideal_opc == Op_SaturatingSubV, ""); 4788 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4789 } 4790 break; 4791 default: 4792 fatal("Unsupported type %s", type2name(elem_bt)); 4793 break; 4794 } 4795 } 4796 4797 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4798 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4799 if (is_unsigned) { 4800 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4801 } else { 4802 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4803 } 4804 } 4805 4806 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4807 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4808 switch (elem_bt) { 4809 case T_BYTE: 4810 if (ideal_opc == Op_SaturatingAddV) { 4811 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4812 } else { 4813 assert(ideal_opc == Op_SaturatingSubV, ""); 4814 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4815 } 4816 break; 4817 case T_SHORT: 4818 if (ideal_opc == Op_SaturatingAddV) { 4819 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4820 } else { 4821 assert(ideal_opc == Op_SaturatingSubV, ""); 4822 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4823 } 4824 break; 4825 default: 4826 fatal("Unsupported type %s", type2name(elem_bt)); 4827 break; 4828 } 4829 } 4830 4831 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4832 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4833 switch (elem_bt) { 4834 case T_BYTE: 4835 if (ideal_opc == Op_SaturatingAddV) { 4836 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4837 } else { 4838 assert(ideal_opc == Op_SaturatingSubV, ""); 4839 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4840 } 4841 break; 4842 case T_SHORT: 4843 if (ideal_opc == Op_SaturatingAddV) { 4844 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4845 } else { 4846 assert(ideal_opc == Op_SaturatingSubV, ""); 4847 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4848 } 4849 break; 4850 default: 4851 fatal("Unsupported type %s", type2name(elem_bt)); 4852 break; 4853 } 4854 } 4855 4856 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4857 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4858 bool is_varshift) { 4859 switch (ideal_opc) { 4860 case Op_AddVB: 4861 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_AddVS: 4863 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_AddVI: 4865 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_AddVL: 4867 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_AddVF: 4869 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_AddVD: 4871 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_SubVB: 4873 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4874 case Op_SubVS: 4875 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_SubVI: 4877 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_SubVL: 4879 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_SubVF: 4881 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_SubVD: 4883 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_MulVS: 4885 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_MulVI: 4887 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4888 case Op_MulVL: 4889 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4890 case Op_MulVF: 4891 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4892 case Op_MulVD: 4893 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4894 case Op_DivVF: 4895 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4896 case Op_DivVD: 4897 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4898 case Op_SqrtVF: 4899 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4900 case Op_SqrtVD: 4901 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4902 case Op_AbsVB: 4903 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4904 case Op_AbsVS: 4905 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4906 case Op_AbsVI: 4907 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4908 case Op_AbsVL: 4909 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4910 case Op_FmaVF: 4911 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_FmaVD: 4913 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_VectorRearrange: 4915 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4916 case Op_LShiftVS: 4917 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4918 case Op_LShiftVI: 4919 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4920 case Op_LShiftVL: 4921 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4922 case Op_RShiftVS: 4923 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4924 case Op_RShiftVI: 4925 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4926 case Op_RShiftVL: 4927 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4928 case Op_URShiftVS: 4929 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4930 case Op_URShiftVI: 4931 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4932 case Op_URShiftVL: 4933 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4934 case Op_RotateLeftV: 4935 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4936 case Op_RotateRightV: 4937 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4938 case Op_MaxV: 4939 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4940 case Op_MinV: 4941 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4942 case Op_UMinV: 4943 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4944 case Op_UMaxV: 4945 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4946 case Op_XorV: 4947 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4948 case Op_OrV: 4949 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4950 case Op_AndV: 4951 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4952 default: 4953 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4954 break; 4955 } 4956 } 4957 4958 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4959 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4960 switch (ideal_opc) { 4961 case Op_AddVB: 4962 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_AddVS: 4964 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_AddVI: 4966 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_AddVL: 4968 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4969 case Op_AddVF: 4970 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4971 case Op_AddVD: 4972 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4973 case Op_SubVB: 4974 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4975 case Op_SubVS: 4976 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4977 case Op_SubVI: 4978 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4979 case Op_SubVL: 4980 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4981 case Op_SubVF: 4982 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4983 case Op_SubVD: 4984 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4985 case Op_MulVS: 4986 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4987 case Op_MulVI: 4988 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4989 case Op_MulVL: 4990 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4991 case Op_MulVF: 4992 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4993 case Op_MulVD: 4994 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4995 case Op_DivVF: 4996 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4997 case Op_DivVD: 4998 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4999 case Op_FmaVF: 5000 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 5001 case Op_FmaVD: 5002 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 5003 case Op_MaxV: 5004 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5005 case Op_MinV: 5006 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5007 case Op_UMaxV: 5008 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5009 case Op_UMinV: 5010 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5011 case Op_XorV: 5012 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5013 case Op_OrV: 5014 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5015 case Op_AndV: 5016 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5017 default: 5018 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5019 break; 5020 } 5021 } 5022 5023 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5024 KRegister src1, KRegister src2) { 5025 BasicType etype = T_ILLEGAL; 5026 switch(mask_len) { 5027 case 2: 5028 case 4: 5029 case 8: etype = T_BYTE; 
break; 5030 case 16: etype = T_SHORT; break; 5031 case 32: etype = T_INT; break; 5032 case 64: etype = T_LONG; break; 5033 default: fatal("Unsupported type"); break; 5034 } 5035 assert(etype != T_ILLEGAL, ""); 5036 switch(ideal_opc) { 5037 case Op_AndVMask: 5038 kand(etype, dst, src1, src2); break; 5039 case Op_OrVMask: 5040 kor(etype, dst, src1, src2); break; 5041 case Op_XorVMask: 5042 kxor(etype, dst, src1, src2); break; 5043 default: 5044 fatal("Unsupported masked operation"); break; 5045 } 5046 } 5047 5048 /* 5049 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5050 * If src is NaN, the result is 0. 5051 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5052 * the result is equal to the value of Integer.MIN_VALUE. 5053 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5054 * the result is equal to the value of Integer.MAX_VALUE. 5055 */ 5056 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5057 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5058 Register rscratch, AddressLiteral float_sign_flip, 5059 int vec_enc) { 5060 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5061 Label done; 5062 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5063 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5064 vptest(xtmp2, xtmp2, vec_enc); 5065 jccb(Assembler::equal, done); 5066 5067 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5068 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5069 5070 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5071 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5072 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5073 5074 // Recompute the mask for remaining special value. 5075 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5076 // Extract SRC values corresponding to TRUE mask lanes. 5077 vpand(xtmp4, xtmp2, src, vec_enc); 5078 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5079 // values are set. 
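  // (The vpxor below computes xtmp3 = xtmp2 ^ (xtmp2 & src), i.e. xtmp2 & ~src: within the
  // remaining special lanes the MSB is therefore set exactly when the source sign bit is
  // clear, and the final vblendvps keys on that MSB to substitute max_int.)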
5080 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5081 5082 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5083 bind(done); 5084 } 5085 5086 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5087 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5088 Register rscratch, AddressLiteral float_sign_flip, 5089 int vec_enc) { 5090 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5091 Label done; 5092 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5093 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5094 kortestwl(ktmp1, ktmp1); 5095 jccb(Assembler::equal, done); 5096 5097 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5098 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5099 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5100 5101 kxorwl(ktmp1, ktmp1, ktmp2); 5102 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5103 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5104 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5105 bind(done); 5106 } 5107 5108 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5109 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5110 Register rscratch, AddressLiteral double_sign_flip, 5111 int vec_enc) { 5112 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5113 5114 Label done; 5115 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5116 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5117 kortestwl(ktmp1, ktmp1); 5118 jccb(Assembler::equal, done); 5119 5120 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5121 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5122 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5123 5124 kxorwl(ktmp1, ktmp1, ktmp2); 5125 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5126 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5127 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5128 bind(done); 5129 } 5130 5131 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5132 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5133 Register rscratch, AddressLiteral float_sign_flip, 5134 int vec_enc) { 5135 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5136 Label done; 5137 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5138 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5139 kortestwl(ktmp1, ktmp1); 5140 jccb(Assembler::equal, done); 5141 5142 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5143 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5144 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5145 5146 kxorwl(ktmp1, ktmp1, ktmp2); 5147 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5148 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5149 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5150 bind(done); 5151 } 5152 5153 /* 5154 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5155 * If src is NaN, the result is 0. 5156 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5157 * the result is equal to the value of Long.MIN_VALUE. 5158 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5159 * the result is equal to the value of Long.MAX_VALUE. 
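 *
 * A scalar sketch of the clamping being reproduced (illustrative only, not used by the
 * compiler; the helper name is local to this comment):
 *
 *   jlong d2l_reference(jdouble src) {
 *     if (src != src)                 return 0;          // NaN
 *     if (src <= (jdouble)min_jlong)  return min_jlong;  // -Inf and values <= Long.MIN_VALUE
 *     if (src >= (jdouble)max_jlong)  return max_jlong;  // +Inf and values >= Long.MAX_VALUE
 *     return (jlong)src;
 *   }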
5160  */
5161 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5162                                                                        XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5163                                                                        Register rscratch, AddressLiteral double_sign_flip,
5164                                                                        int vec_enc) {
5165   assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5166
5167   Label done;
5168   evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5169   evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5170   kortestwl(ktmp1, ktmp1);
5171   jccb(Assembler::equal, done);
5172
5173   vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5174   evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5175   evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5176
5177   kxorwl(ktmp1, ktmp1, ktmp2);
5178   evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5179   vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5180   evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5181   bind(done);
5182 }
5183
5184 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5185                                                              XMMRegister xtmp, int index, int vec_enc) {
5186   assert(vec_enc < Assembler::AVX_512bit, "");
5187   if (vec_enc == Assembler::AVX_256bit) {
5188     vextractf128_high(xtmp, src);
5189     vshufps(dst, src, xtmp, index, vec_enc);
5190   } else {
5191     vshufps(dst, src, zero, index, vec_enc);
5192   }
5193 }
5194
5195 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5196                                                                     XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5197                                                                     AddressLiteral float_sign_flip, int src_vec_enc) {
5198   assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5199
5200   Label done;
5201   // Compare the destination lanes with the float_sign_flip
5202   // value to get a mask for all special values.
5203   movdqu(xtmp1, float_sign_flip, rscratch);
5204   vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5205   ptest(xtmp2, xtmp2);
5206   jccb(Assembler::equal, done);
5207
5208   // Flip float_sign_flip to get the max integer value.
5209   vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5210   pxor(xtmp1, xtmp4);
5211
5212   // Set destination lanes corresponding to unordered source lanes to zero.
5213   vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5214   vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5215
5216   // Shuffle the mask vector and pack the lower double word from each quadword lane.
5217   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5218   vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5219
5220   // Recompute the mask for the remaining special values.
5221   pxor(xtmp2, xtmp3);
5222   // Extract the mask corresponding to non-negative source lanes.
5223   vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5224
5225   // Shuffle the mask vector and pack the lower double word from each quadword lane.
5226   vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5227   pand(xtmp3, xtmp2);
5228
5229   // Replace destination lanes holding the special value (0x80000000) with max int
5230   // if the corresponding source lane holds a +ve value.
5231 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5232 bind(done); 5233 } 5234 5235 5236 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5237 XMMRegister xtmp, Register rscratch, int vec_enc) { 5238 switch(to_elem_bt) { 5239 case T_SHORT: 5240 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5241 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5242 vpackusdw(dst, dst, zero, vec_enc); 5243 if (vec_enc == Assembler::AVX_256bit) { 5244 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5245 } 5246 break; 5247 case T_BYTE: 5248 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5249 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5250 vpackusdw(dst, dst, zero, vec_enc); 5251 if (vec_enc == Assembler::AVX_256bit) { 5252 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5253 } 5254 vpackuswb(dst, dst, zero, vec_enc); 5255 break; 5256 default: assert(false, "%s", type2name(to_elem_bt)); 5257 } 5258 } 5259 5260 /* 5261 * Algorithm for vector D2L and F2I conversions:- 5262 * a) Perform vector D2L/F2I cast. 5263 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5264 * It signifies that source value could be any of the special floating point 5265 * values(NaN,-Inf,Inf,Max,-Min). 5266 * c) Set destination to zero if source is NaN value. 5267 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5268 */ 5269 5270 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5271 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5272 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5273 int to_elem_sz = type2aelembytes(to_elem_bt); 5274 assert(to_elem_sz <= 4, ""); 5275 vcvttps2dq(dst, src, vec_enc); 5276 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5277 if (to_elem_sz < 4) { 5278 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5279 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5280 } 5281 } 5282 5283 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5284 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5285 Register rscratch, int vec_enc) { 5286 int to_elem_sz = type2aelembytes(to_elem_bt); 5287 assert(to_elem_sz <= 4, ""); 5288 vcvttps2dq(dst, src, vec_enc); 5289 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5290 switch(to_elem_bt) { 5291 case T_INT: 5292 break; 5293 case T_SHORT: 5294 evpmovdw(dst, dst, vec_enc); 5295 break; 5296 case T_BYTE: 5297 evpmovdb(dst, dst, vec_enc); 5298 break; 5299 default: assert(false, "%s", type2name(to_elem_bt)); 5300 } 5301 } 5302 5303 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5304 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5305 Register rscratch, int vec_enc) { 5306 evcvttps2qq(dst, src, vec_enc); 5307 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5308 } 5309 5310 // Handling for downcasting from double to integer or sub-word types on AVX2. 5311 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5312 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5313 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5314 int to_elem_sz = type2aelembytes(to_elem_bt); 5315 assert(to_elem_sz < 8, ""); 5316 vcvttpd2dq(dst, src, vec_enc); 5317 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5318 float_sign_flip, vec_enc); 5319 if (to_elem_sz < 4) { 5320 // xtmp4 holds all zero lanes. 5321 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5322 } 5323 } 5324 5325 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5326 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5327 KRegister ktmp2, AddressLiteral sign_flip, 5328 Register rscratch, int vec_enc) { 5329 if (VM_Version::supports_avx512dq()) { 5330 evcvttpd2qq(dst, src, vec_enc); 5331 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5332 switch(to_elem_bt) { 5333 case T_LONG: 5334 break; 5335 case T_INT: 5336 evpmovsqd(dst, dst, vec_enc); 5337 break; 5338 case T_SHORT: 5339 evpmovsqd(dst, dst, vec_enc); 5340 evpmovdw(dst, dst, vec_enc); 5341 break; 5342 case T_BYTE: 5343 evpmovsqd(dst, dst, vec_enc); 5344 evpmovdb(dst, dst, vec_enc); 5345 break; 5346 default: assert(false, "%s", type2name(to_elem_bt)); 5347 } 5348 } else { 5349 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5350 vcvttpd2dq(dst, src, vec_enc); 5351 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5352 switch(to_elem_bt) { 5353 case T_INT: 5354 break; 5355 case T_SHORT: 5356 evpmovdw(dst, dst, vec_enc); 5357 break; 5358 case T_BYTE: 5359 evpmovdb(dst, dst, vec_enc); 5360 break; 5361 default: assert(false, "%s", type2name(to_elem_bt)); 5362 } 5363 } 5364 } 5365 5366 #ifdef _LP64 5367 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5368 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5369 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5370 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5371 // and re-instantiate original MXCSR.RC mode after that. 5372 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5373 5374 mov64(tmp, julong_cast(0.5L)); 5375 evpbroadcastq(xtmp1, tmp, vec_enc); 5376 vaddpd(xtmp1, src , xtmp1, vec_enc); 5377 evcvtpd2qq(dst, xtmp1, vec_enc); 5378 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5379 double_sign_flip, vec_enc);; 5380 5381 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5382 } 5383 5384 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5385 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5386 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5387 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5388 // and re-instantiate original MXCSR.RC mode after that. 
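  // Under round-towards-negative-infinity the conversion itself performs the floor, so
  // cvt(val + 0.5) gives Math.round's round-half-up behaviour, e.g. 2.5f -> 3 and
  // -2.5f -> -2 (it converts -2.0f after the 0.5f is added).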
5389 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5390 5391 movl(tmp, jint_cast(0.5)); 5392 movq(xtmp1, tmp); 5393 vbroadcastss(xtmp1, xtmp1, vec_enc); 5394 vaddps(xtmp1, src , xtmp1, vec_enc); 5395 vcvtps2dq(dst, xtmp1, vec_enc); 5396 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5397 float_sign_flip, vec_enc); 5398 5399 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5400 } 5401 5402 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5403 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5404 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5405 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5406 // and re-instantiate original MXCSR.RC mode after that. 5407 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5408 5409 movl(tmp, jint_cast(0.5)); 5410 movq(xtmp1, tmp); 5411 vbroadcastss(xtmp1, xtmp1, vec_enc); 5412 vaddps(xtmp1, src , xtmp1, vec_enc); 5413 vcvtps2dq(dst, xtmp1, vec_enc); 5414 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5415 5416 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5417 } 5418 #endif // _LP64 5419 5420 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5421 BasicType from_elem_bt, BasicType to_elem_bt) { 5422 switch (from_elem_bt) { 5423 case T_BYTE: 5424 switch (to_elem_bt) { 5425 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5426 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5427 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5428 default: ShouldNotReachHere(); 5429 } 5430 break; 5431 case T_SHORT: 5432 switch (to_elem_bt) { 5433 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5434 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5435 default: ShouldNotReachHere(); 5436 } 5437 break; 5438 case T_INT: 5439 assert(to_elem_bt == T_LONG, ""); 5440 vpmovzxdq(dst, src, vlen_enc); 5441 break; 5442 default: 5443 ShouldNotReachHere(); 5444 } 5445 } 5446 5447 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5448 BasicType from_elem_bt, BasicType to_elem_bt) { 5449 switch (from_elem_bt) { 5450 case T_BYTE: 5451 switch (to_elem_bt) { 5452 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5453 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5454 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5455 default: ShouldNotReachHere(); 5456 } 5457 break; 5458 case T_SHORT: 5459 switch (to_elem_bt) { 5460 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5461 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5462 default: ShouldNotReachHere(); 5463 } 5464 break; 5465 case T_INT: 5466 assert(to_elem_bt == T_LONG, ""); 5467 vpmovsxdq(dst, src, vlen_enc); 5468 break; 5469 default: 5470 ShouldNotReachHere(); 5471 } 5472 } 5473 5474 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5475 BasicType dst_bt, BasicType src_bt, int vlen) { 5476 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5477 assert(vlen_enc != AVX_512bit, ""); 5478 5479 int dst_bt_size = type2aelembytes(dst_bt); 5480 int src_bt_size = type2aelembytes(src_bt); 5481 if (dst_bt_size > src_bt_size) { 5482 switch (dst_bt_size / src_bt_size) { 5483 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5484 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5485 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5486 default: ShouldNotReachHere(); 5487 } 5488 } else { 5489 assert(dst_bt_size < src_bt_size, ""); 5490 switch (src_bt_size / dst_bt_size) { 5491 case 2: { 5492 if (vlen_enc == AVX_128bit) { 5493 vpacksswb(dst, src, src, vlen_enc); 5494 } else { 5495 vpacksswb(dst, src, src, vlen_enc); 5496 vpermq(dst, dst, 0x08, vlen_enc); 5497 } 5498 break; 5499 } 5500 case 4: { 5501 if (vlen_enc == AVX_128bit) { 5502 vpackssdw(dst, src, src, vlen_enc); 5503 vpacksswb(dst, dst, dst, vlen_enc); 5504 } else { 5505 vpackssdw(dst, src, src, vlen_enc); 5506 vpermq(dst, dst, 0x08, vlen_enc); 5507 vpacksswb(dst, dst, dst, AVX_128bit); 5508 } 5509 break; 5510 } 5511 case 8: { 5512 if (vlen_enc == AVX_128bit) { 5513 vpshufd(dst, src, 0x08, vlen_enc); 5514 vpackssdw(dst, dst, dst, vlen_enc); 5515 vpacksswb(dst, dst, dst, vlen_enc); 5516 } else { 5517 vpshufd(dst, src, 0x08, vlen_enc); 5518 vpermq(dst, dst, 0x08, vlen_enc); 5519 vpackssdw(dst, dst, dst, AVX_128bit); 5520 vpacksswb(dst, dst, dst, AVX_128bit); 5521 } 5522 break; 5523 } 5524 default: ShouldNotReachHere(); 5525 } 5526 } 5527 } 5528 5529 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5530 bool merge, BasicType bt, int vlen_enc) { 5531 if (bt == T_INT) { 5532 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5533 } else { 5534 assert(bt == T_LONG, ""); 5535 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5536 } 5537 } 5538 5539 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5540 bool merge, BasicType bt, int vlen_enc) { 5541 if (bt == T_INT) { 5542 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5543 } else { 5544 assert(bt == T_LONG, ""); 5545 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5546 } 5547 } 5548 5549 #ifdef _LP64 5550 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5551 Register rtmp2, XMMRegister xtmp, int mask_len, 5552 int vec_enc) { 5553 int index = 0; 5554 int vindex = 0; 5555 mov64(rtmp1, 0x0101010101010101L); 5556 pdepq(rtmp1, src, rtmp1); 5557 if (mask_len > 8) { 5558 movq(rtmp2, src); 5559 vpxor(xtmp, xtmp, xtmp, vec_enc); 5560 movq(xtmp, rtmp1); 5561 } 5562 movq(dst, rtmp1); 5563 5564 mask_len -= 8; 5565 while (mask_len > 0) { 5566 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5567 index++; 5568 if ((index % 2) == 0) { 5569 pxor(xtmp, xtmp); 5570 } 5571 mov64(rtmp1, 0x0101010101010101L); 5572 shrq(rtmp2, 8); 5573 pdepq(rtmp1, rtmp2, rtmp1); 5574 pinsrq(xtmp, rtmp1, index % 2); 5575 vindex = index / 2; 5576 if (vindex) { 5577 // Write entire 16 byte vector when both 64 bit 5578 // lanes are update to save redundant instructions. 
5579 if (index % 2) { 5580 vinsertf128(dst, dst, xtmp, vindex); 5581 } 5582 } else { 5583 vmovdqu(dst, xtmp); 5584 } 5585 mask_len -= 8; 5586 } 5587 } 5588 5589 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5590 switch(opc) { 5591 case Op_VectorMaskTrueCount: 5592 popcntq(dst, tmp); 5593 break; 5594 case Op_VectorMaskLastTrue: 5595 if (VM_Version::supports_lzcnt()) { 5596 lzcntq(tmp, tmp); 5597 movl(dst, 63); 5598 subl(dst, tmp); 5599 } else { 5600 movl(dst, -1); 5601 bsrq(tmp, tmp); 5602 cmov32(Assembler::notZero, dst, tmp); 5603 } 5604 break; 5605 case Op_VectorMaskFirstTrue: 5606 if (VM_Version::supports_bmi1()) { 5607 if (masklen < 32) { 5608 orl(tmp, 1 << masklen); 5609 tzcntl(dst, tmp); 5610 } else if (masklen == 32) { 5611 tzcntl(dst, tmp); 5612 } else { 5613 assert(masklen == 64, ""); 5614 tzcntq(dst, tmp); 5615 } 5616 } else { 5617 if (masklen < 32) { 5618 orl(tmp, 1 << masklen); 5619 bsfl(dst, tmp); 5620 } else { 5621 assert(masklen == 32 || masklen == 64, ""); 5622 movl(dst, masklen); 5623 if (masklen == 32) { 5624 bsfl(tmp, tmp); 5625 } else { 5626 bsfq(tmp, tmp); 5627 } 5628 cmov32(Assembler::notZero, dst, tmp); 5629 } 5630 } 5631 break; 5632 case Op_VectorMaskToLong: 5633 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5634 break; 5635 default: assert(false, "Unhandled mask operation"); 5636 } 5637 } 5638 5639 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5640 int masklen, int masksize, int vec_enc) { 5641 assert(VM_Version::supports_popcnt(), ""); 5642 5643 if(VM_Version::supports_avx512bw()) { 5644 kmovql(tmp, mask); 5645 } else { 5646 assert(masklen <= 16, ""); 5647 kmovwl(tmp, mask); 5648 } 5649 5650 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5651 // operations needs to be clipped. 5652 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5653 andq(tmp, (1 << masklen) - 1); 5654 } 5655 5656 vector_mask_operation_helper(opc, dst, tmp, masklen); 5657 } 5658 5659 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5660 Register tmp, int masklen, BasicType bt, int vec_enc) { 5661 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5662 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5663 assert(VM_Version::supports_popcnt(), ""); 5664 5665 bool need_clip = false; 5666 switch(bt) { 5667 case T_BOOLEAN: 5668 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5669 vpxor(xtmp, xtmp, xtmp, vec_enc); 5670 vpsubb(xtmp, xtmp, mask, vec_enc); 5671 vpmovmskb(tmp, xtmp, vec_enc); 5672 need_clip = masklen < 16; 5673 break; 5674 case T_BYTE: 5675 vpmovmskb(tmp, mask, vec_enc); 5676 need_clip = masklen < 16; 5677 break; 5678 case T_SHORT: 5679 vpacksswb(xtmp, mask, mask, vec_enc); 5680 if (masklen >= 16) { 5681 vpermpd(xtmp, xtmp, 8, vec_enc); 5682 } 5683 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5684 need_clip = masklen < 16; 5685 break; 5686 case T_INT: 5687 case T_FLOAT: 5688 vmovmskps(tmp, mask, vec_enc); 5689 need_clip = masklen < 4; 5690 break; 5691 case T_LONG: 5692 case T_DOUBLE: 5693 vmovmskpd(tmp, mask, vec_enc); 5694 need_clip = masklen < 2; 5695 break; 5696 default: assert(false, "Unhandled type, %s", type2name(bt)); 5697 } 5698 5699 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5700 // operations needs to be clipped. 
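  // For example, with bt == T_BYTE and masklen == 8 in a 128-bit register, vpmovmskb above
  // still produces 16 bits, so the andq below with (1 << 8) - 1 == 0xFF discards the upper
  // 8 undefined bits before they reach vector_mask_operation_helper.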
5701 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5702 // need_clip implies masklen < 32 5703 andq(tmp, (1 << masklen) - 1); 5704 } 5705 5706 vector_mask_operation_helper(opc, dst, tmp, masklen); 5707 } 5708 5709 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5710 Register rtmp2, int mask_len) { 5711 kmov(rtmp1, src); 5712 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5713 mov64(rtmp2, -1L); 5714 pextq(rtmp2, rtmp2, rtmp1); 5715 kmov(dst, rtmp2); 5716 } 5717 5718 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5719 XMMRegister mask, Register rtmp, Register rscratch, 5720 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5721 int vec_enc) { 5722 assert(type2aelembytes(bt) >= 4, ""); 5723 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5724 address compress_perm_table = nullptr; 5725 address expand_perm_table = nullptr; 5726 if (type2aelembytes(bt) == 8) { 5727 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5728 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5729 vmovmskpd(rtmp, mask, vec_enc); 5730 } else { 5731 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5732 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5733 vmovmskps(rtmp, mask, vec_enc); 5734 } 5735 shlq(rtmp, 5); // for 32 byte permute row. 5736 if (opcode == Op_CompressV) { 5737 lea(rscratch, ExternalAddress(compress_perm_table)); 5738 } else { 5739 lea(rscratch, ExternalAddress(expand_perm_table)); 5740 } 5741 addptr(rtmp, rscratch); 5742 vmovdqu(permv, Address(rtmp)); 5743 vpermps(dst, permv, src, Assembler::AVX_256bit); 5744 vpxor(xtmp, xtmp, xtmp, vec_enc); 5745 // Blend the result with zero vector using permute mask, each column entry 5746 // in a permute table row contains either a valid permute index or a -1 (default) 5747 // value, this can potentially be used as a blending mask after 5748 // compressing/expanding the source vector lanes. 
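  // Illustrative 8 x 32-bit compress example (the exact table layout is an assumption,
  // shown only to explain the idea): for mask bits 0b00100110 a row could hold the permute
  // indices { 1, 2, 5, -1, -1, -1, -1, -1 }. vpermps then gathers source lanes 1, 2 and 5
  // into the low positions, and since the unused entries are -1 (MSB set) the vblendvps
  // below zeroes exactly those positions.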
5749 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5750 } 5751 5752 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5753 bool merge, BasicType bt, int vec_enc) { 5754 if (opcode == Op_CompressV) { 5755 switch(bt) { 5756 case T_BYTE: 5757 evpcompressb(dst, mask, src, merge, vec_enc); 5758 break; 5759 case T_CHAR: 5760 case T_SHORT: 5761 evpcompressw(dst, mask, src, merge, vec_enc); 5762 break; 5763 case T_INT: 5764 evpcompressd(dst, mask, src, merge, vec_enc); 5765 break; 5766 case T_FLOAT: 5767 evcompressps(dst, mask, src, merge, vec_enc); 5768 break; 5769 case T_LONG: 5770 evpcompressq(dst, mask, src, merge, vec_enc); 5771 break; 5772 case T_DOUBLE: 5773 evcompresspd(dst, mask, src, merge, vec_enc); 5774 break; 5775 default: 5776 fatal("Unsupported type %s", type2name(bt)); 5777 break; 5778 } 5779 } else { 5780 assert(opcode == Op_ExpandV, ""); 5781 switch(bt) { 5782 case T_BYTE: 5783 evpexpandb(dst, mask, src, merge, vec_enc); 5784 break; 5785 case T_CHAR: 5786 case T_SHORT: 5787 evpexpandw(dst, mask, src, merge, vec_enc); 5788 break; 5789 case T_INT: 5790 evpexpandd(dst, mask, src, merge, vec_enc); 5791 break; 5792 case T_FLOAT: 5793 evexpandps(dst, mask, src, merge, vec_enc); 5794 break; 5795 case T_LONG: 5796 evpexpandq(dst, mask, src, merge, vec_enc); 5797 break; 5798 case T_DOUBLE: 5799 evexpandpd(dst, mask, src, merge, vec_enc); 5800 break; 5801 default: 5802 fatal("Unsupported type %s", type2name(bt)); 5803 break; 5804 } 5805 } 5806 } 5807 #endif 5808 5809 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5810 KRegister ktmp1, int vec_enc) { 5811 if (opcode == Op_SignumVD) { 5812 vsubpd(dst, zero, one, vec_enc); 5813 // if src < 0 ? -1 : 1 5814 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5815 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5816 // if src == NaN, -0.0 or 0.0 return src. 5817 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5818 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5819 } else { 5820 assert(opcode == Op_SignumVF, ""); 5821 vsubps(dst, zero, one, vec_enc); 5822 // if src < 0 ? -1 : 1 5823 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5824 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5825 // if src == NaN, -0.0 or 0.0 return src. 5826 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5827 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5828 } 5829 } 5830 5831 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5832 XMMRegister xtmp1, int vec_enc) { 5833 if (opcode == Op_SignumVD) { 5834 vsubpd(dst, zero, one, vec_enc); 5835 // if src < 0 ? -1 : 1 5836 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5837 // if src == NaN, -0.0 or 0.0 return src. 5838 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5839 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5840 } else { 5841 assert(opcode == Op_SignumVF, ""); 5842 vsubps(dst, zero, one, vec_enc); 5843 // if src < 0 ? -1 : 1 5844 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5845 // if src == NaN, -0.0 or 0.0 return src. 
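    // (EQ_UQ is an unordered, quiet compare, so the vcmpps below flags the +0.0, -0.0 and
    // NaN lanes -- exactly the inputs Math.signum returns unchanged.)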
5846     vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5847     vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5848   }
5849 }
5850
5851 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5852   if (VM_Version::supports_avx512bw()) {
5853     if (mask_len > 32) {
5854       kmovql(dst, src);
5855     } else {
5856       kmovdl(dst, src);
5857       if (mask_len != 32) {
5858         kshiftrdl(dst, dst, 32 - mask_len);
5859       }
5860     }
5861   } else {
5862     assert(mask_len <= 16, "");
5863     kmovwl(dst, src);
5864     if (mask_len != 16) {
5865       kshiftrwl(dst, dst, 16 - mask_len);
5866     }
5867   }
5868 }
5869
5870 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5871   int lane_size = type2aelembytes(bt);
5872   bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5873   if ((is_LP64 || lane_size < 8) &&
5874       ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5875        (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5876     movptr(rtmp, imm32);
5877     switch(lane_size) {
5878       case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5879       case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5880       case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5881       case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5882       default : fatal("Unsupported lane size %d", lane_size);
5883                 break;
5884     }
5885   } else {
5886     movptr(rtmp, imm32);
5887     LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5888     switch(lane_size) {
5889       case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5890       case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5891       case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5892       case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5893       default : fatal("Unsupported lane size %d", lane_size);
5894                 break;
5895     }
5896   }
5897 }
5898
5899 //
5900 // Following is the lookup table based popcount computation algorithm:-
5901 //           Index   Bit set count
5902 //         [ 0000 -> 0,
5903 //           0001 -> 1,
5904 //           0010 -> 1,
5905 //           0011 -> 2,
5906 //           0100 -> 1,
5907 //           0101 -> 2,
5908 //           0110 -> 2,
5909 //           0111 -> 3,
5910 //           1000 -> 1,
5911 //           1001 -> 2,
5912 //           1010 -> 2,
5913 //           1011 -> 3,
5914 //           1100 -> 2,
5915 //           1101 -> 3,
5916 //           1110 -> 3, 1111 -> 4 ]
5917 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
5918 //    shuffle indices for lookup table access.
5919 // b. Right shift each byte of the vector lane by 4 positions.
5920 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5921 //    shuffle indices for lookup table access.
5922 // d. Add the bitset counts of the upper and lower 4 bits of each byte.
5923 // e. Unpack double words to quad words and compute the sum of absolute differences of the
5924 //    bitset counts of all the bytes of a quadword.
5925 // f. Perform step e. for the upper 128 bit vector lane.
5926 // g. Pack the bitset count of quadwords back to double word.
5927 // h. Unpacking and packing operations are not needed for 64 bit vector lanes.
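//
// Scalar sketch of steps a.-d. for a single byte (illustrative only, not used by the
// compiler; the vector code below performs the same two lookups with vpshufb):
//
//   static const uint8_t popcount_lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//   int popcount_byte(uint8_t b) {
//     return popcount_lut[b & 0x0F] + popcount_lut[b >> 4];
//   }
//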
5928 5929 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5930 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5931 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5932 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5933 vpsrlw(dst, src, 4, vec_enc); 5934 vpand(dst, dst, xtmp1, vec_enc); 5935 vpand(xtmp1, src, xtmp1, vec_enc); 5936 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5937 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5938 vpshufb(dst, xtmp2, dst, vec_enc); 5939 vpaddb(dst, dst, xtmp1, vec_enc); 5940 } 5941 5942 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5943 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5944 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5945 // Following code is as per steps e,f,g and h of above algorithm. 5946 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5947 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5948 vpsadbw(dst, dst, xtmp2, vec_enc); 5949 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5950 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5951 vpackuswb(dst, xtmp1, dst, vec_enc); 5952 } 5953 5954 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5955 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5956 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5957 // Add the popcount of upper and lower bytes of word. 5958 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5959 vpsrlw(dst, xtmp1, 8, vec_enc); 5960 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5961 vpaddw(dst, dst, xtmp1, vec_enc); 5962 } 5963 5964 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5965 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5966 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5967 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5968 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5969 } 5970 5971 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5972 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5973 switch(bt) { 5974 case T_LONG: 5975 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5976 break; 5977 case T_INT: 5978 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5979 break; 5980 case T_CHAR: 5981 case T_SHORT: 5982 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5983 break; 5984 case T_BYTE: 5985 case T_BOOLEAN: 5986 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5987 break; 5988 default: 5989 fatal("Unsupported type %s", type2name(bt)); 5990 break; 5991 } 5992 } 5993 5994 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5995 KRegister mask, bool merge, int vec_enc) { 5996 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5997 switch(bt) { 5998 case T_LONG: 5999 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6000 evpopcntq(dst, mask, src, merge, vec_enc); 6001 break; 6002 case T_INT: 6003 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6004 evpopcntd(dst, mask, src, merge, vec_enc); 6005 break; 6006 case T_CHAR: 6007 case T_SHORT: 6008 assert(VM_Version::supports_avx512_bitalg(), ""); 6009 evpopcntw(dst, mask, src, merge, vec_enc); 6010 break; 6011 case T_BYTE: 6012 case T_BOOLEAN: 6013 assert(VM_Version::supports_avx512_bitalg(), ""); 6014 evpopcntb(dst, mask, 
src, merge, vec_enc); 6015 break; 6016 default: 6017 fatal("Unsupported type %s", type2name(bt)); 6018 break; 6019 } 6020 } 6021 6022 #ifndef _LP64 6023 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 6024 assert(VM_Version::supports_avx512bw(), ""); 6025 kmovdl(tmp, src); 6026 kunpckdql(dst, tmp, tmp); 6027 } 6028 #endif 6029 6030 // Bit reversal algorithm first reverses the bits of each byte followed by 6031 // a byte level reversal for multi-byte primitive types (short/int/long). 6032 // Algorithm performs a lookup table access to get reverse bit sequence 6033 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6034 // is obtained by swapping the reverse bit sequences of upper and lower 6035 // nibble of a byte. 6036 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6037 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6038 if (VM_Version::supports_avx512vlbw()) { 6039 6040 // Get the reverse bit sequence of lower nibble of each byte. 6041 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6042 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6043 evpandq(dst, xtmp2, src, vec_enc); 6044 vpshufb(dst, xtmp1, dst, vec_enc); 6045 vpsllq(dst, dst, 4, vec_enc); 6046 6047 // Get the reverse bit sequence of upper nibble of each byte. 6048 vpandn(xtmp2, xtmp2, src, vec_enc); 6049 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6050 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6051 6052 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6053 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6054 evporq(xtmp2, dst, xtmp2, vec_enc); 6055 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6056 6057 } else if(vec_enc == Assembler::AVX_512bit) { 6058 // Shift based bit reversal. 6059 assert(bt == T_LONG || bt == T_INT, ""); 6060 6061 // Swap lower and upper nibble of each byte. 6062 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6063 6064 // Swap two least and most significant bits of each nibble. 6065 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6066 6067 // Swap adjacent pair of bits. 6068 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6069 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6070 6071 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6072 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6073 } else { 6074 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6075 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6076 6077 // Get the reverse bit sequence of lower nibble of each byte. 6078 vpand(dst, xtmp2, src, vec_enc); 6079 vpshufb(dst, xtmp1, dst, vec_enc); 6080 vpsllq(dst, dst, 4, vec_enc); 6081 6082 // Get the reverse bit sequence of upper nibble of each byte. 6083 vpandn(xtmp2, xtmp2, src, vec_enc); 6084 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6085 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6086 6087 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6088 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
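    // Scalar equivalent for one byte (illustrative only):
    //   static const uint8_t nibble_rev_lut[16] = { 0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
    //                                               0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF };
    //   uint8_t reverse_bits_byte(uint8_t b) {
    //     return (nibble_rev_lut[b & 0x0F] << 4) | nibble_rev_lut[b >> 4];
    //   }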
6089 vpor(xtmp2, dst, xtmp2, vec_enc); 6090 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6091 } 6092 } 6093 6094 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6095 XMMRegister xtmp, Register rscratch) { 6096 assert(VM_Version::supports_gfni(), ""); 6097 assert(rscratch != noreg || always_reachable(mask), "missing"); 6098 6099 // Galois field instruction based bit reversal based on following algorithm. 6100 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6101 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6102 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6103 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6104 } 6105 6106 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6107 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6108 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6109 evpandq(dst, xtmp1, src, vec_enc); 6110 vpsllq(dst, dst, nbits, vec_enc); 6111 vpandn(xtmp1, xtmp1, src, vec_enc); 6112 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6113 evporq(dst, dst, xtmp1, vec_enc); 6114 } 6115 6116 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6117 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6118 // Shift based bit reversal. 6119 assert(VM_Version::supports_evex(), ""); 6120 switch(bt) { 6121 case T_LONG: 6122 // Swap upper and lower double word of each quad word. 6123 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6124 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6125 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6126 break; 6127 case T_INT: 6128 // Swap upper and lower word of each double word. 6129 evprord(xtmp1, k0, src, 16, true, vec_enc); 6130 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6131 break; 6132 case T_CHAR: 6133 case T_SHORT: 6134 // Swap upper and lower byte of each word. 6135 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6136 break; 6137 case T_BYTE: 6138 evmovdquq(dst, k0, src, true, vec_enc); 6139 break; 6140 default: 6141 fatal("Unsupported type %s", type2name(bt)); 6142 break; 6143 } 6144 } 6145 6146 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6147 if (bt == T_BYTE) { 6148 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6149 evmovdquq(dst, k0, src, true, vec_enc); 6150 } else { 6151 vmovdqu(dst, src); 6152 } 6153 return; 6154 } 6155 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6156 // pre-computed shuffle indices. 
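  // Scalar equivalent for a single int lane (illustrative only); the pre-computed
  // permutation masks below express the same byte swap as vpshufb indices:
  //   uint32_t reverse_bytes_int(uint32_t x) {
  //     return (x >> 24) | ((x >> 8) & 0x0000FF00) | ((x << 8) & 0x00FF0000) | (x << 24);
  //   }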
6157 switch(bt) { 6158 case T_LONG: 6159 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6160 break; 6161 case T_INT: 6162 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6163 break; 6164 case T_CHAR: 6165 case T_SHORT: 6166 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6167 break; 6168 default: 6169 fatal("Unsupported type %s", type2name(bt)); 6170 break; 6171 } 6172 vpshufb(dst, src, dst, vec_enc); 6173 } 6174 6175 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6176 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6177 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6178 assert(is_integral_type(bt), ""); 6179 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6180 assert(VM_Version::supports_avx512cd(), ""); 6181 switch(bt) { 6182 case T_LONG: 6183 evplzcntq(dst, ktmp, src, merge, vec_enc); 6184 break; 6185 case T_INT: 6186 evplzcntd(dst, ktmp, src, merge, vec_enc); 6187 break; 6188 case T_SHORT: 6189 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6190 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6191 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6192 vpunpckhwd(dst, xtmp1, src, vec_enc); 6193 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6194 vpackusdw(dst, xtmp2, dst, vec_enc); 6195 break; 6196 case T_BYTE: 6197 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6198 // accessing the lookup table. 6199 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6200 // accessing the lookup table. 6201 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6202 assert(VM_Version::supports_avx512bw(), ""); 6203 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6204 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6205 vpand(xtmp2, dst, src, vec_enc); 6206 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6207 vpsrlw(xtmp3, src, 4, vec_enc); 6208 vpand(xtmp3, dst, xtmp3, vec_enc); 6209 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6210 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6211 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6212 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6213 break; 6214 default: 6215 fatal("Unsupported type %s", type2name(bt)); 6216 break; 6217 } 6218 } 6219 6220 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6221 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6222 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6223 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6224 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6225 // accessing the lookup table. 6226 vpand(dst, xtmp2, src, vec_enc); 6227 vpshufb(dst, xtmp1, dst, vec_enc); 6228 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6229 // accessing the lookup table. 6230 vpsrlw(xtmp3, src, 4, vec_enc); 6231 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6232 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6233 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
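  // Scalar sketch of the T1/T2 combination for one byte (illustrative only):
  //   static const uint8_t nibble_lzc_lut[16] = { 4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0 };
  //   int lzcnt_byte(uint8_t b) {
  //     int hi = nibble_lzc_lut[b >> 4];                    // T2
  //     return hi == 4 ? 4 + nibble_lzc_lut[b & 0x0F] : hi; // add T1 only when the upper nibble is zero
  //   }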
6234 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6235 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6236 vpaddb(dst, dst, xtmp2, vec_enc); 6237 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6238 } 6239 6240 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6241 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6242 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6243 // Add zero counts of lower byte and upper byte of a word if 6244 // upper byte holds a zero value. 6245 vpsrlw(xtmp3, src, 8, vec_enc); 6246 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6247 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6248 vpsllw(xtmp2, dst, 8, vec_enc); 6249 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6250 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6251 vpsrlw(dst, dst, 8, vec_enc); 6252 } 6253 6254 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6255 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6256 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6257 // hence biased exponent can be used to compute leading zero count as per 6258 // following formula:- 6259 // LZCNT = 32 - (biased_exp - 127) 6260 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6261 6262 // Broadcast 0xFF 6263 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6264 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6265 6266 // Extract biased exponent. 6267 vcvtdq2ps(dst, src, vec_enc); 6268 vpsrld(dst, dst, 23, vec_enc); 6269 vpand(dst, dst, xtmp1, vec_enc); 6270 6271 // Broadcast 127. 6272 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6273 // Exponent = biased_exp - 127 6274 vpsubd(dst, dst, xtmp1, vec_enc); 6275 6276 // Exponent = Exponent + 1 6277 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6278 vpaddd(dst, dst, xtmp3, vec_enc); 6279 6280 // Replace -ve exponent with zero, exponent is -ve when src 6281 // lane contains a zero value. 6282 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6283 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6284 6285 // Rematerialize broadcast 32. 6286 vpslld(xtmp1, xtmp3, 5, vec_enc); 6287 // Exponent is 32 if corresponding source lane contains max_int value. 6288 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6289 // LZCNT = 32 - exponent 6290 vpsubd(dst, xtmp1, dst, vec_enc); 6291 6292 // Replace LZCNT with a value 1 if corresponding source lane 6293 // contains max_int value. 6294 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6295 6296 // Replace biased_exp with 0 if source lane value is less than zero. 6297 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6298 vblendvps(dst, dst, xtmp2, src, vec_enc); 6299 } 6300 6301 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6302 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6303 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6304 // Add zero counts of lower word and upper word of a double word if 6305 // upper word holds a zero value. 6306 vpsrld(xtmp3, src, 16, vec_enc); 6307 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6308 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6309 vpslld(xtmp2, dst, 16, vec_enc); 6310 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6311 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6312 vpsrld(dst, dst, 16, vec_enc); 6313 // Add zero counts of lower doubleword and upper doubleword of a 6314 // quadword if upper doubleword holds a zero value. 
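  // Scalar sketch of this combining step (illustrative only, not emitted code);
  // clz32() stands for the per-doubleword counts already produced in dst:
  //
  //   int clz64(uint64_t x) {
  //     uint32_t hi = uint32_t(x >> 32);
  //     uint32_t lo = uint32_t(x);
  //     return hi == 0 ? clz32(hi) + clz32(lo)   // == 32 + clz32(lo)
  //                    : clz32(hi);
  //   }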
6315 vpsrlq(xtmp3, src, 32, vec_enc); 6316 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6317 vpsllq(xtmp2, dst, 32, vec_enc); 6318 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6319 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6320 vpsrlq(dst, dst, 32, vec_enc); 6321 } 6322 6323 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6324 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6325 Register rtmp, int vec_enc) { 6326 assert(is_integral_type(bt), "unexpected type"); 6327 assert(vec_enc < Assembler::AVX_512bit, ""); 6328 switch(bt) { 6329 case T_LONG: 6330 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6331 break; 6332 case T_INT: 6333 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6334 break; 6335 case T_SHORT: 6336 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6337 break; 6338 case T_BYTE: 6339 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6340 break; 6341 default: 6342 fatal("Unsupported type %s", type2name(bt)); 6343 break; 6344 } 6345 } 6346 6347 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6348 switch(bt) { 6349 case T_BYTE: 6350 vpsubb(dst, src1, src2, vec_enc); 6351 break; 6352 case T_SHORT: 6353 vpsubw(dst, src1, src2, vec_enc); 6354 break; 6355 case T_INT: 6356 vpsubd(dst, src1, src2, vec_enc); 6357 break; 6358 case T_LONG: 6359 vpsubq(dst, src1, src2, vec_enc); 6360 break; 6361 default: 6362 fatal("Unsupported type %s", type2name(bt)); 6363 break; 6364 } 6365 } 6366 6367 // Trailing zero count computation is based on leading zero count operation as per 6368 // following equation. All AVX3 targets support AVX512CD feature which offers 6369 // direct vector instruction to compute leading zero count. 
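// As a concrete check of the equation below (illustrative only), take an 8-bit value:
//   x            = 0b00101000      (CTZ(x) == 3)
//   x - 1        = 0b00100111
//   ~x           = 0b11010111
//   (x - 1) & ~x = 0b00000111      -> exactly CTZ(x) trailing one bits
// and 8 - CLZ(0b00000111) == 8 - 5 == 3. For x == 0 the masked value is all ones,
// giving CTZ == PRIM_TYPE_WIDTH as expected.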
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6465 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6466 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6467 movl(rdx, rax); 6468 subl(rax, divisor); 6469 if (VM_Version::supports_bmi1()) { 6470 andnl(rax, rax, rdx); 6471 } else { 6472 notl(rax); 6473 andl(rax, rdx); 6474 } 6475 movl(tmp, rax); 6476 shrl(rax, 31); // quotient 6477 sarl(tmp, 31); 6478 andl(tmp, divisor); 6479 subl(rdx, tmp); // remainder 6480 bind(done); 6481 } 6482 6483 #ifdef _LP64 6484 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6485 XMMRegister xtmp2, Register rtmp) { 6486 if(VM_Version::supports_gfni()) { 6487 // Galois field instruction based bit reversal based on following algorithm. 6488 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6489 mov64(rtmp, 0x8040201008040201L); 6490 movq(xtmp1, src); 6491 movq(xtmp2, rtmp); 6492 gf2p8affineqb(xtmp1, xtmp2, 0); 6493 movq(dst, xtmp1); 6494 } else { 6495 // Swap even and odd numbered bits. 6496 movl(rtmp, src); 6497 andl(rtmp, 0x55555555); 6498 shll(rtmp, 1); 6499 movl(dst, src); 6500 andl(dst, 0xAAAAAAAA); 6501 shrl(dst, 1); 6502 orl(dst, rtmp); 6503 6504 // Swap LSB and MSB 2 bits of each nibble. 6505 movl(rtmp, dst); 6506 andl(rtmp, 0x33333333); 6507 shll(rtmp, 2); 6508 andl(dst, 0xCCCCCCCC); 6509 shrl(dst, 2); 6510 orl(dst, rtmp); 6511 6512 // Swap LSB and MSB 4 bits of each byte. 6513 movl(rtmp, dst); 6514 andl(rtmp, 0x0F0F0F0F); 6515 shll(rtmp, 4); 6516 andl(dst, 0xF0F0F0F0); 6517 shrl(dst, 4); 6518 orl(dst, rtmp); 6519 } 6520 bswapl(dst); 6521 } 6522 6523 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6524 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6525 if(VM_Version::supports_gfni()) { 6526 // Galois field instruction based bit reversal based on following algorithm. 6527 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6528 mov64(rtmp1, 0x8040201008040201L); 6529 movq(xtmp1, src); 6530 movq(xtmp2, rtmp1); 6531 gf2p8affineqb(xtmp1, xtmp2, 0); 6532 movq(dst, xtmp1); 6533 } else { 6534 // Swap even and odd numbered bits. 6535 movq(rtmp1, src); 6536 mov64(rtmp2, 0x5555555555555555L); 6537 andq(rtmp1, rtmp2); 6538 shlq(rtmp1, 1); 6539 movq(dst, src); 6540 notq(rtmp2); 6541 andq(dst, rtmp2); 6542 shrq(dst, 1); 6543 orq(dst, rtmp1); 6544 6545 // Swap LSB and MSB 2 bits of each nibble. 6546 movq(rtmp1, dst); 6547 mov64(rtmp2, 0x3333333333333333L); 6548 andq(rtmp1, rtmp2); 6549 shlq(rtmp1, 2); 6550 notq(rtmp2); 6551 andq(dst, rtmp2); 6552 shrq(dst, 2); 6553 orq(dst, rtmp1); 6554 6555 // Swap LSB and MSB 4 bits of each byte. 
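    // Scalar sketch of this swap step (illustrative only, not emitted code); the
    // 1-bit and 2-bit swaps above follow the same mask/shift/or pattern with the
    // 0x5555... and 0x3333... masks:
    //
    //   uint64_t lo = x & 0x0F0F0F0F0F0F0F0FULL;   // low nibble of every byte
    //   uint64_t hi = x & 0xF0F0F0F0F0F0F0F0ULL;   // high nibble of every byte
    //   x = (lo << 4) | (hi >> 4);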
6556 movq(rtmp1, dst); 6557 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6558 andq(rtmp1, rtmp2); 6559 shlq(rtmp1, 4); 6560 notq(rtmp2); 6561 andq(dst, rtmp2); 6562 shrq(dst, 4); 6563 orq(dst, rtmp1); 6564 } 6565 bswapq(dst); 6566 } 6567 6568 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6569 Label done; 6570 Label neg_divisor_fastpath; 6571 cmpq(divisor, 0); 6572 jccb(Assembler::less, neg_divisor_fastpath); 6573 xorl(rdx, rdx); 6574 divq(divisor); 6575 jmpb(done); 6576 bind(neg_divisor_fastpath); 6577 // Fastpath for divisor < 0: 6578 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6579 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6580 movq(rdx, rax); 6581 subq(rdx, divisor); 6582 if (VM_Version::supports_bmi1()) { 6583 andnq(rax, rdx, rax); 6584 } else { 6585 notq(rdx); 6586 andq(rax, rdx); 6587 } 6588 shrq(rax, 63); 6589 bind(done); 6590 } 6591 6592 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6593 Label done; 6594 Label neg_divisor_fastpath; 6595 cmpq(divisor, 0); 6596 jccb(Assembler::less, neg_divisor_fastpath); 6597 xorq(rdx, rdx); 6598 divq(divisor); 6599 jmp(done); 6600 bind(neg_divisor_fastpath); 6601 // Fastpath when divisor < 0: 6602 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6603 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6604 movq(rdx, rax); 6605 subq(rax, divisor); 6606 if (VM_Version::supports_bmi1()) { 6607 andnq(rax, rax, rdx); 6608 } else { 6609 notq(rax); 6610 andq(rax, rdx); 6611 } 6612 sarq(rax, 63); 6613 andq(rax, divisor); 6614 subq(rdx, rax); 6615 bind(done); 6616 } 6617 6618 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6619 Label done; 6620 Label neg_divisor_fastpath; 6621 cmpq(divisor, 0); 6622 jccb(Assembler::less, neg_divisor_fastpath); 6623 xorq(rdx, rdx); 6624 divq(divisor); 6625 jmp(done); 6626 bind(neg_divisor_fastpath); 6627 // Fastpath for divisor < 0: 6628 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6629 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6630 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6631 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6632 movq(rdx, rax); 6633 subq(rax, divisor); 6634 if (VM_Version::supports_bmi1()) { 6635 andnq(rax, rax, rdx); 6636 } else { 6637 notq(rax); 6638 andq(rax, rdx); 6639 } 6640 movq(tmp, rax); 6641 shrq(rax, 63); // quotient 6642 sarq(tmp, 63); 6643 andq(tmp, divisor); 6644 subq(rdx, tmp); // remainder 6645 bind(done); 6646 } 6647 #endif 6648 6649 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6650 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6651 int vlen_enc) { 6652 assert(VM_Version::supports_avx512bw(), ""); 6653 // Byte shuffles are inlane operations and indices are determined using 6654 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6655 // normalized to index range 0-15. This makes sure that all the multiples 6656 // of an index value are placed at same relative position in 128 bit 6657 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6658 // will be 16th element in their respective 128 bit lanes. 
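  // Scalar view of the whole selection performed below (illustrative only, not
  // emitted code), assuming byte shuffle indices in the range 0..63 for a 512-bit
  // vector:
  //
  //   for (int i = 0; i < 64; i++) {
  //     int idx  = shuffle[i];
  //     int lane = idx >> 4;                    // which 128-bit source lane (0..3)
  //     dst[i]   = src[lane * 16 + (idx & 0x0F)];
  //   }
  //
  // Each of the four steps below broadcasts one source lane and merges in the
  // result bytes whose indices fall into that lane's 16-byte range.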
6659 movl(rtmp, 16); 6660 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6661 6662 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6663 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6664 // original shuffle indices and move the shuffled lanes corresponding to true 6665 // mask to destination vector. 6666 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6667 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6668 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6669 6670 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6671 // and broadcasting second 128 bit lane. 6672 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6673 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6674 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6675 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6676 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6677 6678 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6679 // and broadcasting third 128 bit lane. 6680 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6681 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6682 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6683 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6684 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6685 6686 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6687 // and broadcasting third 128 bit lane. 6688 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6689 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6690 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6691 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6692 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6693 } 6694 6695 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6696 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6697 if (vlen_enc == AVX_128bit) { 6698 vpermilps(dst, src, shuffle, vlen_enc); 6699 } else if (bt == T_INT) { 6700 vpermd(dst, shuffle, src, vlen_enc); 6701 } else { 6702 assert(bt == T_FLOAT, ""); 6703 vpermps(dst, shuffle, src, vlen_enc); 6704 } 6705 } 6706 6707 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6708 switch(elem_bt) { 6709 case T_BYTE: 6710 if (ideal_opc == Op_SaturatingAddV) { 6711 vpaddsb(dst, src1, src2, vlen_enc); 6712 } else { 6713 assert(ideal_opc == Op_SaturatingSubV, ""); 6714 vpsubsb(dst, src1, src2, vlen_enc); 6715 } 6716 break; 6717 case T_SHORT: 6718 if (ideal_opc == Op_SaturatingAddV) { 6719 vpaddsw(dst, src1, src2, vlen_enc); 6720 } else { 6721 assert(ideal_opc == Op_SaturatingSubV, ""); 6722 vpsubsw(dst, src1, src2, vlen_enc); 6723 } 6724 break; 6725 default: 6726 fatal("Unsupported type %s", type2name(elem_bt)); 6727 break; 6728 } 6729 } 6730 6731 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6732 switch(elem_bt) { 6733 case T_BYTE: 6734 if (ideal_opc == Op_SaturatingAddV) { 6735 vpaddusb(dst, src1, src2, vlen_enc); 6736 } else { 6737 assert(ideal_opc == Op_SaturatingSubV, ""); 6738 vpsubusb(dst, src1, src2, vlen_enc); 6739 } 6740 break; 6741 case T_SHORT: 6742 if (ideal_opc == Op_SaturatingAddV) { 6743 vpaddusw(dst, src1, src2, vlen_enc); 6744 } else { 6745 assert(ideal_opc 
             == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper-bound saturation can occur.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for a saturating
// unsigned addition operation:
// overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // Generate the minimum signed value into xtmp2 (used to bias values for signed comparison).
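  // Illustrative scalar check of the biasing trick used below (not emitted code):
  // adding MIN_VALUE to both operands turns an unsigned comparison into a signed one,
  //   a <u b   <=>   (a + MIN_VALUE) <s (b + MIN_VALUE)
  // e.g. for 8-bit lanes: 200 <u 100 is false, and (200 - 128) = 72 <s (100 - 128) = -28
  // is false as well, while 100 <u 200 and -28 <s 72 are both true.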
6816 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6817 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6818 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6819 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6820 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6821 // Compute overflow detection mask = Res<1> <s T1 6822 if (elem_bt == T_INT) { 6823 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6824 } else { 6825 assert(elem_bt == T_LONG, ""); 6826 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6827 } 6828 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6829 } 6830 6831 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6832 int vlen_enc, bool xtmp2_hold_M1) { 6833 if (VM_Version::supports_avx512dq()) { 6834 evpmovq2m(ktmp, src, vlen_enc); 6835 } else { 6836 assert(VM_Version::supports_evex(), ""); 6837 if (!xtmp2_hold_M1) { 6838 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6839 } 6840 evpsraq(xtmp1, src, 63, vlen_enc); 6841 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6842 } 6843 } 6844 6845 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6846 int vlen_enc, bool xtmp2_hold_M1) { 6847 if (VM_Version::supports_avx512dq()) { 6848 evpmovd2m(ktmp, src, vlen_enc); 6849 } else { 6850 assert(VM_Version::supports_evex(), ""); 6851 if (!xtmp2_hold_M1) { 6852 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6853 } 6854 vpsrad(xtmp1, src, 31, vlen_enc); 6855 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6856 } 6857 } 6858 6859 6860 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6861 if (elem_bt == T_LONG) { 6862 if (VM_Version::supports_evex()) { 6863 evpsraq(dst, src, 63, vlen_enc); 6864 } else { 6865 vpsrad(dst, src, 31, vlen_enc); 6866 vpshufd(dst, dst, 0xF5, vlen_enc); 6867 } 6868 } else { 6869 assert(elem_bt == T_INT, ""); 6870 vpsrad(dst, src, 31, vlen_enc); 6871 } 6872 } 6873 6874 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6875 if (compute_allones) { 6876 if (vlen_enc == Assembler::AVX_512bit) { 6877 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6878 } else { 6879 vpcmpeqq(allones, allones, allones, vlen_enc); 6880 } 6881 } 6882 if (elem_bt == T_LONG) { 6883 vpsrlq(dst, allones, 1, vlen_enc); 6884 } else { 6885 assert(elem_bt == T_INT, ""); 6886 vpsrld(dst, allones, 1, vlen_enc); 6887 } 6888 } 6889 6890 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6891 if (compute_allones) { 6892 if (vlen_enc == Assembler::AVX_512bit) { 6893 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6894 } else { 6895 vpcmpeqq(allones, allones, allones, vlen_enc); 6896 } 6897 } 6898 if (elem_bt == T_LONG) { 6899 vpsllq(dst, allones, 63, vlen_enc); 6900 } else { 6901 assert(elem_bt == T_INT, ""); 6902 vpslld(dst, allones, 31, vlen_enc); 6903 } 6904 } 6905 6906 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6907 Assembler::ComparisonPredicate cond, int vlen_enc) { 6908 switch(elem_bt) { 6909 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6910 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6911 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6912 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6913 default: 
    fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using the overflow detection mask.
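  // Worked example of the rule above for T_INT lanes (illustrative only):
  // src1 = 0x7FFFFFF0 and src2 = 0x20 give res = 0x80000010, and
  // (res ^ src1) & (res ^ src2) has its sign bit set, flagging overflow;
  // since src1 >= 0 the lane is replaced with MAX (0x7FFFFFFF), whereas a
  // negative src1 would select MIN (0x80000000).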
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
7019 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7020 } 7021 7022 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7023 switch(elem_bt) { 7024 case T_BYTE: 7025 if (ideal_opc == Op_SaturatingAddV) { 7026 vpaddsb(dst, src1, src2, vlen_enc); 7027 } else { 7028 assert(ideal_opc == Op_SaturatingSubV, ""); 7029 vpsubsb(dst, src1, src2, vlen_enc); 7030 } 7031 break; 7032 case T_SHORT: 7033 if (ideal_opc == Op_SaturatingAddV) { 7034 vpaddsw(dst, src1, src2, vlen_enc); 7035 } else { 7036 assert(ideal_opc == Op_SaturatingSubV, ""); 7037 vpsubsw(dst, src1, src2, vlen_enc); 7038 } 7039 break; 7040 default: 7041 fatal("Unsupported type %s", type2name(elem_bt)); 7042 break; 7043 } 7044 } 7045 7046 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7047 switch(elem_bt) { 7048 case T_BYTE: 7049 if (ideal_opc == Op_SaturatingAddV) { 7050 vpaddusb(dst, src1, src2, vlen_enc); 7051 } else { 7052 assert(ideal_opc == Op_SaturatingSubV, ""); 7053 vpsubusb(dst, src1, src2, vlen_enc); 7054 } 7055 break; 7056 case T_SHORT: 7057 if (ideal_opc == Op_SaturatingAddV) { 7058 vpaddusw(dst, src1, src2, vlen_enc); 7059 } else { 7060 assert(ideal_opc == Op_SaturatingSubV, ""); 7061 vpsubusw(dst, src1, src2, vlen_enc); 7062 } 7063 break; 7064 default: 7065 fatal("Unsupported type %s", type2name(elem_bt)); 7066 break; 7067 } 7068 } 7069 7070 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7071 XMMRegister src2, int vlen_enc) { 7072 switch(elem_bt) { 7073 case T_BYTE: 7074 evpermi2b(dst, src1, src2, vlen_enc); 7075 break; 7076 case T_SHORT: 7077 evpermi2w(dst, src1, src2, vlen_enc); 7078 break; 7079 case T_INT: 7080 evpermi2d(dst, src1, src2, vlen_enc); 7081 break; 7082 case T_LONG: 7083 evpermi2q(dst, src1, src2, vlen_enc); 7084 break; 7085 case T_FLOAT: 7086 evpermi2ps(dst, src1, src2, vlen_enc); 7087 break; 7088 case T_DOUBLE: 7089 evpermi2pd(dst, src1, src2, vlen_enc); 7090 break; 7091 default: 7092 fatal("Unsupported type %s", type2name(elem_bt)); 7093 break; 7094 } 7095 } 7096 7097 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7098 if (is_unsigned) { 7099 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7100 } else { 7101 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7102 } 7103 } 7104 7105 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7106 if (is_unsigned) { 7107 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7108 } else { 7109 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7110 } 7111 }
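// Illustrative scalar semantics of select_from_two_vectors_evex above (not emitted
// code, and simplified): with n lanes per vector and the index vector initially held
// in dst, the VPERMI2* family roughly computes
//
//   result[i] = (idx[i] & n) == 0 ? src1[idx[i] & (n - 1)]
//                                 : src2[idx[i] & (n - 1)];
//
// i.e. bit log2(n) of each index selects the source table and the low bits select
// the lane; the index operand itself is overwritten with the result.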