/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
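    //
    // Descriptive note (assuming the default x86 nmethod entry barrier): the
    // inline fast path compares the nmethod's guard value against the
    // thread-local "disarmed" value and, when they differ, jumps to the
    // out-of-line stub (slow_path); the stub calls into the runtime and then
    // jumps back to 'continuation', bound right after the inline check.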
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
#else
  // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case 4:  // fall-through
    case 8:  // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  //  fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
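//
// Illustrative sketch (hypothetical Java, not part of this file) of why the
// I1/I2 interpreter properties keep A()'s locking balanced even when B() is
// forced into the interpreter:
//
//   void A() {            // compiled; provably balanced -> uses fast_lock/fast_unlock
//     synchronized (O) {  //   fast_lock(O)
//       B();              //   B() has unbalanced locking, so it runs interpreted;
//     }                   //   by I1 the interpreter unwinds any locks B() leaves
//   }                     //   behind (and by I2 B() cannot unlock O), so O is
//                         //   still locked by A() when fast_unlock(O) runs.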

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
#ifndef _LP64
  get_thread(boxReg);
  movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
#else // _LP64
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);
#endif

  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));  // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    movptr(reg_rax, cxq_address);
    orptr(reg_rax, EntryList_address);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

#ifdef _LP64
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
                                                XMMRegister dst, Register base,
                                                Register idx_base,
                                                Register offset, Register mask,
                                                Register mask_idx, Register rtmp,
                                                int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      if (offset != noreg) {
        addl(rtmp, offset);
      }
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
src[offset + idx_base[i]] : 0 1528 Label skip_load; 1529 btq(mask, mask_idx); 1530 jccb(Assembler::carryClear, skip_load); 1531 movl(rtmp, Address(idx_base, i * 4)); 1532 if (offset != noreg) { 1533 addl(rtmp, offset); 1534 } 1535 pinsrb(dst, Address(base, rtmp), i); 1536 bind(skip_load); 1537 incq(mask_idx); 1538 } 1539 } 1540 } 1541 #endif // _LP64 1542 1543 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1544 Register base, Register idx_base, 1545 Register offset, Register rtmp, 1546 int vlen_enc) { 1547 vpxor(dst, dst, dst, vlen_enc); 1548 if (elem_bt == T_SHORT) { 1549 for (int i = 0; i < 4; i++) { 1550 // dst[i] = src[offset + idx_base[i]] 1551 movl(rtmp, Address(idx_base, i * 4)); 1552 if (offset != noreg) { 1553 addl(rtmp, offset); 1554 } 1555 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1556 } 1557 } else { 1558 assert(elem_bt == T_BYTE, ""); 1559 for (int i = 0; i < 8; i++) { 1560 // dst[i] = src[offset + idx_base[i]] 1561 movl(rtmp, Address(idx_base, i * 4)); 1562 if (offset != noreg) { 1563 addl(rtmp, offset); 1564 } 1565 pinsrb(dst, Address(base, rtmp), i); 1566 } 1567 } 1568 } 1569 1570 /* 1571 * Gather using hybrid algorithm, first partially unroll scalar loop 1572 * to accumulate values from gather indices into a quad-word(64bit) slice. 1573 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1574 * permutation to place the slice into appropriate vector lane 1575 * locations in destination vector. Following pseudo code describes the 1576 * algorithm in detail: 1577 * 1578 * DST_VEC = ZERO_VEC 1579 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1580 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1581 * FOREACH_ITER: 1582 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1583 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1584 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1585 * PERM_INDEX = PERM_INDEX - TWO_VEC 1586 * 1587 * With each iteration, doubleword permute indices (0,1) corresponding 1588 * to gathered quadword gets right shifted by two lane positions. 1589 * 1590 */ 1591 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1592 Register base, Register idx_base, 1593 Register offset, Register mask, 1594 XMMRegister xtmp1, XMMRegister xtmp2, 1595 XMMRegister temp_dst, Register rtmp, 1596 Register mask_idx, Register length, 1597 int vector_len, int vlen_enc) { 1598 Label GATHER8_LOOP; 1599 assert(is_subword_type(elem_ty), ""); 1600 movl(length, vector_len); 1601 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1602 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1603 vallones(xtmp2, vlen_enc); 1604 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1605 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1606 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1607 1608 bind(GATHER8_LOOP); 1609 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1610 if (mask == noreg) { 1611 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1612 } else { 1613 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1614 } 1615 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1616 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1617 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1618 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1619 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1620 vpor(dst, dst, temp_dst, vlen_enc); 1621 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1622 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1623 jcc(Assembler::notEqual, GATHER8_LOOP); 1624 } 1625 1626 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1627 switch(typ) { 1628 case T_INT: 1629 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1630 break; 1631 case T_FLOAT: 1632 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1633 break; 1634 case T_LONG: 1635 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1636 break; 1637 case T_DOUBLE: 1638 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1639 break; 1640 default: 1641 assert(false,"Should not reach here."); 1642 break; 1643 } 1644 } 1645 1646 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1647 switch(typ) { 1648 case T_INT: 1649 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1650 break; 1651 case T_FLOAT: 1652 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1653 break; 1654 case T_LONG: 1655 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1656 break; 1657 case T_DOUBLE: 1658 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1659 break; 1660 default: 1661 assert(false,"Should not reach here."); 1662 break; 1663 } 1664 } 1665 1666 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1667 switch(typ) { 1668 case T_INT: 1669 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1670 break; 1671 case T_FLOAT: 1672 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1673 break; 1674 case T_LONG: 1675 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1676 break; 1677 case T_DOUBLE: 1678 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1679 break; 1680 default: 1681 assert(false,"Should not reach here."); 1682 break; 1683 } 1684 } 1685 1686 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1687 if (vlen_in_bytes <= 16) { 1688 pxor (dst, dst); 1689 psubb(dst, src); 1690 switch (elem_bt) { 1691 case T_BYTE: /* nothing to do */ break; 1692 case T_SHORT: pmovsxbw(dst, dst); break; 1693 case T_INT: pmovsxbd(dst, dst); break; 1694 case T_FLOAT: pmovsxbd(dst, dst); break; 1695 case T_LONG: pmovsxbq(dst, dst); break; 1696 case T_DOUBLE: pmovsxbq(dst, dst); break; 1697 1698 default: assert(false, "%s", type2name(elem_bt)); 1699 } 1700 } else { 1701 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1702 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1703 1704 vpxor (dst, dst, dst, vlen_enc); 1705 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1706 1707 switch (elem_bt) { 1708 case T_BYTE: /* nothing to do */ break; 1709 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1710 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1711 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1712 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1713 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1714 1715 default: assert(false, "%s", type2name(elem_bt)); 1716 } 1717 } 1718 } 1719 1720 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1721 if (novlbwdq) { 1722 vpmovsxbd(xtmp, src, vlen_enc); 1723 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1724 Assembler::eq, true, vlen_enc, noreg); 1725 } else { 1726 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1727 vpsubb(xtmp, xtmp, src, vlen_enc); 1728 evpmovb2m(dst, xtmp, vlen_enc); 1729 } 1730 } 1731 1732 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1733 switch (vlen_in_bytes) { 1734 case 4: movdl(dst, src); break; 1735 case 8: movq(dst, src); break; 1736 case 16: movdqu(dst, src); break; 1737 case 32: vmovdqu(dst, src); break; 1738 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1739 default: ShouldNotReachHere(); 1740 } 1741 } 1742 1743 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1744 assert(rscratch != noreg || always_reachable(src), "missing"); 1745 1746 if (reachable(src)) { 1747 load_vector(dst, as_Address(src), vlen_in_bytes); 1748 } else { 1749 lea(rscratch, src); 1750 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1751 } 1752 } 1753 1754 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1755 int vlen_enc = vector_length_encoding(vlen); 1756 if (VM_Version::supports_avx()) { 1757 if (bt == T_LONG) { 1758 if (VM_Version::supports_avx2()) { 1759 vpbroadcastq(dst, src, vlen_enc); 1760 } else { 1761 vmovddup(dst, src, vlen_enc); 1762 } 1763 } else if (bt == T_DOUBLE) { 1764 if (vlen_enc != Assembler::AVX_128bit) { 1765 vbroadcastsd(dst, src, vlen_enc, noreg); 1766 } else { 1767 vmovddup(dst, src, vlen_enc); 1768 } 1769 } else { 1770 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1771 vpbroadcastd(dst, src, vlen_enc); 1772 } else { 1773 vbroadcastss(dst, src, vlen_enc); 1774 } 1775 } 1776 } else if (VM_Version::supports_sse3()) { 1777 movddup(dst, src); 1778 } else { 1779 movq(dst, src); 1780 if (vlen == 16) { 1781 punpcklqdq(dst, dst); 1782 } 1783 } 1784 } 1785 1786 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1787 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1788 int offset = exact_log2(type2aelembytes(bt)) << 6; 1789 if (is_floating_point_type(bt)) { 1790 offset += 128; 1791 } 1792 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1793 load_vector(dst, addr, vlen_in_bytes); 1794 } 1795 1796 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
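//
// All of the reduce* helpers below share the same overall strategy. The
// integer reducers typically fold the upper half of the vector into the
// lower half (vextract*/pshufd followed by the packed op; integer adds use
// phaddd/phaddw pairwise folds instead) until one element is left, and
// finally fold in the scalar input src1. The ordered FP reducers
// (reduceF/reduceD) instead accumulate the lanes one at a time into dst so
// the required left-to-right evaluation order is preserved, while the
// unorderedReduce* variants reuse the halving scheme because the order does
// not matter there. A minimal scalar model of the halving scheme
// (illustration only; 'op' and the array form are hypothetical, the real
// code operates on XMM/YMM/ZMM registers):
//
//   int reduce(int opcode, int src1, int* vec, int vlen) {
//     for (int width = vlen / 2; width > 0; width /= 2) {
//       for (int i = 0; i < width; i++) {
//         vec[i] = op(opcode, vec[i], vec[i + width]); // fold upper half into lower half
//       }
//     }
//     return op(opcode, src1, vec[0]);                 // combine with the scalar input
//   }
//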
1797 1798 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1799 int vector_len = Assembler::AVX_128bit; 1800 1801 switch (opcode) { 1802 case Op_AndReductionV: pand(dst, src); break; 1803 case Op_OrReductionV: por (dst, src); break; 1804 case Op_XorReductionV: pxor(dst, src); break; 1805 case Op_MinReductionV: 1806 switch (typ) { 1807 case T_BYTE: pminsb(dst, src); break; 1808 case T_SHORT: pminsw(dst, src); break; 1809 case T_INT: pminsd(dst, src); break; 1810 case T_LONG: assert(UseAVX > 2, "required"); 1811 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1812 default: assert(false, "wrong type"); 1813 } 1814 break; 1815 case Op_MaxReductionV: 1816 switch (typ) { 1817 case T_BYTE: pmaxsb(dst, src); break; 1818 case T_SHORT: pmaxsw(dst, src); break; 1819 case T_INT: pmaxsd(dst, src); break; 1820 case T_LONG: assert(UseAVX > 2, "required"); 1821 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1822 default: assert(false, "wrong type"); 1823 } 1824 break; 1825 case Op_AddReductionVF: addss(dst, src); break; 1826 case Op_AddReductionVD: addsd(dst, src); break; 1827 case Op_AddReductionVI: 1828 switch (typ) { 1829 case T_BYTE: paddb(dst, src); break; 1830 case T_SHORT: paddw(dst, src); break; 1831 case T_INT: paddd(dst, src); break; 1832 default: assert(false, "wrong type"); 1833 } 1834 break; 1835 case Op_AddReductionVL: paddq(dst, src); break; 1836 case Op_MulReductionVF: mulss(dst, src); break; 1837 case Op_MulReductionVD: mulsd(dst, src); break; 1838 case Op_MulReductionVI: 1839 switch (typ) { 1840 case T_SHORT: pmullw(dst, src); break; 1841 case T_INT: pmulld(dst, src); break; 1842 default: assert(false, "wrong type"); 1843 } 1844 break; 1845 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1846 evpmullq(dst, dst, src, vector_len); break; 1847 default: assert(false, "wrong opcode"); 1848 } 1849 } 1850 1851 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1852 switch (opcode) { 1853 case Op_AddReductionVF: addps(dst, src); break; 1854 case Op_AddReductionVD: addpd(dst, src); break; 1855 case Op_MulReductionVF: mulps(dst, src); break; 1856 case Op_MulReductionVD: mulpd(dst, src); break; 1857 default: assert(false, "%s", NodeClassNames[opcode]); 1858 } 1859 } 1860 1861 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1862 int vector_len = Assembler::AVX_256bit; 1863 1864 switch (opcode) { 1865 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1866 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1867 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1868 case Op_MinReductionV: 1869 switch (typ) { 1870 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1871 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1872 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1873 case T_LONG: assert(UseAVX > 2, "required"); 1874 vpminsq(dst, src1, src2, vector_len); break; 1875 default: assert(false, "wrong type"); 1876 } 1877 break; 1878 case Op_MaxReductionV: 1879 switch (typ) { 1880 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1881 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1882 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1883 case T_LONG: assert(UseAVX > 2, "required"); 1884 vpmaxsq(dst, src1, src2, vector_len); break; 1885 default: assert(false, "wrong type"); 1886 } 
1887 break; 1888 case Op_AddReductionVI: 1889 switch (typ) { 1890 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1891 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1892 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1893 default: assert(false, "wrong type"); 1894 } 1895 break; 1896 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1897 case Op_MulReductionVI: 1898 switch (typ) { 1899 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1900 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1901 default: assert(false, "wrong type"); 1902 } 1903 break; 1904 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1905 default: assert(false, "wrong opcode"); 1906 } 1907 } 1908 1909 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1910 int vector_len = Assembler::AVX_256bit; 1911 1912 switch (opcode) { 1913 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1914 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1915 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1916 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1917 default: assert(false, "%s", NodeClassNames[opcode]); 1918 } 1919 } 1920 1921 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1922 XMMRegister dst, XMMRegister src, 1923 XMMRegister vtmp1, XMMRegister vtmp2) { 1924 switch (opcode) { 1925 case Op_AddReductionVF: 1926 case Op_MulReductionVF: 1927 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1928 break; 1929 1930 case Op_AddReductionVD: 1931 case Op_MulReductionVD: 1932 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1933 break; 1934 1935 default: assert(false, "wrong opcode"); 1936 } 1937 } 1938 1939 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1940 XMMRegister dst, XMMRegister src, 1941 XMMRegister vtmp1, XMMRegister vtmp2) { 1942 switch (opcode) { 1943 case Op_AddReductionVF: 1944 case Op_MulReductionVF: 1945 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1946 break; 1947 1948 case Op_AddReductionVD: 1949 case Op_MulReductionVD: 1950 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1951 break; 1952 1953 default: assert(false, "%s", NodeClassNames[opcode]); 1954 } 1955 } 1956 1957 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1958 Register dst, Register src1, XMMRegister src2, 1959 XMMRegister vtmp1, XMMRegister vtmp2) { 1960 switch (vlen) { 1961 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1962 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1963 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1964 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1965 1966 default: assert(false, "wrong vector length"); 1967 } 1968 } 1969 1970 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1971 Register dst, Register src1, XMMRegister src2, 1972 XMMRegister vtmp1, XMMRegister vtmp2) { 1973 switch (vlen) { 1974 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1975 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1976 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1977 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1978 1979 default: assert(false, "wrong vector length"); 1980 } 1981 } 1982 1983 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1984 Register dst, Register src1, XMMRegister src2, 
1985 XMMRegister vtmp1, XMMRegister vtmp2) { 1986 switch (vlen) { 1987 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1988 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1989 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1990 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1991 1992 default: assert(false, "wrong vector length"); 1993 } 1994 } 1995 1996 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1997 Register dst, Register src1, XMMRegister src2, 1998 XMMRegister vtmp1, XMMRegister vtmp2) { 1999 switch (vlen) { 2000 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2001 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2002 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2003 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2004 2005 default: assert(false, "wrong vector length"); 2006 } 2007 } 2008 2009 #ifdef _LP64 2010 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2011 Register dst, Register src1, XMMRegister src2, 2012 XMMRegister vtmp1, XMMRegister vtmp2) { 2013 switch (vlen) { 2014 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2015 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2016 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2017 2018 default: assert(false, "wrong vector length"); 2019 } 2020 } 2021 #endif // _LP64 2022 2023 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2024 switch (vlen) { 2025 case 2: 2026 assert(vtmp2 == xnoreg, ""); 2027 reduce2F(opcode, dst, src, vtmp1); 2028 break; 2029 case 4: 2030 assert(vtmp2 == xnoreg, ""); 2031 reduce4F(opcode, dst, src, vtmp1); 2032 break; 2033 case 8: 2034 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2035 break; 2036 case 16: 2037 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2038 break; 2039 default: assert(false, "wrong vector length"); 2040 } 2041 } 2042 2043 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2044 switch (vlen) { 2045 case 2: 2046 assert(vtmp2 == xnoreg, ""); 2047 reduce2D(opcode, dst, src, vtmp1); 2048 break; 2049 case 4: 2050 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2051 break; 2052 case 8: 2053 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2054 break; 2055 default: assert(false, "wrong vector length"); 2056 } 2057 } 2058 2059 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2060 switch (vlen) { 2061 case 2: 2062 assert(vtmp1 == xnoreg, ""); 2063 assert(vtmp2 == xnoreg, ""); 2064 unorderedReduce2F(opcode, dst, src); 2065 break; 2066 case 4: 2067 assert(vtmp2 == xnoreg, ""); 2068 unorderedReduce4F(opcode, dst, src, vtmp1); 2069 break; 2070 case 8: 2071 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2072 break; 2073 case 16: 2074 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2075 break; 2076 default: assert(false, "wrong vector length"); 2077 } 2078 } 2079 2080 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2081 switch (vlen) { 2082 case 2: 2083 assert(vtmp1 == xnoreg, ""); 2084 assert(vtmp2 == xnoreg, ""); 2085 unorderedReduce2D(opcode, dst, src); 2086 break; 2087 case 4: 2088 assert(vtmp2 == xnoreg, ""); 2089 unorderedReduce4D(opcode, dst, src, vtmp1); 2090 break; 2091 case 8: 
2092 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2093 break; 2094 default: assert(false, "wrong vector length"); 2095 } 2096 } 2097 2098 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 if (opcode == Op_AddReductionVI) { 2100 if (vtmp1 != src2) { 2101 movdqu(vtmp1, src2); 2102 } 2103 phaddd(vtmp1, vtmp1); 2104 } else { 2105 pshufd(vtmp1, src2, 0x1); 2106 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2107 } 2108 movdl(vtmp2, src1); 2109 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2110 movdl(dst, vtmp1); 2111 } 2112 2113 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2114 if (opcode == Op_AddReductionVI) { 2115 if (vtmp1 != src2) { 2116 movdqu(vtmp1, src2); 2117 } 2118 phaddd(vtmp1, src2); 2119 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2120 } else { 2121 pshufd(vtmp2, src2, 0xE); 2122 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2123 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2124 } 2125 } 2126 2127 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2128 if (opcode == Op_AddReductionVI) { 2129 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2130 vextracti128_high(vtmp2, vtmp1); 2131 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2132 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2133 } else { 2134 vextracti128_high(vtmp1, src2); 2135 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2136 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2137 } 2138 } 2139 2140 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2141 vextracti64x4_high(vtmp2, src2); 2142 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2143 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2144 } 2145 2146 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2147 pshufd(vtmp2, src2, 0x1); 2148 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2149 movdqu(vtmp1, vtmp2); 2150 psrldq(vtmp1, 2); 2151 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2152 movdqu(vtmp2, vtmp1); 2153 psrldq(vtmp2, 1); 2154 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2155 movdl(vtmp2, src1); 2156 pmovsxbd(vtmp1, vtmp1); 2157 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2158 pextrb(dst, vtmp1, 0x0); 2159 movsbl(dst, dst); 2160 } 2161 2162 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2163 pshufd(vtmp1, src2, 0xE); 2164 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2165 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2166 } 2167 2168 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2169 vextracti128_high(vtmp2, src2); 2170 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2171 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2172 } 2173 2174 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2175 vextracti64x4_high(vtmp1, src2); 2176 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2177 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2178 } 2179 2180 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2181 pmovsxbw(vtmp2, src2); 2182 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2183 } 2184 2185 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2186 if (UseAVX > 1) { 2187 int vector_len = Assembler::AVX_256bit; 2188 vpmovsxbw(vtmp1, src2, vector_len); 2189 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2190 } else { 2191 pmovsxbw(vtmp2, src2); 2192 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2193 pshufd(vtmp2, src2, 0x1); 2194 pmovsxbw(vtmp2, src2); 2195 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2196 } 2197 } 2198 2199 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2200 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2201 int vector_len = Assembler::AVX_512bit; 2202 vpmovsxbw(vtmp1, src2, vector_len); 2203 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2204 } else { 2205 assert(UseAVX >= 2,"Should not reach here."); 2206 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2207 vextracti128_high(vtmp2, src2); 2208 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2209 } 2210 } 2211 2212 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2213 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2214 vextracti64x4_high(vtmp2, src2); 2215 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2216 } 2217 2218 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2219 if (opcode == Op_AddReductionVI) { 2220 if (vtmp1 != src2) { 2221 movdqu(vtmp1, src2); 2222 } 2223 phaddw(vtmp1, vtmp1); 2224 phaddw(vtmp1, vtmp1); 2225 } else { 2226 pshufd(vtmp2, src2, 0x1); 2227 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2228 movdqu(vtmp1, vtmp2); 2229 psrldq(vtmp1, 2); 2230 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2231 } 2232 movdl(vtmp2, src1); 2233 pmovsxwd(vtmp1, vtmp1); 2234 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2235 pextrw(dst, vtmp1, 0x0); 2236 movswl(dst, dst); 2237 } 2238 2239 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2240 if (opcode == Op_AddReductionVI) { 2241 if (vtmp1 != src2) { 2242 movdqu(vtmp1, src2); 2243 } 2244 phaddw(vtmp1, src2); 2245 } else { 2246 pshufd(vtmp1, src2, 0xE); 2247 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2248 } 2249 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2250 } 2251 2252 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2253 if (opcode == Op_AddReductionVI) { 2254 int vector_len = Assembler::AVX_256bit; 2255 vphaddw(vtmp2, src2, src2, vector_len); 2256 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2257 } else { 2258 vextracti128_high(vtmp2, src2); 2259 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2260 } 2261 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2262 } 2263 2264 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2265 int vector_len = Assembler::AVX_256bit; 2266 vextracti64x4_high(vtmp1, src2); 2267 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2268 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2269 } 2270 2271 #ifdef _LP64 2272 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2273 pshufd(vtmp2, src2, 0xE); 2274 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2275 movdq(vtmp1, src1); 2276 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2277 movdq(dst, vtmp1); 2278 } 2279 2280 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2281 vextracti128_high(vtmp1, src2); 2282 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2283 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2284 } 2285 2286 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2287 vextracti64x4_high(vtmp2, src2); 2288 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2289 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2290 } 2291 2292 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2293 mov64(temp, -1L); 2294 bzhiq(temp, temp, len); 2295 kmovql(dst, temp); 2296 } 2297 #endif // _LP64 2298 2299 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2300 reduce_operation_128(T_FLOAT, opcode, dst, src); 2301 pshufd(vtmp, src, 0x1); 2302 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2303 } 2304 2305 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2306 reduce2F(opcode, dst, src, vtmp); 2307 pshufd(vtmp, src, 0x2); 2308 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2309 pshufd(vtmp, src, 0x3); 2310 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2311 } 2312 2313 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2314 reduce4F(opcode, dst, src, vtmp2); 2315 vextractf128_high(vtmp2, src); 2316 reduce4F(opcode, dst, vtmp2, vtmp1); 2317 } 2318 2319 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2321 vextracti64x4_high(vtmp1, src); 2322 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2323 } 2324 2325 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2326 pshufd(dst, src, 0x1); 2327 reduce_operation_128(T_FLOAT, opcode, dst, src); 2328 } 2329 2330 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2331 pshufd(vtmp, src, 0xE); 2332 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2333 unorderedReduce2F(opcode, dst, vtmp); 2334 } 2335 2336 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2337 vextractf128_high(vtmp1, src); 2338 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2339 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2340 } 2341 2342 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2343 vextractf64x4_high(vtmp2, src); 2344 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2345 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2346 } 2347 2348 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2349 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2350 pshufd(vtmp, src, 0xE); 2351 
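  // pshufd imm8 0xE selects dwords {2,3} into the low half, so vtmp's low
  // 64 bits now hold src's upper double; it is folded into the running
  // value in dst below.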
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2352 } 2353 2354 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2355 reduce2D(opcode, dst, src, vtmp2); 2356 vextractf128_high(vtmp2, src); 2357 reduce2D(opcode, dst, vtmp2, vtmp1); 2358 } 2359 2360 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2361 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2362 vextracti64x4_high(vtmp1, src); 2363 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2364 } 2365 2366 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2367 pshufd(dst, src, 0xE); 2368 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2369 } 2370 2371 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2372 vextractf128_high(vtmp, src); 2373 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2374 unorderedReduce2D(opcode, dst, vtmp); 2375 } 2376 2377 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2378 vextractf64x4_high(vtmp2, src); 2379 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2380 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2381 } 2382 2383 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2384 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2385 } 2386 2387 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2388 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2389 } 2390 2391 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2392 int vec_enc) { 2393 switch(elem_bt) { 2394 case T_INT: 2395 case T_FLOAT: 2396 vmaskmovps(dst, src, mask, vec_enc); 2397 break; 2398 case T_LONG: 2399 case T_DOUBLE: 2400 vmaskmovpd(dst, src, mask, vec_enc); 2401 break; 2402 default: 2403 fatal("Unsupported type %s", type2name(elem_bt)); 2404 break; 2405 } 2406 } 2407 2408 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2409 int vec_enc) { 2410 switch(elem_bt) { 2411 case T_INT: 2412 case T_FLOAT: 2413 vmaskmovps(dst, src, mask, vec_enc); 2414 break; 2415 case T_LONG: 2416 case T_DOUBLE: 2417 vmaskmovpd(dst, src, mask, vec_enc); 2418 break; 2419 default: 2420 fatal("Unsupported type %s", type2name(elem_bt)); 2421 break; 2422 } 2423 } 2424 2425 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2426 XMMRegister dst, XMMRegister src, 2427 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2428 XMMRegister xmm_0, XMMRegister xmm_1) { 2429 const int permconst[] = {1, 14}; 2430 XMMRegister wsrc = src; 2431 XMMRegister wdst = xmm_0; 2432 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2433 2434 int vlen_enc = Assembler::AVX_128bit; 2435 if (vlen == 16) { 2436 vlen_enc = Assembler::AVX_256bit; 2437 } 2438 2439 for (int i = log2(vlen) - 1; i >=0; i--) { 2440 if (i == 0 && !is_dst_valid) { 2441 wdst = dst; 2442 } 2443 if (i == 3) { 2444 vextracti64x4_high(wtmp, wsrc); 2445 } else if (i == 2) { 2446 vextracti128_high(wtmp, wsrc); 2447 } else { // i = [0,1] 2448 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2449 } 2450 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2451 wsrc = wdst; 2452 vlen_enc = Assembler::AVX_128bit; 2453 } 2454 if (is_dst_valid) { 2455 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2456 } 2457 } 2458 2459 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2460 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2461 XMMRegister xmm_0, XMMRegister xmm_1) { 2462 XMMRegister wsrc = src; 2463 XMMRegister wdst = xmm_0; 2464 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2465 int vlen_enc = Assembler::AVX_128bit; 2466 if (vlen == 8) { 2467 vlen_enc = Assembler::AVX_256bit; 2468 } 2469 for (int i = log2(vlen) - 1; i >=0; i--) { 2470 if (i == 0 && !is_dst_valid) { 2471 wdst = dst; 2472 } 2473 if (i == 1) { 2474 vextracti128_high(wtmp, wsrc); 2475 } else if (i == 2) { 2476 vextracti64x4_high(wtmp, wsrc); 2477 } else { 2478 assert(i == 0, "%d", i); 2479 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2480 } 2481 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2482 wsrc = wdst; 2483 vlen_enc = Assembler::AVX_128bit; 2484 } 2485 if (is_dst_valid) { 2486 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2487 } 2488 } 2489 2490 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2491 switch (bt) { 2492 case T_BYTE: pextrb(dst, src, idx); break; 2493 case T_SHORT: pextrw(dst, src, idx); break; 2494 case T_INT: pextrd(dst, src, idx); break; 2495 case T_LONG: pextrq(dst, src, idx); break; 2496 2497 default: 2498 assert(false,"Should not reach here."); 2499 break; 2500 } 2501 } 2502 2503 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2504 int esize = type2aelembytes(typ); 2505 int elem_per_lane = 16/esize; 2506 int lane = elemindex / elem_per_lane; 2507 int eindex = elemindex % elem_per_lane; 2508 2509 if (lane >= 2) { 2510 assert(UseAVX > 2, "required"); 2511 vextractf32x4(dst, src, lane & 3); 2512 return dst; 2513 } else if (lane > 0) { 2514 assert(UseAVX > 0, "required"); 2515 vextractf128(dst, src, lane); 2516 return dst; 2517 } else { 2518 return src; 2519 } 2520 } 2521 2522 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2523 if (typ == T_BYTE) { 2524 movsbl(dst, dst); 2525 } else if (typ == T_SHORT) { 2526 movswl(dst, dst); 2527 } 2528 } 2529 2530 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2531 int esize = type2aelembytes(typ); 2532 int elem_per_lane = 16/esize; 2533 int eindex = elemindex % elem_per_lane; 2534 assert(is_integral_type(typ),"required"); 2535 2536 if (eindex == 0) { 2537 if (typ == T_LONG) { 2538 movq(dst, src); 2539 } else { 2540 movdl(dst, src); 2541 movsxl(typ, dst); 2542 } 2543 } else { 2544 extract(typ, dst, src, eindex); 2545 movsxl(typ, dst); 2546 } 2547 } 2548 2549 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
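  // Floating-point companion of the integral get_elem() above: 'src' is
  // expected to already be the 128-bit lane selected by get_lane(), so only
  // the position within that lane matters here. The chosen element is moved
  // into the low slot of dst and the bits above it are cleared (with the
  // 32-bit mask for floats, via movq for doubles).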
2550 int esize = type2aelembytes(typ); 2551 int elem_per_lane = 16/esize; 2552 int eindex = elemindex % elem_per_lane; 2553 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2554 2555 if (eindex == 0) { 2556 movq(dst, src); 2557 } else { 2558 if (typ == T_FLOAT) { 2559 if (UseAVX == 0) { 2560 movdqu(dst, src); 2561 shufps(dst, dst, eindex); 2562 } else { 2563 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2564 } 2565 } else { 2566 if (UseAVX == 0) { 2567 movdqu(dst, src); 2568 psrldq(dst, eindex*esize); 2569 } else { 2570 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2571 } 2572 movq(dst, dst); 2573 } 2574 } 2575 // Zero upper bits 2576 if (typ == T_FLOAT) { 2577 if (UseAVX == 0) { 2578 assert(vtmp != xnoreg, "required."); 2579 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2580 pand(dst, vtmp); 2581 } else { 2582 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2583 } 2584 } 2585 } 2586 2587 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2588 switch(typ) { 2589 case T_BYTE: 2590 case T_BOOLEAN: 2591 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2592 break; 2593 case T_SHORT: 2594 case T_CHAR: 2595 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2596 break; 2597 case T_INT: 2598 case T_FLOAT: 2599 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2600 break; 2601 case T_LONG: 2602 case T_DOUBLE: 2603 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2604 break; 2605 default: 2606 assert(false,"Should not reach here."); 2607 break; 2608 } 2609 } 2610 2611 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2612 assert(rscratch != noreg || always_reachable(src2), "missing"); 2613 2614 switch(typ) { 2615 case T_BOOLEAN: 2616 case T_BYTE: 2617 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2618 break; 2619 case T_CHAR: 2620 case T_SHORT: 2621 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2622 break; 2623 case T_INT: 2624 case T_FLOAT: 2625 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2626 break; 2627 case T_LONG: 2628 case T_DOUBLE: 2629 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2630 break; 2631 default: 2632 assert(false,"Should not reach here."); 2633 break; 2634 } 2635 } 2636 2637 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2638 switch(typ) { 2639 case T_BYTE: 2640 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2641 break; 2642 case T_SHORT: 2643 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2644 break; 2645 case T_INT: 2646 case T_FLOAT: 2647 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2648 break; 2649 case T_LONG: 2650 case T_DOUBLE: 2651 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2652 break; 2653 default: 2654 assert(false,"Should not reach here."); 2655 break; 2656 } 2657 } 2658 2659 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2660 
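  // Sets the condition flags for a vector test: 32-byte vectors are tested
  // with 256-bit vptest/vtestps, 16-byte vectors test src1 against src2
  // directly, and shorter vectors first replicate the valid low part of src1
  // across the register (pshufd) so the untested upper bytes cannot affect
  // the result; src2 does not need the same treatment.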
assert(vlen_in_bytes <= 32, ""); 2661 int esize = type2aelembytes(bt); 2662 if (vlen_in_bytes == 32) { 2663 assert(vtmp == xnoreg, "required."); 2664 if (esize >= 4) { 2665 vtestps(src1, src2, AVX_256bit); 2666 } else { 2667 vptest(src1, src2, AVX_256bit); 2668 } 2669 return; 2670 } 2671 if (vlen_in_bytes < 16) { 2672 // Duplicate the lower part to fill the whole register, 2673 // Don't need to do so for src2 2674 assert(vtmp != xnoreg, "required"); 2675 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2676 pshufd(vtmp, src1, shuffle_imm); 2677 } else { 2678 assert(vtmp == xnoreg, "required"); 2679 vtmp = src1; 2680 } 2681 if (esize >= 4 && VM_Version::supports_avx()) { 2682 vtestps(vtmp, src2, AVX_128bit); 2683 } else { 2684 ptest(vtmp, src2); 2685 } 2686 } 2687 2688 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2689 assert(UseAVX >= 2, "required"); 2690 #ifdef ASSERT 2691 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2692 bool is_bw_supported = VM_Version::supports_avx512bw(); 2693 if (is_bw && !is_bw_supported) { 2694 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2695 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2696 "XMM register should be 0-15"); 2697 } 2698 #endif // ASSERT 2699 switch (elem_bt) { 2700 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2701 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2702 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2703 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2704 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2705 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2706 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2707 } 2708 } 2709 2710 #ifdef _LP64 2711 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2712 assert(UseAVX >= 2, "required"); 2713 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2714 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2715 if ((UseAVX > 2) && 2716 (!is_bw || VM_Version::supports_avx512bw()) && 2717 (!is_vl || VM_Version::supports_avx512vl())) { 2718 switch (elem_bt) { 2719 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2720 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2721 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2722 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2723 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2724 } 2725 } else { 2726 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2727 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2728 switch (elem_bt) { 2729 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2730 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2731 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2732 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2733 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2734 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2735 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2736 } 2737 } 2738 } 2739 #endif 2740 2741 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2742 switch (to_elem_bt) { 2743 case T_SHORT: 2744 vpmovsxbw(dst, src, vlen_enc); 2745 
break; 2746 case T_INT: 2747 vpmovsxbd(dst, src, vlen_enc); 2748 break; 2749 case T_FLOAT: 2750 vpmovsxbd(dst, src, vlen_enc); 2751 vcvtdq2ps(dst, dst, vlen_enc); 2752 break; 2753 case T_LONG: 2754 vpmovsxbq(dst, src, vlen_enc); 2755 break; 2756 case T_DOUBLE: { 2757 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2758 vpmovsxbd(dst, src, mid_vlen_enc); 2759 vcvtdq2pd(dst, dst, vlen_enc); 2760 break; 2761 } 2762 default: 2763 fatal("Unsupported type %s", type2name(to_elem_bt)); 2764 break; 2765 } 2766 } 2767 2768 //------------------------------------------------------------------------------------------- 2769 2770 // IndexOf for constant substrings with size >= 8 chars 2771 // which don't need to be loaded through stack. 2772 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2773 Register cnt1, Register cnt2, 2774 int int_cnt2, Register result, 2775 XMMRegister vec, Register tmp, 2776 int ae) { 2777 ShortBranchVerifier sbv(this); 2778 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2779 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2780 2781 // This method uses the pcmpestri instruction with bound registers 2782 // inputs: 2783 // xmm - substring 2784 // rax - substring length (elements count) 2785 // mem - scanned string 2786 // rdx - string length (elements count) 2787 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2788 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2789 // outputs: 2790 // rcx - matched index in string 2791 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2792 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2793 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2794 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2795 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2796 2797 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2798 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2799 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2800 2801 // Note, inline_string_indexOf() generates checks: 2802 // if (substr.count > string.count) return -1; 2803 // if (substr.count == 0) return 0; 2804 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2805 2806 // Load substring. 2807 if (ae == StrIntrinsicNode::UL) { 2808 pmovzxbw(vec, Address(str2, 0)); 2809 } else { 2810 movdqu(vec, Address(str2, 0)); 2811 } 2812 movl(cnt2, int_cnt2); 2813 movptr(result, str1); // string addr 2814 2815 if (int_cnt2 > stride) { 2816 jmpb(SCAN_TO_SUBSTR); 2817 2818 // Reload substr for rescan, this code 2819 // is executed only for large substrings (> 8 chars) 2820 bind(RELOAD_SUBSTR); 2821 if (ae == StrIntrinsicNode::UL) { 2822 pmovzxbw(vec, Address(str2, 0)); 2823 } else { 2824 movdqu(vec, Address(str2, 0)); 2825 } 2826 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2827 2828 bind(RELOAD_STR); 2829 // We came here after the beginning of the substring was 2830 // matched but the rest of it was not so we need to search 2831 // again. Start from the next element after the previous match. 2832 2833 // cnt2 is number of substring reminding elements and 2834 // cnt1 is number of string reminding elements when cmp failed. 
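    // (Example with made-up numbers: if int_cnt2 == 10 and the compare failed
    //  with cnt2 == 4 substring elements and cnt1 == 20 string elements left,
    //  then 20 - 4 + 10 == 26 string elements remain when counted from the
    //  point where this candidate match started.)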
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
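    // Below, cnt2 is rebiased to stride - cnt2 (a negative value), so the
    // remaining chunks can be addressed relative to the tail of the
    // substring/string (see CONT_SCAN_SUBSTR) and the loop simply counts
    // cnt2 up towards zero, stride elements at a time.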
2897 negptr(cnt2); 2898 addptr(cnt2, stride); 2899 2900 bind(SCAN_SUBSTR); 2901 subl(cnt1, stride); 2902 cmpl(cnt2, -stride); // Do not read beyond substring 2903 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2904 // Back-up strings to avoid reading beyond substring: 2905 // cnt1 = cnt1 - cnt2 + 8 2906 addl(cnt1, cnt2); // cnt2 is negative 2907 addl(cnt1, stride); 2908 movl(cnt2, stride); negptr(cnt2); 2909 bind(CONT_SCAN_SUBSTR); 2910 if (int_cnt2 < (int)G) { 2911 int tail_off1 = int_cnt2<<scale1; 2912 int tail_off2 = int_cnt2<<scale2; 2913 if (ae == StrIntrinsicNode::UL) { 2914 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2915 } else { 2916 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2917 } 2918 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2919 } else { 2920 // calculate index in register to avoid integer overflow (int_cnt2*2) 2921 movl(tmp, int_cnt2); 2922 addptr(tmp, cnt2); 2923 if (ae == StrIntrinsicNode::UL) { 2924 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2925 } else { 2926 movdqu(vec, Address(str2, tmp, scale2, 0)); 2927 } 2928 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2929 } 2930 // Need to reload strings pointers if not matched whole vector 2931 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2932 addptr(cnt2, stride); 2933 jcc(Assembler::negative, SCAN_SUBSTR); 2934 // Fall through if found full substring 2935 2936 } // (int_cnt2 > 8) 2937 2938 bind(RET_FOUND); 2939 // Found result if we matched full small substring. 2940 // Compute substr offset 2941 subptr(result, str1); 2942 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2943 shrl(result, 1); // index 2944 } 2945 bind(EXIT); 2946 2947 } // string_indexofC8 2948 2949 // Small strings are loaded through stack if they cross page boundary. 2950 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2951 Register cnt1, Register cnt2, 2952 int int_cnt2, Register result, 2953 XMMRegister vec, Register tmp, 2954 int ae) { 2955 ShortBranchVerifier sbv(this); 2956 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2957 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2958 2959 // 2960 // int_cnt2 is length of small (< 8 chars) constant substring 2961 // or (-1) for non constant substring in which case its length 2962 // is in cnt2 register. 2963 // 2964 // Note, inline_string_indexOf() generates checks: 2965 // if (substr.count > string.count) return -1; 2966 // if (substr.count == 0) return 0; 2967 // 2968 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2969 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2970 // This method uses the pcmpestri instruction with bound registers 2971 // inputs: 2972 // xmm - substring 2973 // rax - substring length (elements count) 2974 // mem - scanned string 2975 // rdx - string length (elements count) 2976 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2977 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2978 // outputs: 2979 // rcx - matched index in string 2980 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2981 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2982 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2983 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2984 2985 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2986 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2987 FOUND_CANDIDATE; 2988 2989 { //======================================================== 2990 // We don't know where these strings are located 2991 // and we can't read beyond them. Load them through stack. 2992 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2993 2994 movptr(tmp, rsp); // save old SP 2995 2996 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2997 if (int_cnt2 == (1>>scale2)) { // One byte 2998 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2999 load_unsigned_byte(result, Address(str2, 0)); 3000 movdl(vec, result); // move 32 bits 3001 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3002 // Not enough header space in 32-bit VM: 12+3 = 15. 3003 movl(result, Address(str2, -1)); 3004 shrl(result, 8); 3005 movdl(vec, result); // move 32 bits 3006 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3007 load_unsigned_short(result, Address(str2, 0)); 3008 movdl(vec, result); // move 32 bits 3009 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3010 movdl(vec, Address(str2, 0)); // move 32 bits 3011 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3012 movq(vec, Address(str2, 0)); // move 64 bits 3013 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3014 // Array header size is 12 bytes in 32-bit VM 3015 // + 6 bytes for 3 chars == 18 bytes, 3016 // enough space to load vec and shift. 3017 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3018 if (ae == StrIntrinsicNode::UL) { 3019 int tail_off = int_cnt2-8; 3020 pmovzxbw(vec, Address(str2, tail_off)); 3021 psrldq(vec, -2*tail_off); 3022 } 3023 else { 3024 int tail_off = int_cnt2*(1<<scale2); 3025 movdqu(vec, Address(str2, tail_off-16)); 3026 psrldq(vec, 16-tail_off); 3027 } 3028 } 3029 } else { // not constant substring 3030 cmpl(cnt2, stride); 3031 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3032 3033 // We can read beyond string if srt+16 does not cross page boundary 3034 // since heaps are aligned and mapped by pages. 3035 assert(os::vm_page_size() < (int)G, "default page should be small"); 3036 movl(result, str2); // We need only low 32 bits 3037 andl(result, ((int)os::vm_page_size()-1)); 3038 cmpl(result, ((int)os::vm_page_size()-16)); 3039 jccb(Assembler::belowEqual, CHECK_STR); 3040 3041 // Move small strings to stack to allow load 16 bytes into vec. 3042 subptr(rsp, 16); 3043 int stk_offset = wordSize-(1<<scale2); 3044 push(cnt2); 3045 3046 bind(COPY_SUBSTR); 3047 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3048 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3049 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3050 } else if (ae == StrIntrinsicNode::UU) { 3051 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3052 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3053 } 3054 decrement(cnt2); 3055 jccb(Assembler::notZero, COPY_SUBSTR); 3056 3057 pop(cnt2); 3058 movptr(str2, rsp); // New substring address 3059 } // non constant 3060 3061 bind(CHECK_STR); 3062 cmpl(cnt1, stride); 3063 jccb(Assembler::aboveEqual, BIG_STRINGS); 3064 3065 // Check cross page boundary. 
3066 movl(result, str1); // We need only low 32 bits 3067 andl(result, ((int)os::vm_page_size()-1)); 3068 cmpl(result, ((int)os::vm_page_size()-16)); 3069 jccb(Assembler::belowEqual, BIG_STRINGS); 3070 3071 subptr(rsp, 16); 3072 int stk_offset = -(1<<scale1); 3073 if (int_cnt2 < 0) { // not constant 3074 push(cnt2); 3075 stk_offset += wordSize; 3076 } 3077 movl(cnt2, cnt1); 3078 3079 bind(COPY_STR); 3080 if (ae == StrIntrinsicNode::LL) { 3081 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3082 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3083 } else { 3084 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3085 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3086 } 3087 decrement(cnt2); 3088 jccb(Assembler::notZero, COPY_STR); 3089 3090 if (int_cnt2 < 0) { // not constant 3091 pop(cnt2); 3092 } 3093 movptr(str1, rsp); // New string address 3094 3095 bind(BIG_STRINGS); 3096 // Load substring. 3097 if (int_cnt2 < 0) { // -1 3098 if (ae == StrIntrinsicNode::UL) { 3099 pmovzxbw(vec, Address(str2, 0)); 3100 } else { 3101 movdqu(vec, Address(str2, 0)); 3102 } 3103 push(cnt2); // substr count 3104 push(str2); // substr addr 3105 push(str1); // string addr 3106 } else { 3107 // Small (< 8 chars) constant substrings are loaded already. 3108 movl(cnt2, int_cnt2); 3109 } 3110 push(tmp); // original SP 3111 3112 } // Finished loading 3113 3114 //======================================================== 3115 // Start search 3116 // 3117 3118 movptr(result, str1); // string addr 3119 3120 if (int_cnt2 < 0) { // Only for non constant substring 3121 jmpb(SCAN_TO_SUBSTR); 3122 3123 // SP saved at sp+0 3124 // String saved at sp+1*wordSize 3125 // Substr saved at sp+2*wordSize 3126 // Substr count saved at sp+3*wordSize 3127 3128 // Reload substr for rescan, this code 3129 // is executed only for large substrings (> 8 chars) 3130 bind(RELOAD_SUBSTR); 3131 movptr(str2, Address(rsp, 2*wordSize)); 3132 movl(cnt2, Address(rsp, 3*wordSize)); 3133 if (ae == StrIntrinsicNode::UL) { 3134 pmovzxbw(vec, Address(str2, 0)); 3135 } else { 3136 movdqu(vec, Address(str2, 0)); 3137 } 3138 // We came here after the beginning of the substring was 3139 // matched but the rest of it was not so we need to search 3140 // again. Start from the next element after the previous match. 3141 subptr(str1, result); // Restore counter 3142 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3143 shrl(str1, 1); 3144 } 3145 addl(cnt1, str1); 3146 decrementl(cnt1); // Shift to next element 3147 cmpl(cnt1, cnt2); 3148 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3149 3150 addptr(result, (1<<scale1)); 3151 } // non constant 3152 3153 // Scan string for start of substr in 16-byte vectors 3154 bind(SCAN_TO_SUBSTR); 3155 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3156 pcmpestri(vec, Address(result, 0), mode); 3157 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3158 subl(cnt1, stride); 3159 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3160 cmpl(cnt1, cnt2); 3161 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3162 addptr(result, 16); 3163 3164 bind(ADJUST_STR); 3165 cmpl(cnt1, stride); // Do not read beyond string 3166 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3167 // Back-up string to avoid reading beyond string. 
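// Illustrative sketch (not generated code) of the adjustment below:
// fewer than 'stride' elements are left, so the final 16-byte load is
// anchored at the end of the string instead of the current position:
//   result = result + cnt1*elem_size - 16;  // last 16 bytes of the string
//   cnt1   = stride;                        // scan one full vector again
// The vector overlaps data that was already scanned, but never reads
// past the end of the string.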
3168 lea(result, Address(result, cnt1, scale1, -16)); 3169 movl(cnt1, stride); 3170 jmpb(SCAN_TO_SUBSTR); 3171 3172 // Found a potential substr 3173 bind(FOUND_CANDIDATE); 3174 // After pcmpestri tmp(rcx) contains matched element index 3175 3176 // Make sure string is still long enough 3177 subl(cnt1, tmp); 3178 cmpl(cnt1, cnt2); 3179 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3180 // Left less then substring. 3181 3182 bind(RET_NOT_FOUND); 3183 movl(result, -1); 3184 jmp(CLEANUP); 3185 3186 bind(FOUND_SUBSTR); 3187 // Compute start addr of substr 3188 lea(result, Address(result, tmp, scale1)); 3189 if (int_cnt2 > 0) { // Constant substring 3190 // Repeat search for small substring (< 8 chars) 3191 // from new point without reloading substring. 3192 // Have to check that we don't read beyond string. 3193 cmpl(tmp, stride-int_cnt2); 3194 jccb(Assembler::greater, ADJUST_STR); 3195 // Fall through if matched whole substring. 3196 } else { // non constant 3197 assert(int_cnt2 == -1, "should be != 0"); 3198 3199 addl(tmp, cnt2); 3200 // Found result if we matched whole substring. 3201 cmpl(tmp, stride); 3202 jcc(Assembler::lessEqual, RET_FOUND); 3203 3204 // Repeat search for small substring (<= 8 chars) 3205 // from new point 'str1' without reloading substring. 3206 cmpl(cnt2, stride); 3207 // Have to check that we don't read beyond string. 3208 jccb(Assembler::lessEqual, ADJUST_STR); 3209 3210 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3211 // Compare the rest of substring (> 8 chars). 3212 movptr(str1, result); 3213 3214 cmpl(tmp, cnt2); 3215 // First 8 chars are already matched. 3216 jccb(Assembler::equal, CHECK_NEXT); 3217 3218 bind(SCAN_SUBSTR); 3219 pcmpestri(vec, Address(str1, 0), mode); 3220 // Need to reload strings pointers if not matched whole vector 3221 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3222 3223 bind(CHECK_NEXT); 3224 subl(cnt2, stride); 3225 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3226 addptr(str1, 16); 3227 if (ae == StrIntrinsicNode::UL) { 3228 addptr(str2, 8); 3229 } else { 3230 addptr(str2, 16); 3231 } 3232 subl(cnt1, stride); 3233 cmpl(cnt2, stride); // Do not read beyond substring 3234 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3235 // Back-up strings to avoid reading beyond substring. 
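// Illustrative sketch (not generated code, illustrative names) of the
// adjustment below: fewer than 'stride' substring elements remain, so
// both pointers are re-anchored at the last full vector of the substring:
//   str2 += cnt2*substr_elem_size - bytes_per_load;  // 8 for UL, else 16
//   str1 += cnt2*str_elem_size    - 16;
//   cnt1  = cnt1 - cnt2 + stride;   // keep the string counter in step
//   cnt2  = stride;                 // one last full-vector compare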
3236 3237 if (ae == StrIntrinsicNode::UL) { 3238 lea(str2, Address(str2, cnt2, scale2, -8)); 3239 lea(str1, Address(str1, cnt2, scale1, -16)); 3240 } else { 3241 lea(str2, Address(str2, cnt2, scale2, -16)); 3242 lea(str1, Address(str1, cnt2, scale1, -16)); 3243 } 3244 subl(cnt1, cnt2); 3245 movl(cnt2, stride); 3246 addl(cnt1, stride); 3247 bind(CONT_SCAN_SUBSTR); 3248 if (ae == StrIntrinsicNode::UL) { 3249 pmovzxbw(vec, Address(str2, 0)); 3250 } else { 3251 movdqu(vec, Address(str2, 0)); 3252 } 3253 jmp(SCAN_SUBSTR); 3254 3255 bind(RET_FOUND_LONG); 3256 movptr(str1, Address(rsp, wordSize)); 3257 } // non constant 3258 3259 bind(RET_FOUND); 3260 // Compute substr offset 3261 subptr(result, str1); 3262 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3263 shrl(result, 1); // index 3264 } 3265 bind(CLEANUP); 3266 pop(rsp); // restore SP 3267 3268 } // string_indexof 3269 3270 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3271 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3272 ShortBranchVerifier sbv(this); 3273 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3274 3275 int stride = 8; 3276 3277 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3278 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3279 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3280 FOUND_SEQ_CHAR, DONE_LABEL; 3281 3282 movptr(result, str1); 3283 if (UseAVX >= 2) { 3284 cmpl(cnt1, stride); 3285 jcc(Assembler::less, SCAN_TO_CHAR); 3286 cmpl(cnt1, 2*stride); 3287 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3288 movdl(vec1, ch); 3289 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3290 vpxor(vec2, vec2); 3291 movl(tmp, cnt1); 3292 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3293 andl(cnt1,0x0000000F); //tail count (in chars) 3294 3295 bind(SCAN_TO_16_CHAR_LOOP); 3296 vmovdqu(vec3, Address(result, 0)); 3297 vpcmpeqw(vec3, vec3, vec1, 1); 3298 vptest(vec2, vec3); 3299 jcc(Assembler::carryClear, FOUND_CHAR); 3300 addptr(result, 32); 3301 subl(tmp, 2*stride); 3302 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3303 jmp(SCAN_TO_8_CHAR); 3304 bind(SCAN_TO_8_CHAR_INIT); 3305 movdl(vec1, ch); 3306 pshuflw(vec1, vec1, 0x00); 3307 pshufd(vec1, vec1, 0); 3308 pxor(vec2, vec2); 3309 } 3310 bind(SCAN_TO_8_CHAR); 3311 cmpl(cnt1, stride); 3312 jcc(Assembler::less, SCAN_TO_CHAR); 3313 if (UseAVX < 2) { 3314 movdl(vec1, ch); 3315 pshuflw(vec1, vec1, 0x00); 3316 pshufd(vec1, vec1, 0); 3317 pxor(vec2, vec2); 3318 } 3319 movl(tmp, cnt1); 3320 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3321 andl(cnt1,0x00000007); //tail count (in chars) 3322 3323 bind(SCAN_TO_8_CHAR_LOOP); 3324 movdqu(vec3, Address(result, 0)); 3325 pcmpeqw(vec3, vec1); 3326 ptest(vec2, vec3); 3327 jcc(Assembler::carryClear, FOUND_CHAR); 3328 addptr(result, 16); 3329 subl(tmp, stride); 3330 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3331 bind(SCAN_TO_CHAR); 3332 testl(cnt1, cnt1); 3333 jcc(Assembler::zero, RET_NOT_FOUND); 3334 bind(SCAN_TO_CHAR_LOOP); 3335 load_unsigned_short(tmp, Address(result, 0)); 3336 cmpl(ch, tmp); 3337 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3338 addptr(result, 2); 3339 subl(cnt1, 1); 3340 jccb(Assembler::zero, RET_NOT_FOUND); 3341 jmp(SCAN_TO_CHAR_LOOP); 3342 3343 bind(RET_NOT_FOUND); 3344 movl(result, -1); 3345 jmpb(DONE_LABEL); 3346 3347 bind(FOUND_CHAR); 3348 if (UseAVX >= 2) { 3349 vpmovmskb(tmp, vec3); 3350 } else { 3351 pmovmskb(tmp, vec3); 3352 } 3353 bsfl(ch, tmp); 3354 addptr(result, ch); 3355 3356 bind(FOUND_SEQ_CHAR); 3357 
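// Illustrative sketch (not generated code) of the index computation
// below: 'result' points at the matching char, so
//   return (result - str1) >> 1;   // byte distance -> UTF-16 char index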
subptr(result, str1); 3358 shrl(result, 1); 3359 3360 bind(DONE_LABEL); 3361 } // string_indexof_char 3362 3363 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3364 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3365 ShortBranchVerifier sbv(this); 3366 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3367 3368 int stride = 16; 3369 3370 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3371 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3372 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3373 FOUND_SEQ_CHAR, DONE_LABEL; 3374 3375 movptr(result, str1); 3376 if (UseAVX >= 2) { 3377 cmpl(cnt1, stride); 3378 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3379 cmpl(cnt1, stride*2); 3380 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3381 movdl(vec1, ch); 3382 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3383 vpxor(vec2, vec2); 3384 movl(tmp, cnt1); 3385 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3386 andl(cnt1,0x0000001F); //tail count (in chars) 3387 3388 bind(SCAN_TO_32_CHAR_LOOP); 3389 vmovdqu(vec3, Address(result, 0)); 3390 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3391 vptest(vec2, vec3); 3392 jcc(Assembler::carryClear, FOUND_CHAR); 3393 addptr(result, 32); 3394 subl(tmp, stride*2); 3395 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3396 jmp(SCAN_TO_16_CHAR); 3397 3398 bind(SCAN_TO_16_CHAR_INIT); 3399 movdl(vec1, ch); 3400 pxor(vec2, vec2); 3401 pshufb(vec1, vec2); 3402 } 3403 3404 bind(SCAN_TO_16_CHAR); 3405 cmpl(cnt1, stride); 3406 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3407 if (UseAVX < 2) { 3408 movdl(vec1, ch); 3409 pxor(vec2, vec2); 3410 pshufb(vec1, vec2); 3411 } 3412 movl(tmp, cnt1); 3413 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3414 andl(cnt1,0x0000000F); //tail count (in bytes) 3415 3416 bind(SCAN_TO_16_CHAR_LOOP); 3417 movdqu(vec3, Address(result, 0)); 3418 pcmpeqb(vec3, vec1); 3419 ptest(vec2, vec3); 3420 jcc(Assembler::carryClear, FOUND_CHAR); 3421 addptr(result, 16); 3422 subl(tmp, stride); 3423 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
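// Scalar tail, illustrative sketch (not generated code): fewer than 16
// bytes remain, so they are checked one at a time:
//   for (; cnt1 != 0; cnt1--, result++) {
//     if (*(jubyte*)result == ch) goto FOUND_SEQ_CHAR;
//   }
//   return -1;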
3424 3425 bind(SCAN_TO_CHAR_INIT); 3426 testl(cnt1, cnt1); 3427 jcc(Assembler::zero, RET_NOT_FOUND); 3428 bind(SCAN_TO_CHAR_LOOP); 3429 load_unsigned_byte(tmp, Address(result, 0)); 3430 cmpl(ch, tmp); 3431 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3432 addptr(result, 1); 3433 subl(cnt1, 1); 3434 jccb(Assembler::zero, RET_NOT_FOUND); 3435 jmp(SCAN_TO_CHAR_LOOP); 3436 3437 bind(RET_NOT_FOUND); 3438 movl(result, -1); 3439 jmpb(DONE_LABEL); 3440 3441 bind(FOUND_CHAR); 3442 if (UseAVX >= 2) { 3443 vpmovmskb(tmp, vec3); 3444 } else { 3445 pmovmskb(tmp, vec3); 3446 } 3447 bsfl(ch, tmp); 3448 addptr(result, ch); 3449 3450 bind(FOUND_SEQ_CHAR); 3451 subptr(result, str1); 3452 3453 bind(DONE_LABEL); 3454 } // stringL_indexof_char 3455 3456 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3457 switch (eltype) { 3458 case T_BOOLEAN: return sizeof(jboolean); 3459 case T_BYTE: return sizeof(jbyte); 3460 case T_SHORT: return sizeof(jshort); 3461 case T_CHAR: return sizeof(jchar); 3462 case T_INT: return sizeof(jint); 3463 default: 3464 ShouldNotReachHere(); 3465 return -1; 3466 } 3467 } 3468 3469 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3470 switch (eltype) { 3471 // T_BOOLEAN used as surrogate for unsigned byte 3472 case T_BOOLEAN: movzbl(dst, src); break; 3473 case T_BYTE: movsbl(dst, src); break; 3474 case T_SHORT: movswl(dst, src); break; 3475 case T_CHAR: movzwl(dst, src); break; 3476 case T_INT: movl(dst, src); break; 3477 default: 3478 ShouldNotReachHere(); 3479 } 3480 } 3481 3482 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3483 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3484 } 3485 3486 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3487 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3488 } 3489 3490 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3491 const int vlen = Assembler::AVX_256bit; 3492 switch (eltype) { 3493 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3494 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3495 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3496 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3497 case T_INT: 3498 // do nothing 3499 break; 3500 default: 3501 ShouldNotReachHere(); 3502 } 3503 } 3504 3505 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3506 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3507 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3508 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3509 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3510 BasicType eltype) { 3511 ShortBranchVerifier sbv(this); 3512 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3513 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3514 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3515 3516 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3517 SHORT_UNROLLED_LOOP_EXIT, 3518 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3519 UNROLLED_VECTOR_LOOP_BEGIN, 3520 END; 3521 switch (eltype) { 3522 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3523 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3524 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3525 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3526 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3527 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3528 } 3529 3530 // For "renaming" for readibility of the code 3531 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3532 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3533 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3534 3535 const int elsize = arrays_hashcode_elsize(eltype); 3536 3537 /* 3538 if (cnt1 >= 2) { 3539 if (cnt1 >= 32) { 3540 UNROLLED VECTOR LOOP 3541 } 3542 UNROLLED SCALAR LOOP 3543 } 3544 SINGLE SCALAR 3545 */ 3546 3547 cmpl(cnt1, 32); 3548 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3549 3550 // cnt1 >= 32 && generate_vectorized_loop 3551 xorl(index, index); 3552 3553 // vresult = IntVector.zero(I256); 3554 for (int idx = 0; idx < 4; idx++) { 3555 vpxor(vresult[idx], vresult[idx]); 3556 } 3557 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3558 Register bound = tmp2; 3559 Register next = tmp3; 3560 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3561 movl(next, Address(tmp2, 0)); 3562 movdl(vnext, next); 3563 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3564 3565 // index = 0; 3566 // bound = cnt1 & ~(32 - 1); 3567 movl(bound, cnt1); 3568 andl(bound, ~(32 - 1)); 3569 // for (; index < bound; index += 32) { 3570 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3571 // result *= next; 3572 imull(result, next); 3573 // loop fission to upfront the cost of fetching from memory, OOO execution 3574 // can then hopefully do a better job of prefetching 3575 for (int idx = 0; idx < 4; idx++) { 3576 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3577 } 3578 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3579 for (int idx = 0; idx < 4; idx++) { 3580 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3581 arrays_hashcode_elvcast(vtmp[idx], eltype); 3582 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3583 } 3584 // index += 32; 3585 addl(index, 32); 3586 // index < bound; 3587 cmpl(index, bound); 3588 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3589 // } 3590 3591 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3592 subl(cnt1, bound); 3593 // release bound 3594 3595 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3596 for (int idx = 0; idx < 4; idx++) { 3597 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3598 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3599 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3600 } 3601 // result += vresult.reduceLanes(ADD); 3602 for (int idx = 0; idx < 4; idx++) { 3603 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3604 } 3605 3606 // } else if (cnt1 < 32) { 3607 3608 bind(SHORT_UNROLLED_BEGIN); 3609 // int i = 1; 3610 movl(index, 1); 3611 cmpl(index, cnt1); 3612 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3613 3614 // for (; i < cnt1 ; i += 2) { 3615 bind(SHORT_UNROLLED_LOOP_BEGIN); 3616 movl(tmp3, 961); 3617 imull(result, tmp3); 3618 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3619 movl(tmp3, tmp2); 3620 shll(tmp3, 5); 3621 subl(tmp3, tmp2); 3622 addl(result, tmp3); 3623 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3624 addl(result, tmp3); 3625 addl(index, 2); 3626 cmpl(index, cnt1); 3627 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3628 3629 // } 3630 // if (i >= cnt1) { 3631 bind(SHORT_UNROLLED_LOOP_EXIT); 3632 jccb(Assembler::greater, END); 3633 movl(tmp2, result); 3634 shll(result, 5); 3635 subl(result, tmp2); 3636 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3637 addl(result, tmp3); 3638 // } 3639 bind(END); 3640 3641 BLOCK_COMMENT("} // arrays_hashcode"); 3642 3643 } // arrays_hashcode 3644 3645 // helper function for string_compare 3646 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3647 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3648 Address::ScaleFactor scale2, Register index, int ae) { 3649 if (ae == StrIntrinsicNode::LL) { 3650 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3651 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3652 } else if (ae == StrIntrinsicNode::UU) { 3653 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3654 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3655 } else { 3656 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3657 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3658 } 3659 } 3660 3661 // Compare strings, used for char[] and byte[]. 3662 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3663 Register cnt1, Register cnt2, Register result, 3664 XMMRegister vec1, int ae, KRegister mask) { 3665 ShortBranchVerifier sbv(this); 3666 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3667 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3668 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3669 int stride2x2 = 0x40; 3670 Address::ScaleFactor scale = Address::no_scale; 3671 Address::ScaleFactor scale1 = Address::no_scale; 3672 Address::ScaleFactor scale2 = Address::no_scale; 3673 3674 if (ae != StrIntrinsicNode::LL) { 3675 stride2x2 = 0x20; 3676 } 3677 3678 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3679 shrl(cnt2, 1); 3680 } 3681 // Compute the minimum of the string lengths and the 3682 // difference of the string lengths (stack). 3683 // Do the conditional move stuff 3684 movl(result, cnt1); 3685 subl(cnt1, cnt2); 3686 push(cnt1); 3687 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3688 3689 // Is the minimum length zero? 
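// If it is, one string is a prefix of the other and the pushed length
// difference is the final answer.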
3690 testl(cnt2, cnt2); 3691 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3692 if (ae == StrIntrinsicNode::LL) { 3693 // Load first bytes 3694 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3695 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3696 } else if (ae == StrIntrinsicNode::UU) { 3697 // Load first characters 3698 load_unsigned_short(result, Address(str1, 0)); 3699 load_unsigned_short(cnt1, Address(str2, 0)); 3700 } else { 3701 load_unsigned_byte(result, Address(str1, 0)); 3702 load_unsigned_short(cnt1, Address(str2, 0)); 3703 } 3704 subl(result, cnt1); 3705 jcc(Assembler::notZero, POP_LABEL); 3706 3707 if (ae == StrIntrinsicNode::UU) { 3708 // Divide length by 2 to get number of chars 3709 shrl(cnt2, 1); 3710 } 3711 cmpl(cnt2, 1); 3712 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3713 3714 // Check if the strings start at the same location and setup scale and stride 3715 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3716 cmpptr(str1, str2); 3717 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3718 if (ae == StrIntrinsicNode::LL) { 3719 scale = Address::times_1; 3720 stride = 16; 3721 } else { 3722 scale = Address::times_2; 3723 stride = 8; 3724 } 3725 } else { 3726 scale1 = Address::times_1; 3727 scale2 = Address::times_2; 3728 // scale not used 3729 stride = 8; 3730 } 3731 3732 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3733 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3734 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3735 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3736 Label COMPARE_TAIL_LONG; 3737 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3738 3739 int pcmpmask = 0x19; 3740 if (ae == StrIntrinsicNode::LL) { 3741 pcmpmask &= ~0x01; 3742 } 3743 3744 // Setup to compare 16-chars (32-bytes) vectors, 3745 // start from first character again because it has aligned address. 3746 if (ae == StrIntrinsicNode::LL) { 3747 stride2 = 32; 3748 } else { 3749 stride2 = 16; 3750 } 3751 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3752 adr_stride = stride << scale; 3753 } else { 3754 adr_stride1 = 8; //stride << scale1; 3755 adr_stride2 = 16; //stride << scale2; 3756 } 3757 3758 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3759 // rax and rdx are used by pcmpestri as elements counters 3760 movl(result, cnt2); 3761 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3762 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3763 3764 // fast path : compare first 2 8-char vectors. 
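// pcmpestri below runs in "equal each" mode with negated polarity, so
// CF is set when the two vectors differ and rcx receives the index of
// the first mismatching element (the operand conventions are spelled
// out in more detail further down, before the SSE4.2-only loop).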
3765 bind(COMPARE_16_CHARS); 3766 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3767 movdqu(vec1, Address(str1, 0)); 3768 } else { 3769 pmovzxbw(vec1, Address(str1, 0)); 3770 } 3771 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3772 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3773 3774 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3775 movdqu(vec1, Address(str1, adr_stride)); 3776 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3777 } else { 3778 pmovzxbw(vec1, Address(str1, adr_stride1)); 3779 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3780 } 3781 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3782 addl(cnt1, stride); 3783 3784 // Compare the characters at index in cnt1 3785 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3786 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3787 subl(result, cnt2); 3788 jmp(POP_LABEL); 3789 3790 // Setup the registers to start vector comparison loop 3791 bind(COMPARE_WIDE_VECTORS); 3792 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3793 lea(str1, Address(str1, result, scale)); 3794 lea(str2, Address(str2, result, scale)); 3795 } else { 3796 lea(str1, Address(str1, result, scale1)); 3797 lea(str2, Address(str2, result, scale2)); 3798 } 3799 subl(result, stride2); 3800 subl(cnt2, stride2); 3801 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3802 negptr(result); 3803 3804 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3805 bind(COMPARE_WIDE_VECTORS_LOOP); 3806 3807 #ifdef _LP64 3808 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3809 cmpl(cnt2, stride2x2); 3810 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3811 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3812 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3813 3814 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3816 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3817 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3818 } else { 3819 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3820 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3821 } 3822 kortestql(mask, mask); 3823 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3824 addptr(result, stride2x2); // update since we already compared at this addr 3825 subl(cnt2, stride2x2); // and sub the size too 3826 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3827 3828 vpxor(vec1, vec1); 3829 jmpb(COMPARE_WIDE_TAIL); 3830 }//if (VM_Version::supports_avx512vlbw()) 3831 #endif // _LP64 3832 3833 3834 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3835 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3836 vmovdqu(vec1, Address(str1, result, scale)); 3837 vpxor(vec1, Address(str2, result, scale)); 3838 } else { 3839 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3840 vpxor(vec1, Address(str2, result, scale2)); 3841 } 3842 vptest(vec1, vec1); 3843 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3844 addptr(result, stride2); 3845 subl(cnt2, stride2); 3846 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3847 // clean upper bits of YMM registers 
3848 vpxor(vec1, vec1); 3849 3850 // compare wide vectors tail 3851 bind(COMPARE_WIDE_TAIL); 3852 testptr(result, result); 3853 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3854 3855 movl(result, stride2); 3856 movl(cnt2, result); 3857 negptr(result); 3858 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3859 3860 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3861 bind(VECTOR_NOT_EQUAL); 3862 // clean upper bits of YMM registers 3863 vpxor(vec1, vec1); 3864 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3865 lea(str1, Address(str1, result, scale)); 3866 lea(str2, Address(str2, result, scale)); 3867 } else { 3868 lea(str1, Address(str1, result, scale1)); 3869 lea(str2, Address(str2, result, scale2)); 3870 } 3871 jmp(COMPARE_16_CHARS); 3872 3873 // Compare tail chars, length between 1 to 15 chars 3874 bind(COMPARE_TAIL_LONG); 3875 movl(cnt2, result); 3876 cmpl(cnt2, stride); 3877 jcc(Assembler::less, COMPARE_SMALL_STR); 3878 3879 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3880 movdqu(vec1, Address(str1, 0)); 3881 } else { 3882 pmovzxbw(vec1, Address(str1, 0)); 3883 } 3884 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3885 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3886 subptr(cnt2, stride); 3887 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3888 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3889 lea(str1, Address(str1, result, scale)); 3890 lea(str2, Address(str2, result, scale)); 3891 } else { 3892 lea(str1, Address(str1, result, scale1)); 3893 lea(str2, Address(str2, result, scale2)); 3894 } 3895 negptr(cnt2); 3896 jmpb(WHILE_HEAD_LABEL); 3897 3898 bind(COMPARE_SMALL_STR); 3899 } else if (UseSSE42Intrinsics) { 3900 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3901 int pcmpmask = 0x19; 3902 // Setup to compare 8-char (16-byte) vectors, 3903 // start from first character again because it has aligned address. 
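// The loop below uses the usual negative-index idiom (illustrative,
// not generated code, illustrative names):
//   str1 += min_len; str2 += min_len; i = -min_len;
//   for (v = vector_count; v != 0; v -= stride, i += stride)
//     compare one vector of 'stride' elements at str1[i] / str2[i];
// so a single index register serves as a negative offset from the end
// of both strings.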
3904 movl(result, cnt2); 3905 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3906 if (ae == StrIntrinsicNode::LL) { 3907 pcmpmask &= ~0x01; 3908 } 3909 jcc(Assembler::zero, COMPARE_TAIL); 3910 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3911 lea(str1, Address(str1, result, scale)); 3912 lea(str2, Address(str2, result, scale)); 3913 } else { 3914 lea(str1, Address(str1, result, scale1)); 3915 lea(str2, Address(str2, result, scale2)); 3916 } 3917 negptr(result); 3918 3919 // pcmpestri 3920 // inputs: 3921 // vec1- substring 3922 // rax - negative string length (elements count) 3923 // mem - scanned string 3924 // rdx - string length (elements count) 3925 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3926 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3927 // outputs: 3928 // rcx - first mismatched element index 3929 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3930 3931 bind(COMPARE_WIDE_VECTORS); 3932 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3933 movdqu(vec1, Address(str1, result, scale)); 3934 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3935 } else { 3936 pmovzxbw(vec1, Address(str1, result, scale1)); 3937 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3938 } 3939 // After pcmpestri cnt1(rcx) contains mismatched element index 3940 3941 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3942 addptr(result, stride); 3943 subptr(cnt2, stride); 3944 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3945 3946 // compare wide vectors tail 3947 testptr(result, result); 3948 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3949 3950 movl(cnt2, stride); 3951 movl(result, stride); 3952 negptr(result); 3953 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3954 movdqu(vec1, Address(str1, result, scale)); 3955 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3956 } else { 3957 pmovzxbw(vec1, Address(str1, result, scale1)); 3958 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3959 } 3960 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3961 3962 // Mismatched characters in the vectors 3963 bind(VECTOR_NOT_EQUAL); 3964 addptr(cnt1, result); 3965 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3966 subl(result, cnt2); 3967 jmpb(POP_LABEL); 3968 3969 bind(COMPARE_TAIL); // limit is zero 3970 movl(cnt2, result); 3971 // Fallthru to tail compare 3972 } 3973 // Shift str2 and str1 to the end of the arrays, negate min 3974 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3975 lea(str1, Address(str1, cnt2, scale)); 3976 lea(str2, Address(str2, cnt2, scale)); 3977 } else { 3978 lea(str1, Address(str1, cnt2, scale1)); 3979 lea(str2, Address(str2, cnt2, scale2)); 3980 } 3981 decrementl(cnt2); // first character was compared already 3982 negptr(cnt2); 3983 3984 // Compare the rest of the elements 3985 bind(WHILE_HEAD_LABEL); 3986 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3987 subl(result, cnt1); 3988 jccb(Assembler::notZero, POP_LABEL); 3989 increment(cnt2); 3990 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3991 3992 // Strings are equal up to min length. Return the length difference. 
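// Illustrative sketch (not generated code) of the epilogue below:
//   result = cnt1 - cnt2;          // difference pushed at entry
//   if (ae == UU) result >>= 1;    // byte difference -> char difference
// which matches String.compareTo when one string is a prefix of the other.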
3993 bind(LENGTH_DIFF_LABEL); 3994 pop(result); 3995 if (ae == StrIntrinsicNode::UU) { 3996 // Divide diff by 2 to get number of chars 3997 sarl(result, 1); 3998 } 3999 jmpb(DONE_LABEL); 4000 4001 #ifdef _LP64 4002 if (VM_Version::supports_avx512vlbw()) { 4003 4004 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4005 4006 kmovql(cnt1, mask); 4007 notq(cnt1); 4008 bsfq(cnt2, cnt1); 4009 if (ae != StrIntrinsicNode::LL) { 4010 // Divide diff by 2 to get number of chars 4011 sarl(cnt2, 1); 4012 } 4013 addq(result, cnt2); 4014 if (ae == StrIntrinsicNode::LL) { 4015 load_unsigned_byte(cnt1, Address(str2, result)); 4016 load_unsigned_byte(result, Address(str1, result)); 4017 } else if (ae == StrIntrinsicNode::UU) { 4018 load_unsigned_short(cnt1, Address(str2, result, scale)); 4019 load_unsigned_short(result, Address(str1, result, scale)); 4020 } else { 4021 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4022 load_unsigned_byte(result, Address(str1, result, scale1)); 4023 } 4024 subl(result, cnt1); 4025 jmpb(POP_LABEL); 4026 }//if (VM_Version::supports_avx512vlbw()) 4027 #endif // _LP64 4028 4029 // Discard the stored length difference 4030 bind(POP_LABEL); 4031 pop(cnt1); 4032 4033 // That's it 4034 bind(DONE_LABEL); 4035 if(ae == StrIntrinsicNode::UL) { 4036 negl(result); 4037 } 4038 4039 } 4040 4041 // Search for Non-ASCII character (Negative byte value) in a byte array, 4042 // return the index of the first such character, otherwise the length 4043 // of the array segment searched. 4044 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4045 // @IntrinsicCandidate 4046 // public static int countPositives(byte[] ba, int off, int len) { 4047 // for (int i = off; i < off + len; i++) { 4048 // if (ba[i] < 0) { 4049 // return i - off; 4050 // } 4051 // } 4052 // return len; 4053 // } 4054 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4055 Register result, Register tmp1, 4056 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4057 // rsi: byte array 4058 // rcx: len 4059 // rax: result 4060 ShortBranchVerifier sbv(this); 4061 assert_different_registers(ary1, len, result, tmp1); 4062 assert_different_registers(vec1, vec2); 4063 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4064 4065 movl(result, len); // copy 4066 // len == 0 4067 testl(len, len); 4068 jcc(Assembler::zero, DONE); 4069 4070 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4071 VM_Version::supports_avx512vlbw() && 4072 VM_Version::supports_bmi2()) { 4073 4074 Label test_64_loop, test_tail, BREAK_LOOP; 4075 movl(tmp1, len); 4076 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4077 4078 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4079 andl(len, 0xffffffc0); // vector count (in chars) 4080 jccb(Assembler::zero, test_tail); 4081 4082 lea(ary1, Address(ary1, len, Address::times_1)); 4083 negptr(len); 4084 4085 bind(test_64_loop); 4086 // Check whether our 64 elements of size byte contain negatives 4087 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4088 kortestql(mask1, mask1); 4089 jcc(Assembler::notZero, BREAK_LOOP); 4090 4091 addptr(len, 64); 4092 jccb(Assembler::notZero, test_64_loop); 4093 4094 bind(test_tail); 4095 // bail out when there is nothing to be done 4096 testl(tmp1, -1); 4097 jcc(Assembler::zero, DONE); 4098 4099 4100 // check the tail for absense of negatives 4101 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4102 #ifdef _LP64 4103 { 4104 
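// Build a k-mask with 'tmp1' (the tail count) consecutive 1 bits:
// ~(~0 << tail) sets bits 0..tail-1, e.g. tail = 5 gives 0b11111, so
// the masked compare below only looks at the remaining tail bytes.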
Register tmp3_aliased = len; 4105 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4106 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4107 notq(tmp3_aliased); 4108 kmovql(mask2, tmp3_aliased); 4109 } 4110 #else 4111 Label k_init; 4112 jmp(k_init); 4113 4114 // We could not read 64-bits from a general purpose register thus we move 4115 // data required to compose 64 1's to the instruction stream 4116 // We emit 64 byte wide series of elements from 0..63 which later on would 4117 // be used as a compare targets with tail count contained in tmp1 register. 4118 // Result would be a k register having tmp1 consecutive number or 1 4119 // counting from least significant bit. 4120 address tmp = pc(); 4121 emit_int64(0x0706050403020100); 4122 emit_int64(0x0F0E0D0C0B0A0908); 4123 emit_int64(0x1716151413121110); 4124 emit_int64(0x1F1E1D1C1B1A1918); 4125 emit_int64(0x2726252423222120); 4126 emit_int64(0x2F2E2D2C2B2A2928); 4127 emit_int64(0x3736353433323130); 4128 emit_int64(0x3F3E3D3C3B3A3938); 4129 4130 bind(k_init); 4131 lea(len, InternalAddress(tmp)); 4132 // create mask to test for negative byte inside a vector 4133 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4134 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4135 4136 #endif 4137 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4138 ktestq(mask1, mask2); 4139 jcc(Assembler::zero, DONE); 4140 4141 // do a full check for negative registers in the tail 4142 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4143 // ary1 already pointing to the right place 4144 jmpb(TAIL_START); 4145 4146 bind(BREAK_LOOP); 4147 // At least one byte in the last 64 byte block was negative. 4148 // Set up to look at the last 64 bytes as if they were a tail 4149 lea(ary1, Address(ary1, len, Address::times_1)); 4150 addptr(result, len); 4151 // Ignore the very last byte: if all others are positive, 4152 // it must be negative, so we can skip right to the 2+1 byte 4153 // end comparison at this point 4154 orl(result, 63); 4155 movl(len, 63); 4156 // Fallthru to tail compare 4157 } else { 4158 4159 if (UseAVX >= 2 && UseSSE >= 2) { 4160 // With AVX2, use 32-byte vector compare 4161 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4162 4163 // Compare 32-byte vectors 4164 testl(len, 0xffffffe0); // vector count (in bytes) 4165 jccb(Assembler::zero, TAIL_START); 4166 4167 andl(len, 0xffffffe0); 4168 lea(ary1, Address(ary1, len, Address::times_1)); 4169 negptr(len); 4170 4171 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4172 movdl(vec2, tmp1); 4173 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4174 4175 bind(COMPARE_WIDE_VECTORS); 4176 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4177 vptest(vec1, vec2); 4178 jccb(Assembler::notZero, BREAK_LOOP); 4179 addptr(len, 32); 4180 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4181 4182 testl(result, 0x0000001f); // any bytes remaining? 4183 jcc(Assembler::zero, DONE); 4184 4185 // Quick test using the already prepared vector mask 4186 movl(len, result); 4187 andl(len, 0x0000001f); 4188 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4189 vptest(vec1, vec2); 4190 jcc(Assembler::zero, DONE); 4191 // There are zeros, jump to the tail to determine exactly where 4192 jmpb(TAIL_START); 4193 4194 bind(BREAK_LOOP); 4195 // At least one byte in the last 32-byte vector is negative. 
4196 // Set up to look at the last 32 bytes as if they were a tail 4197 lea(ary1, Address(ary1, len, Address::times_1)); 4198 addptr(result, len); 4199 // Ignore the very last byte: if all others are positive, 4200 // it must be negative, so we can skip right to the 2+1 byte 4201 // end comparison at this point 4202 orl(result, 31); 4203 movl(len, 31); 4204 // Fallthru to tail compare 4205 } else if (UseSSE42Intrinsics) { 4206 // With SSE4.2, use double quad vector compare 4207 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4208 4209 // Compare 16-byte vectors 4210 testl(len, 0xfffffff0); // vector count (in bytes) 4211 jcc(Assembler::zero, TAIL_START); 4212 4213 andl(len, 0xfffffff0); 4214 lea(ary1, Address(ary1, len, Address::times_1)); 4215 negptr(len); 4216 4217 movl(tmp1, 0x80808080); 4218 movdl(vec2, tmp1); 4219 pshufd(vec2, vec2, 0); 4220 4221 bind(COMPARE_WIDE_VECTORS); 4222 movdqu(vec1, Address(ary1, len, Address::times_1)); 4223 ptest(vec1, vec2); 4224 jccb(Assembler::notZero, BREAK_LOOP); 4225 addptr(len, 16); 4226 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4227 4228 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4229 jcc(Assembler::zero, DONE); 4230 4231 // Quick test using the already prepared vector mask 4232 movl(len, result); 4233 andl(len, 0x0000000f); // tail count (in bytes) 4234 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4235 ptest(vec1, vec2); 4236 jcc(Assembler::zero, DONE); 4237 jmpb(TAIL_START); 4238 4239 bind(BREAK_LOOP); 4240 // At least one byte in the last 16-byte vector is negative. 4241 // Set up and look at the last 16 bytes as if they were a tail 4242 lea(ary1, Address(ary1, len, Address::times_1)); 4243 addptr(result, len); 4244 // Ignore the very last byte: if all others are positive, 4245 // it must be negative, so we can skip right to the 2+1 byte 4246 // end comparison at this point 4247 orl(result, 15); 4248 movl(len, 15); 4249 // Fallthru to tail compare 4250 } 4251 } 4252 4253 bind(TAIL_START); 4254 // Compare 4-byte vectors 4255 andl(len, 0xfffffffc); // vector count (in bytes) 4256 jccb(Assembler::zero, COMPARE_CHAR); 4257 4258 lea(ary1, Address(ary1, len, Address::times_1)); 4259 negptr(len); 4260 4261 bind(COMPARE_VECTORS); 4262 movl(tmp1, Address(ary1, len, Address::times_1)); 4263 andl(tmp1, 0x80808080); 4264 jccb(Assembler::notZero, TAIL_ADJUST); 4265 addptr(len, 4); 4266 jccb(Assembler::notZero, COMPARE_VECTORS); 4267 4268 // Compare trailing char (final 2-3 bytes), if any 4269 bind(COMPARE_CHAR); 4270 4271 testl(result, 0x2); // tail char 4272 jccb(Assembler::zero, COMPARE_BYTE); 4273 load_unsigned_short(tmp1, Address(ary1, 0)); 4274 andl(tmp1, 0x00008080); 4275 jccb(Assembler::notZero, CHAR_ADJUST); 4276 lea(ary1, Address(ary1, 2)); 4277 4278 bind(COMPARE_BYTE); 4279 testl(result, 0x1); // tail byte 4280 jccb(Assembler::zero, DONE); 4281 load_unsigned_byte(tmp1, Address(ary1, 0)); 4282 testl(tmp1, 0x00000080); 4283 jccb(Assembler::zero, DONE); 4284 subptr(result, 1); 4285 jmpb(DONE); 4286 4287 bind(TAIL_ADJUST); 4288 // there are negative bits in the last 4 byte block. 4289 // Adjust result and check the next three bytes 4290 addptr(result, len); 4291 orl(result, 3); 4292 lea(ary1, Address(ary1, len, Address::times_1)); 4293 jmpb(COMPARE_CHAR); 4294 4295 bind(CHAR_ADJUST); 4296 // We are looking at a char + optional byte tail, and found that one 4297 // of the bytes in the char is negative. Adjust the result, check the 4298 // first byte and readjust if needed. 
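// Because x86 is little-endian, the low byte of the 16-bit load is the
// byte at the lower address, so testing 0x80 below tells whether the
// first or the second byte of the pair is the negative one.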
4299 andl(result, 0xfffffffc); 4300 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4301 jccb(Assembler::notZero, DONE); 4302 addptr(result, 1); 4303 4304 // That's it 4305 bind(DONE); 4306 if (UseAVX >= 2 && UseSSE >= 2) { 4307 // clean upper bits of YMM registers 4308 vpxor(vec1, vec1); 4309 vpxor(vec2, vec2); 4310 } 4311 } 4312 4313 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4314 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4315 Register limit, Register result, Register chr, 4316 XMMRegister vec1, XMMRegister vec2, bool is_char, 4317 KRegister mask, bool expand_ary2) { 4318 // for expand_ary2, limit is the (smaller) size of the second array. 4319 ShortBranchVerifier sbv(this); 4320 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4321 4322 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4323 "Expansion only implemented for AVX2"); 4324 4325 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4326 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4327 4328 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4329 int scaleIncr = expand_ary2 ? 8 : 16; 4330 4331 if (is_array_equ) { 4332 // Check the input args 4333 cmpoop(ary1, ary2); 4334 jcc(Assembler::equal, TRUE_LABEL); 4335 4336 // Need additional checks for arrays_equals. 4337 testptr(ary1, ary1); 4338 jcc(Assembler::zero, FALSE_LABEL); 4339 testptr(ary2, ary2); 4340 jcc(Assembler::zero, FALSE_LABEL); 4341 4342 // Check the lengths 4343 movl(limit, Address(ary1, length_offset)); 4344 cmpl(limit, Address(ary2, length_offset)); 4345 jcc(Assembler::notEqual, FALSE_LABEL); 4346 } 4347 4348 // count == 0 4349 testl(limit, limit); 4350 jcc(Assembler::zero, TRUE_LABEL); 4351 4352 if (is_array_equ) { 4353 // Load array address 4354 lea(ary1, Address(ary1, base_offset)); 4355 lea(ary2, Address(ary2, base_offset)); 4356 } 4357 4358 if (is_array_equ && is_char) { 4359 // arrays_equals when used for char[]. 
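// 'limit' was loaded from the array length field as an element count;
// the vector loops below work in bytes, so scale it by sizeof(jchar).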
4360 shll(limit, 1); // byte count != 0 4361 } 4362 movl(result, limit); // copy 4363 4364 if (UseAVX >= 2) { 4365 // With AVX2, use 32-byte vector compare 4366 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4367 4368 // Compare 32-byte vectors 4369 if (expand_ary2) { 4370 andl(result, 0x0000000f); // tail count (in bytes) 4371 andl(limit, 0xfffffff0); // vector count (in bytes) 4372 jcc(Assembler::zero, COMPARE_TAIL); 4373 } else { 4374 andl(result, 0x0000001f); // tail count (in bytes) 4375 andl(limit, 0xffffffe0); // vector count (in bytes) 4376 jcc(Assembler::zero, COMPARE_TAIL_16); 4377 } 4378 4379 lea(ary1, Address(ary1, limit, scaleFactor)); 4380 lea(ary2, Address(ary2, limit, Address::times_1)); 4381 negptr(limit); 4382 4383 #ifdef _LP64 4384 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4385 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4386 4387 cmpl(limit, -64); 4388 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4389 4390 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4391 4392 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4393 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4394 kortestql(mask, mask); 4395 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4396 addptr(limit, 64); // update since we already compared at this addr 4397 cmpl(limit, -64); 4398 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4399 4400 // At this point we may still need to compare -limit+result bytes. 4401 // We could execute the next two instruction and just continue via non-wide path: 4402 // cmpl(limit, 0); 4403 // jcc(Assembler::equal, COMPARE_TAIL); // true 4404 // But since we stopped at the points ary{1,2}+limit which are 4405 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4406 // (|limit| <= 32 and result < 32), 4407 // we may just compare the last 64 bytes. 
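// (Illustrative example, not generated code: with limit = 96 and a
//  4-byte tail, the loop above has verified bytes 0..63 and the single
//  overlapped 64-byte compare below re-checks bytes 36..99; the overlap
//  is redundant but harmless.)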
4408 // 4409 addptr(result, -64); // it is safe, bc we just came from this area 4410 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4411 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4412 kortestql(mask, mask); 4413 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4414 4415 jmp(TRUE_LABEL); 4416 4417 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4418 4419 }//if (VM_Version::supports_avx512vlbw()) 4420 #endif //_LP64 4421 bind(COMPARE_WIDE_VECTORS); 4422 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4423 if (expand_ary2) { 4424 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4425 } else { 4426 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4427 } 4428 vpxor(vec1, vec2); 4429 4430 vptest(vec1, vec1); 4431 jcc(Assembler::notZero, FALSE_LABEL); 4432 addptr(limit, scaleIncr * 2); 4433 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4434 4435 testl(result, result); 4436 jcc(Assembler::zero, TRUE_LABEL); 4437 4438 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4439 if (expand_ary2) { 4440 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4441 } else { 4442 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4443 } 4444 vpxor(vec1, vec2); 4445 4446 vptest(vec1, vec1); 4447 jcc(Assembler::notZero, FALSE_LABEL); 4448 jmp(TRUE_LABEL); 4449 4450 bind(COMPARE_TAIL_16); // limit is zero 4451 movl(limit, result); 4452 4453 // Compare 16-byte chunks 4454 andl(result, 0x0000000f); // tail count (in bytes) 4455 andl(limit, 0xfffffff0); // vector count (in bytes) 4456 jcc(Assembler::zero, COMPARE_TAIL); 4457 4458 lea(ary1, Address(ary1, limit, scaleFactor)); 4459 lea(ary2, Address(ary2, limit, Address::times_1)); 4460 negptr(limit); 4461 4462 bind(COMPARE_WIDE_VECTORS_16); 4463 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4464 if (expand_ary2) { 4465 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4466 } else { 4467 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4468 } 4469 pxor(vec1, vec2); 4470 4471 ptest(vec1, vec1); 4472 jcc(Assembler::notZero, FALSE_LABEL); 4473 addptr(limit, scaleIncr); 4474 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4475 4476 bind(COMPARE_TAIL); // limit is zero 4477 movl(limit, result); 4478 // Fallthru to tail compare 4479 } else if (UseSSE42Intrinsics) { 4480 // With SSE4.2, use double quad vector compare 4481 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4482 4483 // Compare 16-byte vectors 4484 andl(result, 0x0000000f); // tail count (in bytes) 4485 andl(limit, 0xfffffff0); // vector count (in bytes) 4486 jcc(Assembler::zero, COMPARE_TAIL); 4487 4488 lea(ary1, Address(ary1, limit, Address::times_1)); 4489 lea(ary2, Address(ary2, limit, Address::times_1)); 4490 negptr(limit); 4491 4492 bind(COMPARE_WIDE_VECTORS); 4493 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4494 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4495 pxor(vec1, vec2); 4496 4497 ptest(vec1, vec1); 4498 jcc(Assembler::notZero, FALSE_LABEL); 4499 addptr(limit, 16); 4500 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4501 4502 testl(result, result); 4503 jcc(Assembler::zero, TRUE_LABEL); 4504 4505 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4506 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4507 pxor(vec1, vec2); 4508 4509 ptest(vec1, vec1); 4510 jccb(Assembler::notZero, FALSE_LABEL); 4511 jmpb(TRUE_LABEL); 4512 4513 bind(COMPARE_TAIL); // limit is zero 4514 
movl(limit, result); 4515 // Fallthru to tail compare 4516 } 4517 4518 // Compare 4-byte vectors 4519 if (expand_ary2) { 4520 testl(result, result); 4521 jccb(Assembler::zero, TRUE_LABEL); 4522 } else { 4523 andl(limit, 0xfffffffc); // vector count (in bytes) 4524 jccb(Assembler::zero, COMPARE_CHAR); 4525 } 4526 4527 lea(ary1, Address(ary1, limit, scaleFactor)); 4528 lea(ary2, Address(ary2, limit, Address::times_1)); 4529 negptr(limit); 4530 4531 bind(COMPARE_VECTORS); 4532 if (expand_ary2) { 4533 // There are no "vector" operations for bytes to shorts 4534 movzbl(chr, Address(ary2, limit, Address::times_1)); 4535 cmpw(Address(ary1, limit, Address::times_2), chr); 4536 jccb(Assembler::notEqual, FALSE_LABEL); 4537 addptr(limit, 1); 4538 jcc(Assembler::notZero, COMPARE_VECTORS); 4539 jmp(TRUE_LABEL); 4540 } else { 4541 movl(chr, Address(ary1, limit, Address::times_1)); 4542 cmpl(chr, Address(ary2, limit, Address::times_1)); 4543 jccb(Assembler::notEqual, FALSE_LABEL); 4544 addptr(limit, 4); 4545 jcc(Assembler::notZero, COMPARE_VECTORS); 4546 } 4547 4548 // Compare trailing char (final 2 bytes), if any 4549 bind(COMPARE_CHAR); 4550 testl(result, 0x2); // tail char 4551 jccb(Assembler::zero, COMPARE_BYTE); 4552 load_unsigned_short(chr, Address(ary1, 0)); 4553 load_unsigned_short(limit, Address(ary2, 0)); 4554 cmpl(chr, limit); 4555 jccb(Assembler::notEqual, FALSE_LABEL); 4556 4557 if (is_array_equ && is_char) { 4558 bind(COMPARE_BYTE); 4559 } else { 4560 lea(ary1, Address(ary1, 2)); 4561 lea(ary2, Address(ary2, 2)); 4562 4563 bind(COMPARE_BYTE); 4564 testl(result, 0x1); // tail byte 4565 jccb(Assembler::zero, TRUE_LABEL); 4566 load_unsigned_byte(chr, Address(ary1, 0)); 4567 load_unsigned_byte(limit, Address(ary2, 0)); 4568 cmpl(chr, limit); 4569 jccb(Assembler::notEqual, FALSE_LABEL); 4570 } 4571 bind(TRUE_LABEL); 4572 movl(result, 1); // return true 4573 jmpb(DONE); 4574 4575 bind(FALSE_LABEL); 4576 xorl(result, result); // return false 4577 4578 // That's it 4579 bind(DONE); 4580 if (UseAVX >= 2) { 4581 // clean upper bits of YMM registers 4582 vpxor(vec1, vec1); 4583 vpxor(vec2, vec2); 4584 } 4585 } 4586 4587 #ifdef _LP64 4588 4589 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4590 #define __ masm. 
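// Out-of-line fix-up: the fast cvtt* conversion produced the "integer
// indefinite" value (0x80000000 / 0x8000000000000000), meaning the
// source was NaN or out of range. Pass the original XMM value to the
// fix-up stub via the stack and pop the corrected result into dst.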
4591 Register dst = stub.data<0>(); 4592 XMMRegister src = stub.data<1>(); 4593 address target = stub.data<2>(); 4594 __ bind(stub.entry()); 4595 __ subptr(rsp, 8); 4596 __ movdbl(Address(rsp), src); 4597 __ call(RuntimeAddress(target)); 4598 __ pop(dst); 4599 __ jmp(stub.continuation()); 4600 #undef __ 4601 } 4602 4603 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4604 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4605 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4606 4607 address slowpath_target; 4608 if (dst_bt == T_INT) { 4609 if (src_bt == T_FLOAT) { 4610 cvttss2sil(dst, src); 4611 cmpl(dst, 0x80000000); 4612 slowpath_target = StubRoutines::x86::f2i_fixup(); 4613 } else { 4614 cvttsd2sil(dst, src); 4615 cmpl(dst, 0x80000000); 4616 slowpath_target = StubRoutines::x86::d2i_fixup(); 4617 } 4618 } else { 4619 if (src_bt == T_FLOAT) { 4620 cvttss2siq(dst, src); 4621 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4622 slowpath_target = StubRoutines::x86::f2l_fixup(); 4623 } else { 4624 cvttsd2siq(dst, src); 4625 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4626 slowpath_target = StubRoutines::x86::d2l_fixup(); 4627 } 4628 } 4629 4630 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4631 jcc(Assembler::equal, stub->entry()); 4632 bind(stub->continuation()); 4633 } 4634 4635 #endif // _LP64 4636 4637 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4638 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4639 switch(ideal_opc) { 4640 case Op_LShiftVS: 4641 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4642 case Op_LShiftVI: 4643 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4644 case Op_LShiftVL: 4645 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4646 case Op_RShiftVS: 4647 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4648 case Op_RShiftVI: 4649 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4650 case Op_RShiftVL: 4651 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4652 case Op_URShiftVS: 4653 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4654 case Op_URShiftVI: 4655 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4656 case Op_URShiftVL: 4657 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4658 case Op_RotateRightV: 4659 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4660 case Op_RotateLeftV: 4661 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4662 default: 4663 fatal("Unsupported masked operation"); break; 4664 } 4665 } 4666 4667 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4668 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4669 bool is_varshift) { 4670 switch (ideal_opc) { 4671 case Op_AddVB: 4672 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4673 case Op_AddVS: 4674 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4675 case Op_AddVI: 4676 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4677 case Op_AddVL: 4678 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4679 case Op_AddVF: 4680 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4681 case Op_AddVD: 4682 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4683 case Op_SubVB: 4684 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4685 case Op_SubVS: 4686 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4687 case Op_SubVI: 4688 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4689 case Op_SubVL: 4690 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4691 case Op_SubVF: 4692 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4693 case Op_SubVD: 4694 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4695 case Op_MulVS: 4696 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4697 case Op_MulVI: 4698 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4699 case Op_MulVL: 4700 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4701 case Op_MulVF: 4702 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4703 case Op_MulVD: 4704 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4705 case Op_DivVF: 4706 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4707 case Op_DivVD: 4708 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4709 case Op_SqrtVF: 4710 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4711 case Op_SqrtVD: 4712 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4713 case Op_AbsVB: 4714 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4715 case Op_AbsVS: 4716 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4717 case Op_AbsVI: 4718 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4719 case Op_AbsVL: 4720 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4721 case Op_FmaVF: 4722 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4723 case Op_FmaVD: 4724 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4725 case Op_VectorRearrange: 4726 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4727 case Op_LShiftVS: 4728 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4729 case Op_LShiftVI: 4730 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4731 case Op_LShiftVL: 4732 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4733 case Op_RShiftVS: 4734 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4735 case Op_RShiftVI: 4736 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4737 case Op_RShiftVL: 4738 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4739 case Op_URShiftVS: 4740 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4741 case Op_URShiftVI: 4742 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4743 case Op_URShiftVL: 4744 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4745 case Op_RotateLeftV: 4746 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4747 case Op_RotateRightV: 4748 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_MaxV: 4750 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_MinV: 4752 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_XorV: 4754 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_OrV: 4756 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_AndV: 4758 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4759 default: 4760 fatal("Unsupported masked operation"); break; 4761 } 4762 } 4763 4764 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4765 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4766 switch (ideal_opc) { 4767 case Op_AddVB: 
4768 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_AddVS: 4770 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_AddVI: 4772 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_AddVL: 4774 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_AddVF: 4776 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_AddVD: 4778 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_SubVB: 4780 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4781 case Op_SubVS: 4782 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_SubVI: 4784 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_SubVL: 4786 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_SubVF: 4788 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_SubVD: 4790 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_MulVS: 4792 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_MulVI: 4794 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_MulVL: 4796 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_MulVF: 4798 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_MulVD: 4800 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_DivVF: 4802 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_DivVD: 4804 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_FmaVF: 4806 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_FmaVD: 4808 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_MaxV: 4810 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_MinV: 4812 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_XorV: 4814 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_OrV: 4816 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_AndV: 4818 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 default: 4820 fatal("Unsupported masked operation"); break; 4821 } 4822 } 4823 4824 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4825 KRegister src1, KRegister src2) { 4826 BasicType etype = T_ILLEGAL; 4827 switch(mask_len) { 4828 case 2: 4829 case 4: 4830 case 8: etype = T_BYTE; break; 4831 case 16: etype = T_SHORT; break; 4832 case 32: etype = T_INT; break; 4833 case 64: etype = T_LONG; break; 4834 default: fatal("Unsupported type"); break; 4835 } 4836 assert(etype != T_ILLEGAL, ""); 4837 switch(ideal_opc) { 4838 case Op_AndVMask: 4839 kand(etype, dst, src1, src2); break; 4840 case Op_OrVMask: 4841 kor(etype, dst, src1, src2); break; 4842 case Op_XorVMask: 4843 kxor(etype, dst, src1, src2); break; 4844 default: 4845 fatal("Unsupported masked operation"); break; 4846 } 4847 } 4848 4849 /* 4850 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4851 * If src is NaN, the result is 0. 4852 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4853 * the result is equal to the value of Integer.MIN_VALUE. 4854 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4855 * the result is equal to the value of Integer.MAX_VALUE. 
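 * Illustrative mapping (not an exhaustive list), assuming a float -> int lane cast:
 * 3.0e9f (greater than Integer.MAX_VALUE) becomes 0x7FFFFFFF, -3.0e9f becomes
 * 0x80000000, and Float.NaN becomes 0.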
4856 */ 4857 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4858 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4859 Register rscratch, AddressLiteral float_sign_flip, 4860 int vec_enc) { 4861 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4862 Label done; 4863 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4864 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4865 vptest(xtmp2, xtmp2, vec_enc); 4866 jccb(Assembler::equal, done); 4867 4868 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4869 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4870 4871 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4872 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4873 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4874 4875 // Recompute the mask for remaining special value. 4876 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4877 // Extract SRC values corresponding to TRUE mask lanes. 4878 vpand(xtmp4, xtmp2, src, vec_enc); 4879 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4880 // values are set. 4881 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4882 4883 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4884 bind(done); 4885 } 4886 4887 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4888 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4889 Register rscratch, AddressLiteral float_sign_flip, 4890 int vec_enc) { 4891 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4892 Label done; 4893 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4894 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4895 kortestwl(ktmp1, ktmp1); 4896 jccb(Assembler::equal, done); 4897 4898 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4899 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4900 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4901 4902 kxorwl(ktmp1, ktmp1, ktmp2); 4903 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4904 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4905 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4906 bind(done); 4907 } 4908 4909 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4910 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4911 Register rscratch, AddressLiteral double_sign_flip, 4912 int vec_enc) { 4913 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4914 4915 Label done; 4916 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4917 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4918 kortestwl(ktmp1, ktmp1); 4919 jccb(Assembler::equal, done); 4920 4921 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4922 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4923 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4924 4925 kxorwl(ktmp1, ktmp1, ktmp2); 4926 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4927 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4928 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4929 bind(done); 4930 } 4931 4932 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4933 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4934 Register rscratch, AddressLiteral float_sign_flip, 4935 int vec_enc) { 4936 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4937 Label done; 4938 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
rscratch);
4939 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4940 kortestwl(ktmp1, ktmp1);
4941 jccb(Assembler::equal, done);
4942
4943 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4944 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4945 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4946
4947 kxorwl(ktmp1, ktmp1, ktmp2);
4948 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4949 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4950 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4951 bind(done);
4952 }
4953
4954 /*
4955 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4956 * If src is NaN, the result is 0.
4957 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4958 * the result is equal to the value of Long.MIN_VALUE.
4959 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4960 * the result is equal to the value of Long.MAX_VALUE.
4961 */
4962 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4963 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4964 Register rscratch, AddressLiteral double_sign_flip,
4965 int vec_enc) {
4966 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4967
4968 Label done;
4969 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4970 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4971 kortestwl(ktmp1, ktmp1);
4972 jccb(Assembler::equal, done);
4973
4974 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4975 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4976 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4977
4978 kxorwl(ktmp1, ktmp1, ktmp2);
4979 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4980 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4981 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4982 bind(done);
4983 }
4984
4985 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4986 XMMRegister xtmp, int index, int vec_enc) {
4987 assert(vec_enc < Assembler::AVX_512bit, "");
4988 if (vec_enc == Assembler::AVX_256bit) {
4989 vextractf128_high(xtmp, src);
4990 vshufps(dst, src, xtmp, index, vec_enc);
4991 } else {
4992 vshufps(dst, src, zero, index, vec_enc);
4993 }
4994 }
4995
4996 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4997 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4998 AddressLiteral float_sign_flip, int src_vec_enc) {
4999 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5000
5001 Label done;
5002 // Compare the destination lanes with float_sign_flip
5003 // value to get mask for all special values.
5004 movdqu(xtmp1, float_sign_flip, rscratch);
5005 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5006 ptest(xtmp2, xtmp2);
5007 jccb(Assembler::equal, done);
5008
5009 // Flip float_sign_flip to get max integer value.
5010 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5011 pxor(xtmp1, xtmp4);
5012
5013 // Set destination lanes corresponding to unordered source lanes to zero.
5014 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5015 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5016
5017 // Shuffle mask vector and pack the lower double word from each quadword lane.
5018 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5019 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5020
5021 // Recompute the mask for remaining special value.
5022 pxor(xtmp2, xtmp3);
5023 // Extract mask corresponding to non-negative source lanes.
5024 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5025
5026 // Shuffle mask vector and pack the lower double word from each quadword lane.
5027 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5028 pand(xtmp3, xtmp2);
5029
5030 // Replace destination lanes holding special value (0x80000000) with max int
5031 // if corresponding source lane holds a +ve value.
5032 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5033 bind(done);
5034 }
5035
5036
5037 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5038 XMMRegister xtmp, Register rscratch, int vec_enc) {
5039 switch(to_elem_bt) {
5040 case T_SHORT:
5041 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5042 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5043 vpackusdw(dst, dst, zero, vec_enc);
5044 if (vec_enc == Assembler::AVX_256bit) {
5045 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5046 }
5047 break;
5048 case T_BYTE:
5049 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5050 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5051 vpackusdw(dst, dst, zero, vec_enc);
5052 if (vec_enc == Assembler::AVX_256bit) {
5053 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5054 }
5055 vpackuswb(dst, dst, zero, vec_enc);
5056 break;
5057 default: assert(false, "%s", type2name(to_elem_bt));
5058 }
5059 }
5060
5061 /*
5062 * Algorithm for vector D2L and F2I conversions:-
5063 * a) Perform vector D2L/F2I cast.
5064 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value.
5065 * It signifies that the source value could be any of the special floating point
5066 * values (NaN, -Inf, Inf, Max, -Min).
5067 * c) Set destination to zero if the source is a NaN value.
5068 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
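 * For example, the truncating converts used below return the 'integer indefinite'
 * value (0x80000000 for int lanes) for both NaN and out-of-range inputs; step c)
 * then zeroes the NaN lanes and step d) rewrites the remaining 0x80000000 lanes
 * that came from +ve inputs to MaxInt.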
5069 */ 5070 5071 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5072 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5073 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5074 int to_elem_sz = type2aelembytes(to_elem_bt); 5075 assert(to_elem_sz <= 4, ""); 5076 vcvttps2dq(dst, src, vec_enc); 5077 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5078 if (to_elem_sz < 4) { 5079 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5080 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5081 } 5082 } 5083 5084 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5085 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5086 Register rscratch, int vec_enc) { 5087 int to_elem_sz = type2aelembytes(to_elem_bt); 5088 assert(to_elem_sz <= 4, ""); 5089 vcvttps2dq(dst, src, vec_enc); 5090 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5091 switch(to_elem_bt) { 5092 case T_INT: 5093 break; 5094 case T_SHORT: 5095 evpmovdw(dst, dst, vec_enc); 5096 break; 5097 case T_BYTE: 5098 evpmovdb(dst, dst, vec_enc); 5099 break; 5100 default: assert(false, "%s", type2name(to_elem_bt)); 5101 } 5102 } 5103 5104 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5105 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5106 Register rscratch, int vec_enc) { 5107 evcvttps2qq(dst, src, vec_enc); 5108 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5109 } 5110 5111 // Handling for downcasting from double to integer or sub-word types on AVX2. 5112 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5113 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5114 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5115 int to_elem_sz = type2aelembytes(to_elem_bt); 5116 assert(to_elem_sz < 8, ""); 5117 vcvttpd2dq(dst, src, vec_enc); 5118 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5119 float_sign_flip, vec_enc); 5120 if (to_elem_sz < 4) { 5121 // xtmp4 holds all zero lanes. 
5122 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5123 } 5124 } 5125 5126 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5127 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5128 KRegister ktmp2, AddressLiteral sign_flip, 5129 Register rscratch, int vec_enc) { 5130 if (VM_Version::supports_avx512dq()) { 5131 evcvttpd2qq(dst, src, vec_enc); 5132 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5133 switch(to_elem_bt) { 5134 case T_LONG: 5135 break; 5136 case T_INT: 5137 evpmovsqd(dst, dst, vec_enc); 5138 break; 5139 case T_SHORT: 5140 evpmovsqd(dst, dst, vec_enc); 5141 evpmovdw(dst, dst, vec_enc); 5142 break; 5143 case T_BYTE: 5144 evpmovsqd(dst, dst, vec_enc); 5145 evpmovdb(dst, dst, vec_enc); 5146 break; 5147 default: assert(false, "%s", type2name(to_elem_bt)); 5148 } 5149 } else { 5150 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5151 vcvttpd2dq(dst, src, vec_enc); 5152 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5153 switch(to_elem_bt) { 5154 case T_INT: 5155 break; 5156 case T_SHORT: 5157 evpmovdw(dst, dst, vec_enc); 5158 break; 5159 case T_BYTE: 5160 evpmovdb(dst, dst, vec_enc); 5161 break; 5162 default: assert(false, "%s", type2name(to_elem_bt)); 5163 } 5164 } 5165 } 5166 5167 #ifdef _LP64 5168 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5169 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5170 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5171 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5172 // and re-instantiate original MXCSR.RC mode after that. 5173 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5174 5175 mov64(tmp, julong_cast(0.5L)); 5176 evpbroadcastq(xtmp1, tmp, vec_enc); 5177 vaddpd(xtmp1, src , xtmp1, vec_enc); 5178 evcvtpd2qq(dst, xtmp1, vec_enc); 5179 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5180 double_sign_flip, vec_enc);; 5181 5182 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5183 } 5184 5185 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5186 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5187 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5188 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5189 // and re-instantiate original MXCSR.RC mode after that. 
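// Illustrative values: with MXCSR.RC set to round-toward -inf, 2.5f + 0.5f = 3.0f
// converts to 3 and -2.5f + 0.5f = -2.0f converts to -2, matching Math.round's
// round-half-up behaviour.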
5190 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5191 5192 movl(tmp, jint_cast(0.5)); 5193 movq(xtmp1, tmp); 5194 vbroadcastss(xtmp1, xtmp1, vec_enc); 5195 vaddps(xtmp1, src , xtmp1, vec_enc); 5196 vcvtps2dq(dst, xtmp1, vec_enc); 5197 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5198 float_sign_flip, vec_enc); 5199 5200 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5201 } 5202 5203 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5204 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5205 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5206 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5207 // and re-instantiate original MXCSR.RC mode after that. 5208 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5209 5210 movl(tmp, jint_cast(0.5)); 5211 movq(xtmp1, tmp); 5212 vbroadcastss(xtmp1, xtmp1, vec_enc); 5213 vaddps(xtmp1, src , xtmp1, vec_enc); 5214 vcvtps2dq(dst, xtmp1, vec_enc); 5215 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5216 5217 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5218 } 5219 #endif // _LP64 5220 5221 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5222 BasicType from_elem_bt, BasicType to_elem_bt) { 5223 switch (from_elem_bt) { 5224 case T_BYTE: 5225 switch (to_elem_bt) { 5226 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5227 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5228 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5229 default: ShouldNotReachHere(); 5230 } 5231 break; 5232 case T_SHORT: 5233 switch (to_elem_bt) { 5234 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5235 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5236 default: ShouldNotReachHere(); 5237 } 5238 break; 5239 case T_INT: 5240 assert(to_elem_bt == T_LONG, ""); 5241 vpmovzxdq(dst, src, vlen_enc); 5242 break; 5243 default: 5244 ShouldNotReachHere(); 5245 } 5246 } 5247 5248 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5249 BasicType from_elem_bt, BasicType to_elem_bt) { 5250 switch (from_elem_bt) { 5251 case T_BYTE: 5252 switch (to_elem_bt) { 5253 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5254 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5255 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5256 default: ShouldNotReachHere(); 5257 } 5258 break; 5259 case T_SHORT: 5260 switch (to_elem_bt) { 5261 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5262 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5263 default: ShouldNotReachHere(); 5264 } 5265 break; 5266 case T_INT: 5267 assert(to_elem_bt == T_LONG, ""); 5268 vpmovsxdq(dst, src, vlen_enc); 5269 break; 5270 default: 5271 ShouldNotReachHere(); 5272 } 5273 } 5274 5275 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5276 BasicType dst_bt, BasicType src_bt, int vlen) { 5277 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5278 assert(vlen_enc != AVX_512bit, ""); 5279 5280 int dst_bt_size = type2aelembytes(dst_bt); 5281 int src_bt_size = type2aelembytes(src_bt); 5282 if (dst_bt_size > src_bt_size) { 5283 switch (dst_bt_size / src_bt_size) { 5284 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5285 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5286 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5287 default: ShouldNotReachHere(); 5288 } 5289 } else { 5290 assert(dst_bt_size < src_bt_size, ""); 5291 switch (src_bt_size / dst_bt_size) { 5292 case 2: { 5293 if (vlen_enc == AVX_128bit) { 5294 vpacksswb(dst, src, src, vlen_enc); 5295 } else { 5296 vpacksswb(dst, src, src, vlen_enc); 5297 vpermq(dst, dst, 0x08, vlen_enc); 5298 } 5299 break; 5300 } 5301 case 4: { 5302 if (vlen_enc == AVX_128bit) { 5303 vpackssdw(dst, src, src, vlen_enc); 5304 vpacksswb(dst, dst, dst, vlen_enc); 5305 } else { 5306 vpackssdw(dst, src, src, vlen_enc); 5307 vpermq(dst, dst, 0x08, vlen_enc); 5308 vpacksswb(dst, dst, dst, AVX_128bit); 5309 } 5310 break; 5311 } 5312 case 8: { 5313 if (vlen_enc == AVX_128bit) { 5314 vpshufd(dst, src, 0x08, vlen_enc); 5315 vpackssdw(dst, dst, dst, vlen_enc); 5316 vpacksswb(dst, dst, dst, vlen_enc); 5317 } else { 5318 vpshufd(dst, src, 0x08, vlen_enc); 5319 vpermq(dst, dst, 0x08, vlen_enc); 5320 vpackssdw(dst, dst, dst, AVX_128bit); 5321 vpacksswb(dst, dst, dst, AVX_128bit); 5322 } 5323 break; 5324 } 5325 default: ShouldNotReachHere(); 5326 } 5327 } 5328 } 5329 5330 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5331 bool merge, BasicType bt, int vlen_enc) { 5332 if (bt == T_INT) { 5333 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5334 } else { 5335 assert(bt == T_LONG, ""); 5336 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5337 } 5338 } 5339 5340 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5341 bool merge, BasicType bt, int vlen_enc) { 5342 if (bt == T_INT) { 5343 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5344 } else { 5345 assert(bt == T_LONG, ""); 5346 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5347 } 5348 } 5349 5350 #ifdef _LP64 5351 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5352 Register rtmp2, XMMRegister xtmp, int mask_len, 5353 int vec_enc) { 5354 int index = 0; 5355 int vindex = 0; 5356 mov64(rtmp1, 0x0101010101010101L); 5357 pdepq(rtmp1, src, rtmp1); 5358 if (mask_len > 8) { 5359 movq(rtmp2, src); 5360 vpxor(xtmp, xtmp, xtmp, vec_enc); 5361 movq(xtmp, rtmp1); 5362 } 5363 movq(dst, rtmp1); 5364 5365 mask_len -= 8; 5366 while (mask_len > 0) { 5367 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5368 index++; 5369 if ((index % 2) == 0) { 5370 pxor(xtmp, xtmp); 5371 } 5372 mov64(rtmp1, 0x0101010101010101L); 5373 shrq(rtmp2, 8); 5374 pdepq(rtmp1, rtmp2, rtmp1); 5375 pinsrq(xtmp, rtmp1, index % 2); 5376 vindex = index / 2; 5377 if (vindex) { 5378 // Write entire 16 byte vector when both 64 bit 5379 // lanes are update to save redundant instructions. 
5380 if (index % 2) { 5381 vinsertf128(dst, dst, xtmp, vindex); 5382 } 5383 } else { 5384 vmovdqu(dst, xtmp); 5385 } 5386 mask_len -= 8; 5387 } 5388 } 5389 5390 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5391 switch(opc) { 5392 case Op_VectorMaskTrueCount: 5393 popcntq(dst, tmp); 5394 break; 5395 case Op_VectorMaskLastTrue: 5396 if (VM_Version::supports_lzcnt()) { 5397 lzcntq(tmp, tmp); 5398 movl(dst, 63); 5399 subl(dst, tmp); 5400 } else { 5401 movl(dst, -1); 5402 bsrq(tmp, tmp); 5403 cmov32(Assembler::notZero, dst, tmp); 5404 } 5405 break; 5406 case Op_VectorMaskFirstTrue: 5407 if (VM_Version::supports_bmi1()) { 5408 if (masklen < 32) { 5409 orl(tmp, 1 << masklen); 5410 tzcntl(dst, tmp); 5411 } else if (masklen == 32) { 5412 tzcntl(dst, tmp); 5413 } else { 5414 assert(masklen == 64, ""); 5415 tzcntq(dst, tmp); 5416 } 5417 } else { 5418 if (masklen < 32) { 5419 orl(tmp, 1 << masklen); 5420 bsfl(dst, tmp); 5421 } else { 5422 assert(masklen == 32 || masklen == 64, ""); 5423 movl(dst, masklen); 5424 if (masklen == 32) { 5425 bsfl(tmp, tmp); 5426 } else { 5427 bsfq(tmp, tmp); 5428 } 5429 cmov32(Assembler::notZero, dst, tmp); 5430 } 5431 } 5432 break; 5433 case Op_VectorMaskToLong: 5434 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5435 break; 5436 default: assert(false, "Unhandled mask operation"); 5437 } 5438 } 5439 5440 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5441 int masklen, int masksize, int vec_enc) { 5442 assert(VM_Version::supports_popcnt(), ""); 5443 5444 if(VM_Version::supports_avx512bw()) { 5445 kmovql(tmp, mask); 5446 } else { 5447 assert(masklen <= 16, ""); 5448 kmovwl(tmp, mask); 5449 } 5450 5451 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5452 // operations needs to be clipped. 5453 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5454 andq(tmp, (1 << masklen) - 1); 5455 } 5456 5457 vector_mask_operation_helper(opc, dst, tmp, masklen); 5458 } 5459 5460 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5461 Register tmp, int masklen, BasicType bt, int vec_enc) { 5462 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5463 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5464 assert(VM_Version::supports_popcnt(), ""); 5465 5466 bool need_clip = false; 5467 switch(bt) { 5468 case T_BOOLEAN: 5469 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5470 vpxor(xtmp, xtmp, xtmp, vec_enc); 5471 vpsubb(xtmp, xtmp, mask, vec_enc); 5472 vpmovmskb(tmp, xtmp, vec_enc); 5473 need_clip = masklen < 16; 5474 break; 5475 case T_BYTE: 5476 vpmovmskb(tmp, mask, vec_enc); 5477 need_clip = masklen < 16; 5478 break; 5479 case T_SHORT: 5480 vpacksswb(xtmp, mask, mask, vec_enc); 5481 if (masklen >= 16) { 5482 vpermpd(xtmp, xtmp, 8, vec_enc); 5483 } 5484 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5485 need_clip = masklen < 16; 5486 break; 5487 case T_INT: 5488 case T_FLOAT: 5489 vmovmskps(tmp, mask, vec_enc); 5490 need_clip = masklen < 4; 5491 break; 5492 case T_LONG: 5493 case T_DOUBLE: 5494 vmovmskpd(tmp, mask, vec_enc); 5495 need_clip = masklen < 2; 5496 break; 5497 default: assert(false, "Unhandled type, %s", type2name(bt)); 5498 } 5499 5500 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5501 // operations needs to be clipped. 
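// For instance (illustrative case), a T_BYTE mask with masklen == 8 is read through
// vpmovmskb above as a full 16 bit mask, so the upper bits are stale and are cleared
// by the andq below.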
5502 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5503 // need_clip implies masklen < 32
5504 andq(tmp, (1 << masklen) - 1);
5505 }
5506
5507 vector_mask_operation_helper(opc, dst, tmp, masklen);
5508 }
5509
5510 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5511 Register rtmp2, int mask_len) {
5512 kmov(rtmp1, src);
5513 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5514 mov64(rtmp2, -1L);
5515 pextq(rtmp2, rtmp2, rtmp1);
5516 kmov(dst, rtmp2);
5517 }
5518
5519 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5520 XMMRegister mask, Register rtmp, Register rscratch,
5521 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5522 int vec_enc) {
5523 assert(type2aelembytes(bt) >= 4, "");
5524 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5525 address compress_perm_table = nullptr;
5526 address expand_perm_table = nullptr;
5527 if (type2aelembytes(bt) == 8) {
5528 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5529 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5530 vmovmskpd(rtmp, mask, vec_enc);
5531 } else {
5532 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5533 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5534 vmovmskps(rtmp, mask, vec_enc);
5535 }
5536 shlq(rtmp, 5); // for 32 byte permute row.
5537 if (opcode == Op_CompressV) {
5538 lea(rscratch, ExternalAddress(compress_perm_table));
5539 } else {
5540 lea(rscratch, ExternalAddress(expand_perm_table));
5541 }
5542 addptr(rtmp, rscratch);
5543 vmovdqu(permv, Address(rtmp));
5544 vpermps(dst, permv, src, Assembler::AVX_256bit);
5545 vpxor(xtmp, xtmp, xtmp, vec_enc);
5546 // Blend the result with the zero vector using the permute mask. Each column entry
5547 // in a permute table row contains either a valid permute index or a -1 (default)
5548 // value, so the row can also be used as a blending mask after
5549 // compressing/expanding the source vector lanes.
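// Illustrative row, assuming the layout described above: for an 8 x int CompressV with
// mask 0b00001101 the row would be {0, 2, 3, -1, -1, -1, -1, -1}, so lanes 0..2 receive
// source lanes 0, 2 and 3, and the sign bit of the -1 entries makes the blend below zero
// the remaining lanes.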
5550 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5551 } 5552 5553 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5554 bool merge, BasicType bt, int vec_enc) { 5555 if (opcode == Op_CompressV) { 5556 switch(bt) { 5557 case T_BYTE: 5558 evpcompressb(dst, mask, src, merge, vec_enc); 5559 break; 5560 case T_CHAR: 5561 case T_SHORT: 5562 evpcompressw(dst, mask, src, merge, vec_enc); 5563 break; 5564 case T_INT: 5565 evpcompressd(dst, mask, src, merge, vec_enc); 5566 break; 5567 case T_FLOAT: 5568 evcompressps(dst, mask, src, merge, vec_enc); 5569 break; 5570 case T_LONG: 5571 evpcompressq(dst, mask, src, merge, vec_enc); 5572 break; 5573 case T_DOUBLE: 5574 evcompresspd(dst, mask, src, merge, vec_enc); 5575 break; 5576 default: 5577 fatal("Unsupported type %s", type2name(bt)); 5578 break; 5579 } 5580 } else { 5581 assert(opcode == Op_ExpandV, ""); 5582 switch(bt) { 5583 case T_BYTE: 5584 evpexpandb(dst, mask, src, merge, vec_enc); 5585 break; 5586 case T_CHAR: 5587 case T_SHORT: 5588 evpexpandw(dst, mask, src, merge, vec_enc); 5589 break; 5590 case T_INT: 5591 evpexpandd(dst, mask, src, merge, vec_enc); 5592 break; 5593 case T_FLOAT: 5594 evexpandps(dst, mask, src, merge, vec_enc); 5595 break; 5596 case T_LONG: 5597 evpexpandq(dst, mask, src, merge, vec_enc); 5598 break; 5599 case T_DOUBLE: 5600 evexpandpd(dst, mask, src, merge, vec_enc); 5601 break; 5602 default: 5603 fatal("Unsupported type %s", type2name(bt)); 5604 break; 5605 } 5606 } 5607 } 5608 #endif 5609 5610 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5611 KRegister ktmp1, int vec_enc) { 5612 if (opcode == Op_SignumVD) { 5613 vsubpd(dst, zero, one, vec_enc); 5614 // if src < 0 ? -1 : 1 5615 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5616 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5617 // if src == NaN, -0.0 or 0.0 return src. 5618 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5619 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5620 } else { 5621 assert(opcode == Op_SignumVF, ""); 5622 vsubps(dst, zero, one, vec_enc); 5623 // if src < 0 ? -1 : 1 5624 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5625 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5626 // if src == NaN, -0.0 or 0.0 return src. 5627 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5628 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5629 } 5630 } 5631 5632 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5633 XMMRegister xtmp1, int vec_enc) { 5634 if (opcode == Op_SignumVD) { 5635 vsubpd(dst, zero, one, vec_enc); 5636 // if src < 0 ? -1 : 1 5637 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5638 // if src == NaN, -0.0 or 0.0 return src. 5639 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5640 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5641 } else { 5642 assert(opcode == Op_SignumVF, ""); 5643 vsubps(dst, zero, one, vec_enc); 5644 // if src < 0 ? -1 : 1 5645 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5646 // if src == NaN, -0.0 or 0.0 return src. 
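// For reference (Java Math.signum semantics): signum(-3.5f) == -1.0f, signum(-0.0f) == -0.0f
// and signum(NaN) is NaN; the EQ_UQ compare below selects exactly those NaN/zero lanes so
// that src is passed through for them.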
5647 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5648 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5649 }
5650 }
5651
5652 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5653 if (VM_Version::supports_avx512bw()) {
5654 if (mask_len > 32) {
5655 kmovql(dst, src);
5656 } else {
5657 kmovdl(dst, src);
5658 if (mask_len != 32) {
5659 kshiftrdl(dst, dst, 32 - mask_len);
5660 }
5661 }
5662 } else {
5663 assert(mask_len <= 16, "");
5664 kmovwl(dst, src);
5665 if (mask_len != 16) {
5666 kshiftrwl(dst, dst, 16 - mask_len);
5667 }
5668 }
5669 }
5670
5671 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5672 int lane_size = type2aelembytes(bt);
5673 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5674 if ((is_LP64 || lane_size < 8) &&
5675 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5676 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5677 movptr(rtmp, imm32);
5678 switch(lane_size) {
5679 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5680 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5681 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5682 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5683 default : fatal("Unsupported lane size %d", lane_size);
5684 break;
5685 }
5686 } else {
5687 movptr(rtmp, imm32);
5688 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5689 switch(lane_size) {
5690 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5691 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5692 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5693 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5694 default : fatal("Unsupported lane size %d", lane_size);
5695 break;
5696 }
5697 }
5698 }
5699
5700 //
5701 // Following is the lookup table based popcount computation algorithm:-
5702 // Index Bit set count
5703 // [ 0000 -> 0,
5704 // 0001 -> 1,
5705 // 0010 -> 1,
5706 // 0011 -> 2,
5707 // 0100 -> 1,
5708 // 0101 -> 2,
5709 // 0110 -> 2,
5710 // 0111 -> 3,
5711 // 1000 -> 1,
5712 // 1001 -> 2,
5713 // 1010 -> 2,
5714 // 1011 -> 3,
5715 // 1100 -> 2,
5716 // 1101 -> 3,
5717 // 1110 -> 3,
// 1111 -> 4 ]
5718 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5719 // shuffle indices for lookup table access.
5720 // b. Right shift each byte of vector lane by 4 positions.
5721 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5722 // shuffle indices for lookup table access.
5723 // d. Add the bitset count of upper and lower 4 bits of each byte.
5724 // e. Unpack double words to quad words and compute sum of absolute differences of bitset
5725 // counts of all the bytes of a quadword.
5726 // f. Perform step e. for upper 128bit vector lane.
5727 // g. Pack the bitset count of quadwords back to double word.
5728 // h. Unpacking and packing operations are not needed for 64bit vector lane.
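// Worked example for a single byte, 0xB5 = 0b10110101: the low nibble 0101 looks up 2,
// the high nibble 1011 looks up 3, and step d. adds them to give the byte's popcount of 5.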
5729 5730 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5731 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5732 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5733 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5734 vpsrlw(dst, src, 4, vec_enc); 5735 vpand(dst, dst, xtmp1, vec_enc); 5736 vpand(xtmp1, src, xtmp1, vec_enc); 5737 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5738 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5739 vpshufb(dst, xtmp2, dst, vec_enc); 5740 vpaddb(dst, dst, xtmp1, vec_enc); 5741 } 5742 5743 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5744 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5745 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5746 // Following code is as per steps e,f,g and h of above algorithm. 5747 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5748 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5749 vpsadbw(dst, dst, xtmp2, vec_enc); 5750 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5751 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5752 vpackuswb(dst, xtmp1, dst, vec_enc); 5753 } 5754 5755 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5756 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5757 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5758 // Add the popcount of upper and lower bytes of word. 5759 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5760 vpsrlw(dst, xtmp1, 8, vec_enc); 5761 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5762 vpaddw(dst, dst, xtmp1, vec_enc); 5763 } 5764 5765 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5766 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5767 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5768 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5769 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5770 } 5771 5772 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5773 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5774 switch(bt) { 5775 case T_LONG: 5776 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5777 break; 5778 case T_INT: 5779 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5780 break; 5781 case T_CHAR: 5782 case T_SHORT: 5783 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5784 break; 5785 case T_BYTE: 5786 case T_BOOLEAN: 5787 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5788 break; 5789 default: 5790 fatal("Unsupported type %s", type2name(bt)); 5791 break; 5792 } 5793 } 5794 5795 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5796 KRegister mask, bool merge, int vec_enc) { 5797 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5798 switch(bt) { 5799 case T_LONG: 5800 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5801 evpopcntq(dst, mask, src, merge, vec_enc); 5802 break; 5803 case T_INT: 5804 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5805 evpopcntd(dst, mask, src, merge, vec_enc); 5806 break; 5807 case T_CHAR: 5808 case T_SHORT: 5809 assert(VM_Version::supports_avx512_bitalg(), ""); 5810 evpopcntw(dst, mask, src, merge, vec_enc); 5811 break; 5812 case T_BYTE: 5813 case T_BOOLEAN: 5814 assert(VM_Version::supports_avx512_bitalg(), ""); 5815 evpopcntb(dst, mask, 
src, merge, vec_enc); 5816 break; 5817 default: 5818 fatal("Unsupported type %s", type2name(bt)); 5819 break; 5820 } 5821 } 5822 5823 #ifndef _LP64 5824 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5825 assert(VM_Version::supports_avx512bw(), ""); 5826 kmovdl(tmp, src); 5827 kunpckdql(dst, tmp, tmp); 5828 } 5829 #endif 5830 5831 // Bit reversal algorithm first reverses the bits of each byte followed by 5832 // a byte level reversal for multi-byte primitive types (short/int/long). 5833 // Algorithm performs a lookup table access to get reverse bit sequence 5834 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5835 // is obtained by swapping the reverse bit sequences of upper and lower 5836 // nibble of a byte. 5837 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5838 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5839 if (VM_Version::supports_avx512vlbw()) { 5840 5841 // Get the reverse bit sequence of lower nibble of each byte. 5842 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5843 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5844 evpandq(dst, xtmp2, src, vec_enc); 5845 vpshufb(dst, xtmp1, dst, vec_enc); 5846 vpsllq(dst, dst, 4, vec_enc); 5847 5848 // Get the reverse bit sequence of upper nibble of each byte. 5849 vpandn(xtmp2, xtmp2, src, vec_enc); 5850 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5851 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5852 5853 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5854 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5855 evporq(xtmp2, dst, xtmp2, vec_enc); 5856 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5857 5858 } else if(vec_enc == Assembler::AVX_512bit) { 5859 // Shift based bit reversal. 5860 assert(bt == T_LONG || bt == T_INT, ""); 5861 5862 // Swap lower and upper nibble of each byte. 5863 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5864 5865 // Swap two least and most significant bits of each nibble. 5866 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5867 5868 // Swap adjacent pair of bits. 5869 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5870 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5871 5872 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5873 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5874 } else { 5875 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5876 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5877 5878 // Get the reverse bit sequence of lower nibble of each byte. 5879 vpand(dst, xtmp2, src, vec_enc); 5880 vpshufb(dst, xtmp1, dst, vec_enc); 5881 vpsllq(dst, dst, 4, vec_enc); 5882 5883 // Get the reverse bit sequence of upper nibble of each byte. 5884 vpandn(xtmp2, xtmp2, src, vec_enc); 5885 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5886 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5887 5888 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5889 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
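// Illustration for one byte, 0b11010010: the low nibble 0010 reverses to 0100 and is shifted
// left by 4 (0100 0000), the high nibble 1101 reverses to 1011 (0000 1011), and the OR below
// yields 0b01001011, the bit-reversed byte.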
5890 vpor(xtmp2, dst, xtmp2, vec_enc); 5891 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5892 } 5893 } 5894 5895 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5896 XMMRegister xtmp, Register rscratch) { 5897 assert(VM_Version::supports_gfni(), ""); 5898 assert(rscratch != noreg || always_reachable(mask), "missing"); 5899 5900 // Galois field instruction based bit reversal based on following algorithm. 5901 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5902 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5903 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5904 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5905 } 5906 5907 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5908 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5909 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5910 evpandq(dst, xtmp1, src, vec_enc); 5911 vpsllq(dst, dst, nbits, vec_enc); 5912 vpandn(xtmp1, xtmp1, src, vec_enc); 5913 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5914 evporq(dst, dst, xtmp1, vec_enc); 5915 } 5916 5917 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5918 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5919 // Shift based bit reversal. 5920 assert(VM_Version::supports_evex(), ""); 5921 switch(bt) { 5922 case T_LONG: 5923 // Swap upper and lower double word of each quad word. 5924 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5925 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5926 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5927 break; 5928 case T_INT: 5929 // Swap upper and lower word of each double word. 5930 evprord(xtmp1, k0, src, 16, true, vec_enc); 5931 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5932 break; 5933 case T_CHAR: 5934 case T_SHORT: 5935 // Swap upper and lower byte of each word. 5936 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5937 break; 5938 case T_BYTE: 5939 evmovdquq(dst, k0, src, true, vec_enc); 5940 break; 5941 default: 5942 fatal("Unsupported type %s", type2name(bt)); 5943 break; 5944 } 5945 } 5946 5947 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5948 if (bt == T_BYTE) { 5949 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5950 evmovdquq(dst, k0, src, true, vec_enc); 5951 } else { 5952 vmovdqu(dst, src); 5953 } 5954 return; 5955 } 5956 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5957 // pre-computed shuffle indices. 
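// For example, for T_INT lanes the permutation mask maps bytes {0,1,2,3} of each element to
// {3,2,1,0}, so a lane holding 0x11223344 becomes 0x44332211.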
5958 switch(bt) { 5959 case T_LONG: 5960 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5961 break; 5962 case T_INT: 5963 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5964 break; 5965 case T_CHAR: 5966 case T_SHORT: 5967 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5968 break; 5969 default: 5970 fatal("Unsupported type %s", type2name(bt)); 5971 break; 5972 } 5973 vpshufb(dst, src, dst, vec_enc); 5974 } 5975 5976 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5977 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5978 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5979 assert(is_integral_type(bt), ""); 5980 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5981 assert(VM_Version::supports_avx512cd(), ""); 5982 switch(bt) { 5983 case T_LONG: 5984 evplzcntq(dst, ktmp, src, merge, vec_enc); 5985 break; 5986 case T_INT: 5987 evplzcntd(dst, ktmp, src, merge, vec_enc); 5988 break; 5989 case T_SHORT: 5990 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5991 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5992 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5993 vpunpckhwd(dst, xtmp1, src, vec_enc); 5994 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5995 vpackusdw(dst, xtmp2, dst, vec_enc); 5996 break; 5997 case T_BYTE: 5998 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5999 // accessing the lookup table. 6000 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6001 // accessing the lookup table. 6002 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6003 assert(VM_Version::supports_avx512bw(), ""); 6004 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6005 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6006 vpand(xtmp2, dst, src, vec_enc); 6007 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6008 vpsrlw(xtmp3, src, 4, vec_enc); 6009 vpand(xtmp3, dst, xtmp3, vec_enc); 6010 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6011 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6012 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6013 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6014 break; 6015 default: 6016 fatal("Unsupported type %s", type2name(bt)); 6017 break; 6018 } 6019 } 6020 6021 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6022 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6023 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6024 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6025 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6026 // accessing the lookup table. 6027 vpand(dst, xtmp2, src, vec_enc); 6028 vpshufb(dst, xtmp1, dst, vec_enc); 6029 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6030 // accessing the lookup table. 6031 vpsrlw(xtmp3, src, 4, vec_enc); 6032 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6033 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6034 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
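// For example, byte 0b00010110: the high nibble 0001 yields 3 from the lookup table and is
// non-zero, so the low nibble count is ignored and CLZ = 3; for byte 0b00000110 the zero high
// nibble contributes 4 and the low nibble 0110 contributes 1, giving CLZ = 5.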
6035 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6036 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6037 vpaddb(dst, dst, xtmp2, vec_enc); 6038 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6039 } 6040 6041 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6042 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6043 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6044 // Add zero counts of lower byte and upper byte of a word if 6045 // upper byte holds a zero value. 6046 vpsrlw(xtmp3, src, 8, vec_enc); 6047 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6048 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6049 vpsllw(xtmp2, dst, 8, vec_enc); 6050 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6051 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6052 vpsrlw(dst, dst, 8, vec_enc); 6053 } 6054 6055 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6056 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6057 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6058 // hence biased exponent can be used to compute leading zero count as per 6059 // following formula:- 6060 // LZCNT = 32 - (biased_exp - 127) 6061 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6062 6063 // Broadcast 0xFF 6064 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6065 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6066 6067 // Extract biased exponent. 6068 vcvtdq2ps(dst, src, vec_enc); 6069 vpsrld(dst, dst, 23, vec_enc); 6070 vpand(dst, dst, xtmp1, vec_enc); 6071 6072 // Broadcast 127. 6073 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6074 // Exponent = biased_exp - 127 6075 vpsubd(dst, dst, xtmp1, vec_enc); 6076 6077 // Exponent = Exponent + 1 6078 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6079 vpaddd(dst, dst, xtmp3, vec_enc); 6080 6081 // Replace -ve exponent with zero, exponent is -ve when src 6082 // lane contains a zero value. 6083 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6084 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6085 6086 // Rematerialize broadcast 32. 6087 vpslld(xtmp1, xtmp3, 5, vec_enc); 6088 // Exponent is 32 if corresponding source lane contains max_int value. 6089 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6090 // LZCNT = 32 - exponent 6091 vpsubd(dst, xtmp1, dst, vec_enc); 6092 6093 // Replace LZCNT with a value 1 if corresponding source lane 6094 // contains max_int value. 6095 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6096 6097 // Replace biased_exp with 0 if source lane value is less than zero. 6098 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6099 vblendvps(dst, dst, xtmp2, src, vec_enc); 6100 } 6101 6102 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6103 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6104 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6105 // Add zero counts of lower word and upper word of a double word if 6106 // upper word holds a zero value. 6107 vpsrld(xtmp3, src, 16, vec_enc); 6108 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6109 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6110 vpslld(xtmp2, dst, 16, vec_enc); 6111 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6112 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6113 vpsrld(dst, dst, 16, vec_enc); 6114 // Add zero counts of lower doubleword and upper doubleword of a 6115 // quadword if upper doubleword holds a zero value. 
6116 vpsrlq(xtmp3, src, 32, vec_enc); 6117 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6118 vpsllq(xtmp2, dst, 32, vec_enc); 6119 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6120 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6121 vpsrlq(dst, dst, 32, vec_enc); 6122 } 6123 6124 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6125 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6126 Register rtmp, int vec_enc) { 6127 assert(is_integral_type(bt), "unexpected type"); 6128 assert(vec_enc < Assembler::AVX_512bit, ""); 6129 switch(bt) { 6130 case T_LONG: 6131 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6132 break; 6133 case T_INT: 6134 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6135 break; 6136 case T_SHORT: 6137 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6138 break; 6139 case T_BYTE: 6140 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6141 break; 6142 default: 6143 fatal("Unsupported type %s", type2name(bt)); 6144 break; 6145 } 6146 } 6147 6148 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6149 switch(bt) { 6150 case T_BYTE: 6151 vpsubb(dst, src1, src2, vec_enc); 6152 break; 6153 case T_SHORT: 6154 vpsubw(dst, src1, src2, vec_enc); 6155 break; 6156 case T_INT: 6157 vpsubd(dst, src1, src2, vec_enc); 6158 break; 6159 case T_LONG: 6160 vpsubq(dst, src1, src2, vec_enc); 6161 break; 6162 default: 6163 fatal("Unsupported type %s", type2name(bt)); 6164 break; 6165 } 6166 } 6167 6168 // Trailing zero count computation is based on leading zero count operation as per 6169 // following equation. All AVX3 targets support AVX512CD feature which offers 6170 // direct vector instruction to compute leading zero count. 
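// As a concrete check of the equation below: for a 32 bit lane holding x = 88 (0b1011000),
// (x - 1) & ~x = 7 (0b111), CLZ(7) = 29 and 32 - 29 = 3, the number of trailing zeros in x.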
6171 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6172 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6173 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6174 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6175 assert(is_integral_type(bt), "");
6176 // xtmp = -1
6177 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6178 // xtmp = xtmp + src
6179 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6180 // xtmp = xtmp & ~src
6181 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6182 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6183 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6184 vpsub(bt, dst, xtmp4, dst, vec_enc);
6185 }
6186
6187 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation:
6188 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6189 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6190 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6191 assert(is_integral_type(bt), "");
6192 // xtmp = 0
6193 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6194 // xtmp = 0 - src
6195 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6196 // xtmp = xtmp | src
6197 vpor(xtmp3, xtmp3, src, vec_enc);
6198 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6199 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6200 vpsub(bt, dst, xtmp1, dst, vec_enc);
6201 }
6202
6203 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6204 Label done;
6205 Label neg_divisor_fastpath;
6206 cmpl(divisor, 0);
6207 jccb(Assembler::less, neg_divisor_fastpath);
6208 xorl(rdx, rdx);
6209 divl(divisor);
6210 jmpb(done);
6211 bind(neg_divisor_fastpath);
6212 // Fastpath for divisor < 0:
6213 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6214 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6215 movl(rdx, rax);
6216 subl(rdx, divisor);
6217 if (VM_Version::supports_bmi1()) {
6218 andnl(rax, rdx, rax);
6219 } else {
6220 notl(rdx);
6221 andl(rax, rdx);
6222 }
6223 shrl(rax, 31);
6224 bind(done);
6225 }
6226
6227 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6228 Label done;
6229 Label neg_divisor_fastpath;
6230 cmpl(divisor, 0);
6231 jccb(Assembler::less, neg_divisor_fastpath);
6232 xorl(rdx, rdx);
6233 divl(divisor);
6234 jmpb(done);
6235 bind(neg_divisor_fastpath);
6236 // Fastpath when divisor < 0:
6237 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6238 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6239 movl(rdx, rax);
6240 subl(rax, divisor);
6241 if (VM_Version::supports_bmi1()) {
6242 andnl(rax, rax, rdx);
6243 } else {
6244 notl(rax);
6245 andl(rax, rdx);
6246 }
6247 sarl(rax, 31);
6248 andl(rax, divisor);
6249 subl(rdx, rax);
6250 bind(done);
6251 }
6252
6253 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6254 Label done;
6255 Label neg_divisor_fastpath;
6256
6257 cmpl(divisor, 0);
6258 jccb(Assembler::less, neg_divisor_fastpath);
6259 xorl(rdx, rdx);
6260 divl(divisor);
6261 jmpb(done);
6262 bind(neg_divisor_fastpath);
6263 // Fastpath for divisor < 0:
6264 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6265 //
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois-field-instruction-based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

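// Illustrative scalar sketch of the mask-and-shift reversal used on the non-GFNI paths of
// reverseI above and reverseL below (a sketch only, not code used by this file):
//
//   static uint32_t reverse_bits32(uint32_t x) {
//     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);  // swap adjacent bits
//     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);  // swap 2-bit pairs within nibbles
//     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);  // swap nibbles within bytes
//     x = ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);  // swap bytes (BSWAP handles this above)
//     return (x << 16) | (x >> 16);                           // swap 16-bit halves (BSWAP as well)
//   }
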
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois-field-instruction-based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
  // normalized to the index range 0-15. As a result, indices that differ by a
  // multiple of 16 select the same relative position within a 128-bit lane,
  // e.g. shuffle indices 16, 32 and 48 all select the 0th element of their
  // respective 128-bit lanes.
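  // Illustrative decomposition of a 64-entry byte shuffle index into the lane to
  // broadcast and the in-lane position consumed by VPSHUFB (a sketch only, not code
  // used here):
  //
  //   static void split_shuffle_index(int idx /* 0..63 */, int& src_lane, int& in_lane_pos) {
  //     src_lane    = idx >> 4;    // which 128-bit lane of src holds the element (0..3)
  //     in_lane_pos = idx & 0x0F;  // position used by the in-lane byte shuffle (0..15)
  //   }
  //
  // The code below reconstructs src_lane with k-register range compares against the
  // 16/32/48/64 boundaries, broadcasts the matching 128-bit lane with evshufi64x2, and
  // lets evpshufb consume the low 4 index bits.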
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices, and move the shuffled lanes corresponding to true mask bits
  // to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}