/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
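    // The stub created below holds the cold call into the runtime; the code
    // emitted by nmethod_entry_barrier() here is only the fast guard check,
    // which branches to the stub's entry label when the barrier needs to be
    // taken and resumes at the continuation label afterwards.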
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
#else
  // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);                          // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                 // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
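//
// As in fast_lock, the result is communicated to the caller through ICC.ZF:
// ZFlag == 1 means the fast-path unlock succeeded, ZFlag == 0 forces the
// cmpFastUnlock node's continuation into the slow path.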

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
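    // With UseObjectMonitorTable the ObjectMonitor* is not stored in the mark
    // word, so the code below first probes the per-thread om_cache (two
    // unrolled compares followed by a loop that stops at the null sentinel)
    // and only takes the slow path on a cache miss.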
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
  Label& slow_path = stub == nullptr ? dummy : stub->slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
    const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, cxq_address);
    orptr(reg_rax, EntryList_address);
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(owner_address, NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
    xorl(t, t);
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
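      // There is no pminsq in SSE, so emulate the 64-bit min: build a mask in
      // xmm0 (set where dst > src) and let blendvpd, which uses xmm0 implicitly,
      // pick src in exactly those lanes.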
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

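// Note on the AVX2 Op_RShiftVL path above: with no variable 64-bit arithmetic
// shift available, it is rebuilt from logical shifts using the identity
// sra(x, s) == ((x >>> s) ^ m) - m, where m = sign_mask >>> s; the xor clears
// the shifted-down sign bit and the subtraction re-extends it for negative inputs.
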
1459 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1460 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1461 assert(opcode == Op_LShiftVB || 1462 opcode == Op_RShiftVB || 1463 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1464 bool sign = (opcode != Op_URShiftVB); 1465 assert(vector_len == 0, "required"); 1466 vextendbd(sign, dst, src, 1); 1467 vpmovzxbd(vtmp, shift, 1); 1468 varshiftd(opcode, dst, dst, vtmp, 1); 1469 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1470 vextracti128_high(vtmp, dst); 1471 vpackusdw(dst, dst, vtmp, 0); 1472 } 1473 1474 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1475 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1476 assert(opcode == Op_LShiftVB || 1477 opcode == Op_RShiftVB || 1478 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1479 bool sign = (opcode != Op_URShiftVB); 1480 int ext_vector_len = vector_len + 1; 1481 vextendbw(sign, dst, src, ext_vector_len); 1482 vpmovzxbw(vtmp, shift, ext_vector_len); 1483 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1484 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1485 if (vector_len == 0) { 1486 vextracti128_high(vtmp, dst); 1487 vpackuswb(dst, dst, vtmp, vector_len); 1488 } else { 1489 vextracti64x4_high(vtmp, dst); 1490 vpackuswb(dst, dst, vtmp, vector_len); 1491 vpermq(dst, dst, 0xD8, vector_len); 1492 } 1493 } 1494 1495 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1496 switch(typ) { 1497 case T_BYTE: 1498 pinsrb(dst, val, idx); 1499 break; 1500 case T_SHORT: 1501 pinsrw(dst, val, idx); 1502 break; 1503 case T_INT: 1504 pinsrd(dst, val, idx); 1505 break; 1506 case T_LONG: 1507 pinsrq(dst, val, idx); 1508 break; 1509 default: 1510 assert(false,"Should not reach here."); 1511 break; 1512 } 1513 } 1514 1515 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1516 switch(typ) { 1517 case T_BYTE: 1518 vpinsrb(dst, src, val, idx); 1519 break; 1520 case T_SHORT: 1521 vpinsrw(dst, src, val, idx); 1522 break; 1523 case T_INT: 1524 vpinsrd(dst, src, val, idx); 1525 break; 1526 case T_LONG: 1527 vpinsrq(dst, src, val, idx); 1528 break; 1529 default: 1530 assert(false,"Should not reach here."); 1531 break; 1532 } 1533 } 1534 1535 #ifdef _LP64 1536 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1537 XMMRegister dst, Register base, 1538 Register idx_base, 1539 Register offset, Register mask, 1540 Register mask_idx, Register rtmp, 1541 int vlen_enc) { 1542 vpxor(dst, dst, dst, vlen_enc); 1543 if (elem_bt == T_SHORT) { 1544 for (int i = 0; i < 4; i++) { 1545 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1546 Label skip_load; 1547 btq(mask, mask_idx); 1548 jccb(Assembler::carryClear, skip_load); 1549 movl(rtmp, Address(idx_base, i * 4)); 1550 if (offset != noreg) { 1551 addl(rtmp, offset); 1552 } 1553 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1554 bind(skip_load); 1555 incq(mask_idx); 1556 } 1557 } else { 1558 assert(elem_bt == T_BYTE, ""); 1559 for (int i = 0; i < 8; i++) { 1560 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1561 Label skip_load; 1562 btq(mask, mask_idx); 1563 jccb(Assembler::carryClear, skip_load); 1564 movl(rtmp, Address(idx_base, i * 4)); 1565 if (offset != noreg) { 1566 addl(rtmp, offset); 1567 } 1568 pinsrb(dst, Address(base, rtmp), i); 1569 bind(skip_load); 1570 incq(mask_idx); 1571 } 1572 } 1573 } 1574 #endif // _LP64 1575 1576 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1577 Register base, Register idx_base, 1578 Register offset, Register rtmp, 1579 int vlen_enc) { 1580 vpxor(dst, dst, dst, vlen_enc); 1581 if (elem_bt == T_SHORT) { 1582 for (int i = 0; i < 4; i++) { 1583 // dst[i] = src[offset + idx_base[i]] 1584 movl(rtmp, Address(idx_base, i * 4)); 1585 if (offset != noreg) { 1586 addl(rtmp, offset); 1587 } 1588 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1589 } 1590 } else { 1591 assert(elem_bt == T_BYTE, ""); 1592 for (int i = 0; i < 8; i++) { 1593 // dst[i] = src[offset + idx_base[i]] 1594 movl(rtmp, Address(idx_base, i * 4)); 1595 if (offset != noreg) { 1596 addl(rtmp, offset); 1597 } 1598 pinsrb(dst, Address(base, rtmp), i); 1599 } 1600 } 1601 } 1602 1603 /* 1604 * Gather using hybrid algorithm, first partially unroll scalar loop 1605 * to accumulate values from gather indices into a quad-word(64bit) slice. 1606 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1607 * permutation to place the slice into appropriate vector lane 1608 * locations in destination vector. Following pseudo code describes the 1609 * algorithm in detail: 1610 * 1611 * DST_VEC = ZERO_VEC 1612 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1613 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1614 * FOREACH_ITER: 1615 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1616 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1617 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1618 * PERM_INDEX = PERM_INDEX - TWO_VEC 1619 * 1620 * With each iteration, doubleword permute indices (0,1) corresponding 1621 * to gathered quadword gets right shifted by two lane positions. 1622 * 1623 */ 1624 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1625 Register base, Register idx_base, 1626 Register offset, Register mask, 1627 XMMRegister xtmp1, XMMRegister xtmp2, 1628 XMMRegister temp_dst, Register rtmp, 1629 Register mask_idx, Register length, 1630 int vector_len, int vlen_enc) { 1631 Label GATHER8_LOOP; 1632 assert(is_subword_type(elem_ty), ""); 1633 movl(length, vector_len); 1634 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1635 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1636 vallones(xtmp2, vlen_enc); 1637 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1638 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1639 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1640 1641 bind(GATHER8_LOOP); 1642 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1643 if (mask == noreg) { 1644 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1645 } else { 1646 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1647 } 1648 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1649 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1650 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1651 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1652 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1653 vpor(dst, dst, temp_dst, vlen_enc); 1654 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1655 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1656 jcc(Assembler::notEqual, GATHER8_LOOP); 1657 } 1658 1659 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1660 switch(typ) { 1661 case T_INT: 1662 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1663 break; 1664 case T_FLOAT: 1665 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1666 break; 1667 case T_LONG: 1668 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1669 break; 1670 case T_DOUBLE: 1671 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1672 break; 1673 default: 1674 assert(false,"Should not reach here."); 1675 break; 1676 } 1677 } 1678 1679 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1680 switch(typ) { 1681 case T_INT: 1682 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1683 break; 1684 case T_FLOAT: 1685 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1686 break; 1687 case T_LONG: 1688 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1689 break; 1690 case T_DOUBLE: 1691 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1692 break; 1693 default: 1694 assert(false,"Should not reach here."); 1695 break; 1696 } 1697 } 1698 1699 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1700 switch(typ) { 1701 case T_INT: 1702 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1703 break; 1704 case T_FLOAT: 1705 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1706 break; 1707 case T_LONG: 1708 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1709 break; 1710 case T_DOUBLE: 1711 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1712 break; 1713 default: 1714 assert(false,"Should not reach here."); 1715 break; 1716 } 1717 } 1718 1719 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1720 if (vlen_in_bytes <= 16) { 1721 pxor (dst, dst); 1722 psubb(dst, src); 1723 switch (elem_bt) { 1724 case T_BYTE: /* nothing to do */ break; 1725 case T_SHORT: pmovsxbw(dst, dst); break; 1726 case T_INT: pmovsxbd(dst, dst); break; 1727 case T_FLOAT: pmovsxbd(dst, dst); break; 1728 case T_LONG: pmovsxbq(dst, dst); break; 1729 case T_DOUBLE: pmovsxbq(dst, dst); break; 1730 1731 default: assert(false, "%s", type2name(elem_bt)); 1732 } 1733 } else { 1734 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1735 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1736 1737 vpxor (dst, dst, dst, vlen_enc); 1738 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1739 1740 switch (elem_bt) { 1741 case T_BYTE: /* nothing to do */ break; 1742 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1743 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1744 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1745 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1746 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1747 1748 default: assert(false, "%s", type2name(elem_bt)); 1749 } 1750 } 1751 } 1752 1753 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1754 if (novlbwdq) { 1755 vpmovsxbd(xtmp, src, vlen_enc); 1756 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1757 Assembler::eq, true, vlen_enc, noreg); 1758 } else { 1759 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1760 vpsubb(xtmp, xtmp, src, vlen_enc); 1761 evpmovb2m(dst, xtmp, vlen_enc); 1762 } 1763 } 1764 1765 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1766 switch (vlen_in_bytes) { 1767 case 4: movdl(dst, src); break; 1768 case 8: movq(dst, src); break; 1769 case 16: movdqu(dst, src); break; 1770 case 32: vmovdqu(dst, src); break; 1771 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1772 default: ShouldNotReachHere(); 1773 } 1774 } 1775 1776 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1777 assert(rscratch != noreg || always_reachable(src), "missing"); 1778 1779 if (reachable(src)) { 1780 load_vector(dst, as_Address(src), vlen_in_bytes); 1781 } else { 1782 lea(rscratch, src); 1783 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1784 } 1785 } 1786 1787 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1788 int vlen_enc = vector_length_encoding(vlen); 1789 if (VM_Version::supports_avx()) { 1790 if (bt == T_LONG) { 1791 if (VM_Version::supports_avx2()) { 1792 vpbroadcastq(dst, src, vlen_enc); 1793 } else { 1794 vmovddup(dst, src, vlen_enc); 1795 } 1796 } else if (bt == T_DOUBLE) { 1797 if (vlen_enc != Assembler::AVX_128bit) { 1798 vbroadcastsd(dst, src, vlen_enc, noreg); 1799 } else { 1800 vmovddup(dst, src, vlen_enc); 1801 } 1802 } else { 1803 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1804 vpbroadcastd(dst, src, vlen_enc); 1805 } else { 1806 vbroadcastss(dst, src, vlen_enc); 1807 } 1808 } 1809 } else if (VM_Version::supports_sse3()) { 1810 movddup(dst, src); 1811 } else { 1812 movq(dst, src); 1813 if (vlen == 16) { 1814 punpcklqdq(dst, dst); 1815 } 1816 } 1817 } 1818 1819 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1820 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1821 int offset = exact_log2(type2aelembytes(bt)) << 6; 1822 if (is_floating_point_type(bt)) { 1823 offset += 128; 1824 } 1825 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1826 load_vector(dst, addr, vlen_in_bytes); 1827 } 1828 1829 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
1830 1831 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1832 int vector_len = Assembler::AVX_128bit; 1833 1834 switch (opcode) { 1835 case Op_AndReductionV: pand(dst, src); break; 1836 case Op_OrReductionV: por (dst, src); break; 1837 case Op_XorReductionV: pxor(dst, src); break; 1838 case Op_MinReductionV: 1839 switch (typ) { 1840 case T_BYTE: pminsb(dst, src); break; 1841 case T_SHORT: pminsw(dst, src); break; 1842 case T_INT: pminsd(dst, src); break; 1843 case T_LONG: assert(UseAVX > 2, "required"); 1844 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1845 default: assert(false, "wrong type"); 1846 } 1847 break; 1848 case Op_MaxReductionV: 1849 switch (typ) { 1850 case T_BYTE: pmaxsb(dst, src); break; 1851 case T_SHORT: pmaxsw(dst, src); break; 1852 case T_INT: pmaxsd(dst, src); break; 1853 case T_LONG: assert(UseAVX > 2, "required"); 1854 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1855 default: assert(false, "wrong type"); 1856 } 1857 break; 1858 case Op_AddReductionVF: addss(dst, src); break; 1859 case Op_AddReductionVD: addsd(dst, src); break; 1860 case Op_AddReductionVI: 1861 switch (typ) { 1862 case T_BYTE: paddb(dst, src); break; 1863 case T_SHORT: paddw(dst, src); break; 1864 case T_INT: paddd(dst, src); break; 1865 default: assert(false, "wrong type"); 1866 } 1867 break; 1868 case Op_AddReductionVL: paddq(dst, src); break; 1869 case Op_MulReductionVF: mulss(dst, src); break; 1870 case Op_MulReductionVD: mulsd(dst, src); break; 1871 case Op_MulReductionVI: 1872 switch (typ) { 1873 case T_SHORT: pmullw(dst, src); break; 1874 case T_INT: pmulld(dst, src); break; 1875 default: assert(false, "wrong type"); 1876 } 1877 break; 1878 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1879 evpmullq(dst, dst, src, vector_len); break; 1880 default: assert(false, "wrong opcode"); 1881 } 1882 } 1883 1884 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1885 switch (opcode) { 1886 case Op_AddReductionVF: addps(dst, src); break; 1887 case Op_AddReductionVD: addpd(dst, src); break; 1888 case Op_MulReductionVF: mulps(dst, src); break; 1889 case Op_MulReductionVD: mulpd(dst, src); break; 1890 default: assert(false, "%s", NodeClassNames[opcode]); 1891 } 1892 } 1893 1894 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1895 int vector_len = Assembler::AVX_256bit; 1896 1897 switch (opcode) { 1898 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1899 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1900 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1901 case Op_MinReductionV: 1902 switch (typ) { 1903 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1904 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1905 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1906 case T_LONG: assert(UseAVX > 2, "required"); 1907 vpminsq(dst, src1, src2, vector_len); break; 1908 default: assert(false, "wrong type"); 1909 } 1910 break; 1911 case Op_MaxReductionV: 1912 switch (typ) { 1913 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1914 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1915 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1916 case T_LONG: assert(UseAVX > 2, "required"); 1917 vpmaxsq(dst, src1, src2, vector_len); break; 1918 default: assert(false, "wrong type"); 1919 } 
1920 break; 1921 case Op_AddReductionVI: 1922 switch (typ) { 1923 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1924 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1925 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1926 default: assert(false, "wrong type"); 1927 } 1928 break; 1929 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1930 case Op_MulReductionVI: 1931 switch (typ) { 1932 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1933 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1934 default: assert(false, "wrong type"); 1935 } 1936 break; 1937 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1938 default: assert(false, "wrong opcode"); 1939 } 1940 } 1941 1942 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1943 int vector_len = Assembler::AVX_256bit; 1944 1945 switch (opcode) { 1946 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1947 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1948 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1949 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1950 default: assert(false, "%s", NodeClassNames[opcode]); 1951 } 1952 } 1953 1954 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1955 XMMRegister dst, XMMRegister src, 1956 XMMRegister vtmp1, XMMRegister vtmp2) { 1957 switch (opcode) { 1958 case Op_AddReductionVF: 1959 case Op_MulReductionVF: 1960 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1961 break; 1962 1963 case Op_AddReductionVD: 1964 case Op_MulReductionVD: 1965 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1966 break; 1967 1968 default: assert(false, "wrong opcode"); 1969 } 1970 } 1971 1972 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1973 XMMRegister dst, XMMRegister src, 1974 XMMRegister vtmp1, XMMRegister vtmp2) { 1975 switch (opcode) { 1976 case Op_AddReductionVF: 1977 case Op_MulReductionVF: 1978 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1979 break; 1980 1981 case Op_AddReductionVD: 1982 case Op_MulReductionVD: 1983 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1984 break; 1985 1986 default: assert(false, "%s", NodeClassNames[opcode]); 1987 } 1988 } 1989 1990 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1991 Register dst, Register src1, XMMRegister src2, 1992 XMMRegister vtmp1, XMMRegister vtmp2) { 1993 switch (vlen) { 1994 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1995 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1996 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1997 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1998 1999 default: assert(false, "wrong vector length"); 2000 } 2001 } 2002 2003 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2004 Register dst, Register src1, XMMRegister src2, 2005 XMMRegister vtmp1, XMMRegister vtmp2) { 2006 switch (vlen) { 2007 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2008 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2009 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2010 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2011 2012 default: assert(false, "wrong vector length"); 2013 } 2014 } 2015 2016 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2017 Register dst, Register src1, XMMRegister src2, 
2018 XMMRegister vtmp1, XMMRegister vtmp2) { 2019 switch (vlen) { 2020 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2021 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2022 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2023 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2024 2025 default: assert(false, "wrong vector length"); 2026 } 2027 } 2028 2029 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2030 Register dst, Register src1, XMMRegister src2, 2031 XMMRegister vtmp1, XMMRegister vtmp2) { 2032 switch (vlen) { 2033 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2034 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2035 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2036 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2037 2038 default: assert(false, "wrong vector length"); 2039 } 2040 } 2041 2042 #ifdef _LP64 2043 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2044 Register dst, Register src1, XMMRegister src2, 2045 XMMRegister vtmp1, XMMRegister vtmp2) { 2046 switch (vlen) { 2047 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2048 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2049 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2050 2051 default: assert(false, "wrong vector length"); 2052 } 2053 } 2054 #endif // _LP64 2055 2056 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2057 switch (vlen) { 2058 case 2: 2059 assert(vtmp2 == xnoreg, ""); 2060 reduce2F(opcode, dst, src, vtmp1); 2061 break; 2062 case 4: 2063 assert(vtmp2 == xnoreg, ""); 2064 reduce4F(opcode, dst, src, vtmp1); 2065 break; 2066 case 8: 2067 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2068 break; 2069 case 16: 2070 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2071 break; 2072 default: assert(false, "wrong vector length"); 2073 } 2074 } 2075 2076 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2077 switch (vlen) { 2078 case 2: 2079 assert(vtmp2 == xnoreg, ""); 2080 reduce2D(opcode, dst, src, vtmp1); 2081 break; 2082 case 4: 2083 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2084 break; 2085 case 8: 2086 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2087 break; 2088 default: assert(false, "wrong vector length"); 2089 } 2090 } 2091 2092 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2093 switch (vlen) { 2094 case 2: 2095 assert(vtmp1 == xnoreg, ""); 2096 assert(vtmp2 == xnoreg, ""); 2097 unorderedReduce2F(opcode, dst, src); 2098 break; 2099 case 4: 2100 assert(vtmp2 == xnoreg, ""); 2101 unorderedReduce4F(opcode, dst, src, vtmp1); 2102 break; 2103 case 8: 2104 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2105 break; 2106 case 16: 2107 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2108 break; 2109 default: assert(false, "wrong vector length"); 2110 } 2111 } 2112 2113 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2114 switch (vlen) { 2115 case 2: 2116 assert(vtmp1 == xnoreg, ""); 2117 assert(vtmp2 == xnoreg, ""); 2118 unorderedReduce2D(opcode, dst, src); 2119 break; 2120 case 4: 2121 assert(vtmp2 == xnoreg, ""); 2122 unorderedReduce4D(opcode, dst, src, vtmp1); 2123 break; 2124 case 8: 
2125 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2126 break; 2127 default: assert(false, "wrong vector length"); 2128 } 2129 } 2130 2131 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 if (opcode == Op_AddReductionVI) { 2133 if (vtmp1 != src2) { 2134 movdqu(vtmp1, src2); 2135 } 2136 phaddd(vtmp1, vtmp1); 2137 } else { 2138 pshufd(vtmp1, src2, 0x1); 2139 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2140 } 2141 movdl(vtmp2, src1); 2142 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2143 movdl(dst, vtmp1); 2144 } 2145 2146 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2147 if (opcode == Op_AddReductionVI) { 2148 if (vtmp1 != src2) { 2149 movdqu(vtmp1, src2); 2150 } 2151 phaddd(vtmp1, src2); 2152 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2153 } else { 2154 pshufd(vtmp2, src2, 0xE); 2155 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2156 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2157 } 2158 } 2159 2160 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2161 if (opcode == Op_AddReductionVI) { 2162 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2163 vextracti128_high(vtmp2, vtmp1); 2164 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2165 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2166 } else { 2167 vextracti128_high(vtmp1, src2); 2168 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2169 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2170 } 2171 } 2172 2173 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2174 vextracti64x4_high(vtmp2, src2); 2175 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2176 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2177 } 2178 2179 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2180 pshufd(vtmp2, src2, 0x1); 2181 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2182 movdqu(vtmp1, vtmp2); 2183 psrldq(vtmp1, 2); 2184 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2185 movdqu(vtmp2, vtmp1); 2186 psrldq(vtmp2, 1); 2187 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2188 movdl(vtmp2, src1); 2189 pmovsxbd(vtmp1, vtmp1); 2190 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2191 pextrb(dst, vtmp1, 0x0); 2192 movsbl(dst, dst); 2193 } 2194 2195 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2196 pshufd(vtmp1, src2, 0xE); 2197 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2198 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2199 } 2200 2201 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2202 vextracti128_high(vtmp2, src2); 2203 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2204 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2205 } 2206 2207 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2208 vextracti64x4_high(vtmp1, src2); 2209 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2210 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2211 } 2212 2213 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2214 pmovsxbw(vtmp2, src2); 2215 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2216 } 2217 2218 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2219 if (UseAVX > 1) { 2220 int vector_len = Assembler::AVX_256bit; 2221 vpmovsxbw(vtmp1, src2, vector_len); 2222 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2223 } else { 2224 pmovsxbw(vtmp2, src2); 2225 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2226 pshufd(vtmp2, src2, 0x1); 2227 pmovsxbw(vtmp2, src2); 2228 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2229 } 2230 } 2231 2232 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2233 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2234 int vector_len = Assembler::AVX_512bit; 2235 vpmovsxbw(vtmp1, src2, vector_len); 2236 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2237 } else { 2238 assert(UseAVX >= 2,"Should not reach here."); 2239 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2240 vextracti128_high(vtmp2, src2); 2241 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2242 } 2243 } 2244 2245 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2246 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2247 vextracti64x4_high(vtmp2, src2); 2248 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2249 } 2250 2251 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 if (opcode == Op_AddReductionVI) { 2253 if (vtmp1 != src2) { 2254 movdqu(vtmp1, src2); 2255 } 2256 phaddw(vtmp1, vtmp1); 2257 phaddw(vtmp1, vtmp1); 2258 } else { 2259 pshufd(vtmp2, src2, 0x1); 2260 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2261 movdqu(vtmp1, vtmp2); 2262 psrldq(vtmp1, 2); 2263 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2264 } 2265 movdl(vtmp2, src1); 2266 pmovsxwd(vtmp1, vtmp1); 2267 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2268 pextrw(dst, vtmp1, 0x0); 2269 movswl(dst, dst); 2270 } 2271 2272 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2273 if (opcode == Op_AddReductionVI) { 2274 if (vtmp1 != src2) { 2275 movdqu(vtmp1, src2); 2276 } 2277 phaddw(vtmp1, src2); 2278 } else { 2279 pshufd(vtmp1, src2, 0xE); 2280 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2281 } 2282 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2283 } 2284 2285 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2286 if (opcode == Op_AddReductionVI) { 2287 int vector_len = Assembler::AVX_256bit; 2288 vphaddw(vtmp2, src2, src2, vector_len); 2289 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2290 } else { 2291 vextracti128_high(vtmp2, src2); 2292 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2293 } 2294 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2295 } 2296 2297 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2298 int vector_len = Assembler::AVX_256bit; 2299 vextracti64x4_high(vtmp1, src2); 2300 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2301 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2302 } 2303 2304 #ifdef _LP64 2305 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2306 pshufd(vtmp2, src2, 0xE); 2307 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2308 movdq(vtmp1, src1); 2309 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2310 movdq(dst, vtmp1); 2311 } 2312 2313 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2314 vextracti128_high(vtmp1, src2); 2315 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2316 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2317 } 2318 2319 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 vextracti64x4_high(vtmp2, src2); 2321 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2322 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2323 } 2324 2325 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2326 mov64(temp, -1L); 2327 bzhiq(temp, temp, len); 2328 kmovql(dst, temp); 2329 } 2330 #endif // _LP64 2331 2332 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2333 reduce_operation_128(T_FLOAT, opcode, dst, src); 2334 pshufd(vtmp, src, 0x1); 2335 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2336 } 2337 2338 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2339 reduce2F(opcode, dst, src, vtmp); 2340 pshufd(vtmp, src, 0x2); 2341 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2342 pshufd(vtmp, src, 0x3); 2343 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2344 } 2345 2346 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2347 reduce4F(opcode, dst, src, vtmp2); 2348 vextractf128_high(vtmp2, src); 2349 reduce4F(opcode, dst, vtmp2, vtmp1); 2350 } 2351 2352 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2353 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2354 vextracti64x4_high(vtmp1, src); 2355 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2356 } 2357 2358 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2359 pshufd(dst, src, 0x1); 2360 reduce_operation_128(T_FLOAT, opcode, dst, src); 2361 } 2362 2363 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2364 pshufd(vtmp, src, 0xE); 2365 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2366 unorderedReduce2F(opcode, dst, vtmp); 2367 } 2368 2369 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2370 vextractf128_high(vtmp1, src); 2371 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2372 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2373 } 2374 2375 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2376 vextractf64x4_high(vtmp2, src); 2377 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2378 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2379 } 2380 2381 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2382 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2383 pshufd(vtmp, src, 0xE); 2384 
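// 0xE selects dwords {2,3,0,0}, i.e. it moves the upper double of src into the low 64 bits of vtmp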
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2385 } 2386 2387 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2388 reduce2D(opcode, dst, src, vtmp2); 2389 vextractf128_high(vtmp2, src); 2390 reduce2D(opcode, dst, vtmp2, vtmp1); 2391 } 2392 2393 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2394 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2395 vextracti64x4_high(vtmp1, src); 2396 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2397 } 2398 2399 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2400 pshufd(dst, src, 0xE); 2401 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2402 } 2403 2404 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2405 vextractf128_high(vtmp, src); 2406 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2407 unorderedReduce2D(opcode, dst, vtmp); 2408 } 2409 2410 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2411 vextractf64x4_high(vtmp2, src); 2412 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2413 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2414 } 2415 2416 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2417 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2418 } 2419 2420 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2421 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2422 } 2423 2424 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2425 int vec_enc) { 2426 switch(elem_bt) { 2427 case T_INT: 2428 case T_FLOAT: 2429 vmaskmovps(dst, src, mask, vec_enc); 2430 break; 2431 case T_LONG: 2432 case T_DOUBLE: 2433 vmaskmovpd(dst, src, mask, vec_enc); 2434 break; 2435 default: 2436 fatal("Unsupported type %s", type2name(elem_bt)); 2437 break; 2438 } 2439 } 2440 2441 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2442 int vec_enc) { 2443 switch(elem_bt) { 2444 case T_INT: 2445 case T_FLOAT: 2446 vmaskmovps(dst, src, mask, vec_enc); 2447 break; 2448 case T_LONG: 2449 case T_DOUBLE: 2450 vmaskmovpd(dst, src, mask, vec_enc); 2451 break; 2452 default: 2453 fatal("Unsupported type %s", type2name(elem_bt)); 2454 break; 2455 } 2456 } 2457 2458 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2459 XMMRegister dst, XMMRegister src, 2460 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2461 XMMRegister xmm_0, XMMRegister xmm_1) { 2462 const int permconst[] = {1, 14}; 2463 XMMRegister wsrc = src; 2464 XMMRegister wdst = xmm_0; 2465 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2466 2467 int vlen_enc = Assembler::AVX_128bit; 2468 if (vlen == 16) { 2469 vlen_enc = Assembler::AVX_256bit; 2470 } 2471 2472 for (int i = log2(vlen) - 1; i >=0; i--) { 2473 if (i == 0 && !is_dst_valid) { 2474 wdst = dst; 2475 } 2476 if (i == 3) { 2477 vextracti64x4_high(wtmp, wsrc); 2478 } else if (i == 2) { 2479 vextracti128_high(wtmp, wsrc); 2480 } else { // i = [0,1] 2481 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2482 } 2483 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2484 wsrc = wdst; 2485 vlen_enc = Assembler::AVX_128bit; 2486 } 2487 if (is_dst_valid) { 2488 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2489 } 2490 } 2491 2492 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2493 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2494 XMMRegister xmm_0, XMMRegister xmm_1) { 2495 XMMRegister wsrc = src; 2496 XMMRegister wdst = xmm_0; 2497 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2498 int vlen_enc = Assembler::AVX_128bit; 2499 if (vlen == 8) { 2500 vlen_enc = Assembler::AVX_256bit; 2501 } 2502 for (int i = log2(vlen) - 1; i >=0; i--) { 2503 if (i == 0 && !is_dst_valid) { 2504 wdst = dst; 2505 } 2506 if (i == 1) { 2507 vextracti128_high(wtmp, wsrc); 2508 } else if (i == 2) { 2509 vextracti64x4_high(wtmp, wsrc); 2510 } else { 2511 assert(i == 0, "%d", i); 2512 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2513 } 2514 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2515 wsrc = wdst; 2516 vlen_enc = Assembler::AVX_128bit; 2517 } 2518 if (is_dst_valid) { 2519 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2520 } 2521 } 2522 2523 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2524 switch (bt) { 2525 case T_BYTE: pextrb(dst, src, idx); break; 2526 case T_SHORT: pextrw(dst, src, idx); break; 2527 case T_INT: pextrd(dst, src, idx); break; 2528 case T_LONG: pextrq(dst, src, idx); break; 2529 2530 default: 2531 assert(false,"Should not reach here."); 2532 break; 2533 } 2534 } 2535 2536 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2537 int esize = type2aelembytes(typ); 2538 int elem_per_lane = 16/esize; 2539 int lane = elemindex / elem_per_lane; 2540 int eindex = elemindex % elem_per_lane; 2541 2542 if (lane >= 2) { 2543 assert(UseAVX > 2, "required"); 2544 vextractf32x4(dst, src, lane & 3); 2545 return dst; 2546 } else if (lane > 0) { 2547 assert(UseAVX > 0, "required"); 2548 vextractf128(dst, src, lane); 2549 return dst; 2550 } else { 2551 return src; 2552 } 2553 } 2554 2555 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2556 if (typ == T_BYTE) { 2557 movsbl(dst, dst); 2558 } else if (typ == T_SHORT) { 2559 movswl(dst, dst); 2560 } 2561 } 2562 2563 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2564 int esize = type2aelembytes(typ); 2565 int elem_per_lane = 16/esize; 2566 int eindex = elemindex % elem_per_lane; 2567 assert(is_integral_type(typ),"required"); 2568 2569 if (eindex == 0) { 2570 if (typ == T_LONG) { 2571 movq(dst, src); 2572 } else { 2573 movdl(dst, src); 2574 movsxl(typ, dst); 2575 } 2576 } else { 2577 extract(typ, dst, src, eindex); 2578 movsxl(typ, dst); 2579 } 2580 } 2581 2582 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
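// Floating-point flavor of get_elem: move element 'elemindex' of the 128-bit lane held in
// src into element 0 of dst (shufps for floats, a byte shift for doubles) and clear the
// upper bits so dst can be used as a scalar. vtmp is only needed on pre-AVX hardware,
// where the 32-bit clearing mask has to be materialized in a register first.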
2583 int esize = type2aelembytes(typ); 2584 int elem_per_lane = 16/esize; 2585 int eindex = elemindex % elem_per_lane; 2586 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2587 2588 if (eindex == 0) { 2589 movq(dst, src); 2590 } else { 2591 if (typ == T_FLOAT) { 2592 if (UseAVX == 0) { 2593 movdqu(dst, src); 2594 shufps(dst, dst, eindex); 2595 } else { 2596 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2597 } 2598 } else { 2599 if (UseAVX == 0) { 2600 movdqu(dst, src); 2601 psrldq(dst, eindex*esize); 2602 } else { 2603 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2604 } 2605 movq(dst, dst); 2606 } 2607 } 2608 // Zero upper bits 2609 if (typ == T_FLOAT) { 2610 if (UseAVX == 0) { 2611 assert(vtmp != xnoreg, "required."); 2612 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2613 pand(dst, vtmp); 2614 } else { 2615 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2616 } 2617 } 2618 } 2619 2620 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2621 switch(typ) { 2622 case T_BYTE: 2623 case T_BOOLEAN: 2624 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2625 break; 2626 case T_SHORT: 2627 case T_CHAR: 2628 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2629 break; 2630 case T_INT: 2631 case T_FLOAT: 2632 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2633 break; 2634 case T_LONG: 2635 case T_DOUBLE: 2636 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2637 break; 2638 default: 2639 assert(false,"Should not reach here."); 2640 break; 2641 } 2642 } 2643 2644 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2645 assert(rscratch != noreg || always_reachable(src2), "missing"); 2646 2647 switch(typ) { 2648 case T_BOOLEAN: 2649 case T_BYTE: 2650 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2651 break; 2652 case T_CHAR: 2653 case T_SHORT: 2654 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2655 break; 2656 case T_INT: 2657 case T_FLOAT: 2658 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2659 break; 2660 case T_LONG: 2661 case T_DOUBLE: 2662 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2663 break; 2664 default: 2665 assert(false,"Should not reach here."); 2666 break; 2667 } 2668 } 2669 2670 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2671 switch(typ) { 2672 case T_BYTE: 2673 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2674 break; 2675 case T_SHORT: 2676 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2677 break; 2678 case T_INT: 2679 case T_FLOAT: 2680 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2681 break; 2682 case T_LONG: 2683 case T_DOUBLE: 2684 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2685 break; 2686 default: 2687 assert(false,"Should not reach here."); 2688 break; 2689 } 2690 } 2691 2692 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2693 
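// Flag-setting AND-test of src1 against src2: ZF is set when (src1 & src2) is all zero and
// CF when (~src1 & src2) is all zero (ptest semantics). vtestps applies the same test to the
// per-element sign bits only, which is sufficient when the operands are all-ones/all-zeros
// lane masks, hence its use once elements are at least 4 bytes wide and AVX is available.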
assert(vlen_in_bytes <= 32, ""); 2694 int esize = type2aelembytes(bt); 2695 if (vlen_in_bytes == 32) { 2696 assert(vtmp == xnoreg, "required."); 2697 if (esize >= 4) { 2698 vtestps(src1, src2, AVX_256bit); 2699 } else { 2700 vptest(src1, src2, AVX_256bit); 2701 } 2702 return; 2703 } 2704 if (vlen_in_bytes < 16) { 2705 // Duplicate the lower part to fill the whole register, 2706 // Don't need to do so for src2 2707 assert(vtmp != xnoreg, "required"); 2708 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2709 pshufd(vtmp, src1, shuffle_imm); 2710 } else { 2711 assert(vtmp == xnoreg, "required"); 2712 vtmp = src1; 2713 } 2714 if (esize >= 4 && VM_Version::supports_avx()) { 2715 vtestps(vtmp, src2, AVX_128bit); 2716 } else { 2717 ptest(vtmp, src2); 2718 } 2719 } 2720 2721 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2722 assert(UseAVX >= 2, "required"); 2723 #ifdef ASSERT 2724 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2725 bool is_bw_supported = VM_Version::supports_avx512bw(); 2726 if (is_bw && !is_bw_supported) { 2727 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2728 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2729 "XMM register should be 0-15"); 2730 } 2731 #endif // ASSERT 2732 switch (elem_bt) { 2733 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2734 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2735 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2736 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2737 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2738 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2739 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2740 } 2741 } 2742 2743 #ifdef _LP64 2744 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2745 assert(UseAVX >= 2, "required"); 2746 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2747 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2748 if ((UseAVX > 2) && 2749 (!is_bw || VM_Version::supports_avx512bw()) && 2750 (!is_vl || VM_Version::supports_avx512vl())) { 2751 switch (elem_bt) { 2752 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2753 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2754 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2755 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2756 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2757 } 2758 } else { 2759 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2760 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2761 switch (elem_bt) { 2762 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2763 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2764 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2765 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2766 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2767 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2768 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2769 } 2770 } 2771 } 2772 #endif 2773 2774 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2775 switch (to_elem_bt) { 2776 case T_SHORT: 2777 vpmovsxbw(dst, src, vlen_enc); 2778 
break; 2779 case T_INT: 2780 vpmovsxbd(dst, src, vlen_enc); 2781 break; 2782 case T_FLOAT: 2783 vpmovsxbd(dst, src, vlen_enc); 2784 vcvtdq2ps(dst, dst, vlen_enc); 2785 break; 2786 case T_LONG: 2787 vpmovsxbq(dst, src, vlen_enc); 2788 break; 2789 case T_DOUBLE: { 2790 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2791 vpmovsxbd(dst, src, mid_vlen_enc); 2792 vcvtdq2pd(dst, dst, vlen_enc); 2793 break; 2794 } 2795 default: 2796 fatal("Unsupported type %s", type2name(to_elem_bt)); 2797 break; 2798 } 2799 } 2800 2801 //------------------------------------------------------------------------------------------- 2802 2803 // IndexOf for constant substrings with size >= 8 chars 2804 // which don't need to be loaded through stack. 2805 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2806 Register cnt1, Register cnt2, 2807 int int_cnt2, Register result, 2808 XMMRegister vec, Register tmp, 2809 int ae) { 2810 ShortBranchVerifier sbv(this); 2811 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2812 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2813 2814 // This method uses the pcmpestri instruction with bound registers 2815 // inputs: 2816 // xmm - substring 2817 // rax - substring length (elements count) 2818 // mem - scanned string 2819 // rdx - string length (elements count) 2820 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2821 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2822 // outputs: 2823 // rcx - matched index in string 2824 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2825 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2826 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2827 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2828 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2829 2830 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2831 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2832 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2833 2834 // Note, inline_string_indexOf() generates checks: 2835 // if (substr.count > string.count) return -1; 2836 // if (substr.count == 0) return 0; 2837 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2838 2839 // Load substring. 2840 if (ae == StrIntrinsicNode::UL) { 2841 pmovzxbw(vec, Address(str2, 0)); 2842 } else { 2843 movdqu(vec, Address(str2, 0)); 2844 } 2845 movl(cnt2, int_cnt2); 2846 movptr(result, str1); // string addr 2847 2848 if (int_cnt2 > stride) { 2849 jmpb(SCAN_TO_SUBSTR); 2850 2851 // Reload substr for rescan, this code 2852 // is executed only for large substrings (> 8 chars) 2853 bind(RELOAD_SUBSTR); 2854 if (ae == StrIntrinsicNode::UL) { 2855 pmovzxbw(vec, Address(str2, 0)); 2856 } else { 2857 movdqu(vec, Address(str2, 0)); 2858 } 2859 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2860 2861 bind(RELOAD_STR); 2862 // We came here after the beginning of the substring was 2863 // matched but the rest of it was not so we need to search 2864 // again. Start from the next element after the previous match. 2865 2866 // cnt2 is number of substring reminding elements and 2867 // cnt1 is number of string reminding elements when cmp failed. 
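// For illustration: with int_cnt2 == 12, a mismatch that left 5 substring elements and
// 20 string elements still to compare means the string had 20 - 5 + 12 = 27 elements
// remaining at the position where this candidate match started.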
2868 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2869 subl(cnt1, cnt2); 2870 addl(cnt1, int_cnt2); 2871 movl(cnt2, int_cnt2); // Now restore cnt2 2872 2873 decrementl(cnt1); // Shift to next element 2874 cmpl(cnt1, cnt2); 2875 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2876 2877 addptr(result, (1<<scale1)); 2878 2879 } // (int_cnt2 > 8) 2880 2881 // Scan string for start of substr in 16-byte vectors 2882 bind(SCAN_TO_SUBSTR); 2883 pcmpestri(vec, Address(result, 0), mode); 2884 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2885 subl(cnt1, stride); 2886 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2887 cmpl(cnt1, cnt2); 2888 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2889 addptr(result, 16); 2890 jmpb(SCAN_TO_SUBSTR); 2891 2892 // Found a potential substr 2893 bind(FOUND_CANDIDATE); 2894 // Matched whole vector if first element matched (tmp(rcx) == 0). 2895 if (int_cnt2 == stride) { 2896 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2897 } else { // int_cnt2 > 8 2898 jccb(Assembler::overflow, FOUND_SUBSTR); 2899 } 2900 // After pcmpestri tmp(rcx) contains matched element index 2901 // Compute start addr of substr 2902 lea(result, Address(result, tmp, scale1)); 2903 2904 // Make sure string is still long enough 2905 subl(cnt1, tmp); 2906 cmpl(cnt1, cnt2); 2907 if (int_cnt2 == stride) { 2908 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2909 } else { // int_cnt2 > 8 2910 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2911 } 2912 // Left less then substring. 2913 2914 bind(RET_NOT_FOUND); 2915 movl(result, -1); 2916 jmp(EXIT); 2917 2918 if (int_cnt2 > stride) { 2919 // This code is optimized for the case when whole substring 2920 // is matched if its head is matched. 2921 bind(MATCH_SUBSTR_HEAD); 2922 pcmpestri(vec, Address(result, 0), mode); 2923 // Reload only string if does not match 2924 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2925 2926 Label CONT_SCAN_SUBSTR; 2927 // Compare the rest of substring (> 8 chars). 2928 bind(FOUND_SUBSTR); 2929 // First 8 chars are already matched. 
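// cnt2 is turned into the negative offset (stride - substring length), so indexing the
// substring as str2[int_cnt2 + cnt2] starts right after the stride chars that already
// matched; the loop below then advances cnt2 by stride toward zero.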
2930 negptr(cnt2); 2931 addptr(cnt2, stride); 2932 2933 bind(SCAN_SUBSTR); 2934 subl(cnt1, stride); 2935 cmpl(cnt2, -stride); // Do not read beyond substring 2936 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2937 // Back-up strings to avoid reading beyond substring: 2938 // cnt1 = cnt1 - cnt2 + 8 2939 addl(cnt1, cnt2); // cnt2 is negative 2940 addl(cnt1, stride); 2941 movl(cnt2, stride); negptr(cnt2); 2942 bind(CONT_SCAN_SUBSTR); 2943 if (int_cnt2 < (int)G) { 2944 int tail_off1 = int_cnt2<<scale1; 2945 int tail_off2 = int_cnt2<<scale2; 2946 if (ae == StrIntrinsicNode::UL) { 2947 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2948 } else { 2949 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2950 } 2951 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2952 } else { 2953 // calculate index in register to avoid integer overflow (int_cnt2*2) 2954 movl(tmp, int_cnt2); 2955 addptr(tmp, cnt2); 2956 if (ae == StrIntrinsicNode::UL) { 2957 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2958 } else { 2959 movdqu(vec, Address(str2, tmp, scale2, 0)); 2960 } 2961 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2962 } 2963 // Need to reload strings pointers if not matched whole vector 2964 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2965 addptr(cnt2, stride); 2966 jcc(Assembler::negative, SCAN_SUBSTR); 2967 // Fall through if found full substring 2968 2969 } // (int_cnt2 > 8) 2970 2971 bind(RET_FOUND); 2972 // Found result if we matched full small substring. 2973 // Compute substr offset 2974 subptr(result, str1); 2975 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2976 shrl(result, 1); // index 2977 } 2978 bind(EXIT); 2979 2980 } // string_indexofC8 2981 2982 // Small strings are loaded through stack if they cross page boundary. 2983 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2984 Register cnt1, Register cnt2, 2985 int int_cnt2, Register result, 2986 XMMRegister vec, Register tmp, 2987 int ae) { 2988 ShortBranchVerifier sbv(this); 2989 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2990 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2991 2992 // 2993 // int_cnt2 is length of small (< 8 chars) constant substring 2994 // or (-1) for non constant substring in which case its length 2995 // is in cnt2 register. 2996 // 2997 // Note, inline_string_indexOf() generates checks: 2998 // if (substr.count > string.count) return -1; 2999 // if (substr.count == 0) return 0; 3000 // 3001 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3002 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3003 // This method uses the pcmpestri instruction with bound registers 3004 // inputs: 3005 // xmm - substring 3006 // rax - substring length (elements count) 3007 // mem - scanned string 3008 // rdx - string length (elements count) 3009 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3010 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3011 // outputs: 3012 // rcx - matched index in string 3013 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3014 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3015 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3016 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3017 3018 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3019 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3020 FOUND_CANDIDATE; 3021 3022 { //======================================================== 3023 // We don't know where these strings are located 3024 // and we can't read beyond them. Load them through stack. 3025 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3026 3027 movptr(tmp, rsp); // save old SP 3028 3029 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3030 if (int_cnt2 == (1>>scale2)) { // One byte 3031 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3032 load_unsigned_byte(result, Address(str2, 0)); 3033 movdl(vec, result); // move 32 bits 3034 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3035 // Not enough header space in 32-bit VM: 12+3 = 15. 3036 movl(result, Address(str2, -1)); 3037 shrl(result, 8); 3038 movdl(vec, result); // move 32 bits 3039 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3040 load_unsigned_short(result, Address(str2, 0)); 3041 movdl(vec, result); // move 32 bits 3042 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3043 movdl(vec, Address(str2, 0)); // move 32 bits 3044 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3045 movq(vec, Address(str2, 0)); // move 64 bits 3046 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3047 // Array header size is 12 bytes in 32-bit VM 3048 // + 6 bytes for 3 chars == 18 bytes, 3049 // enough space to load vec and shift. 3050 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3051 if (ae == StrIntrinsicNode::UL) { 3052 int tail_off = int_cnt2-8; 3053 pmovzxbw(vec, Address(str2, tail_off)); 3054 psrldq(vec, -2*tail_off); 3055 } 3056 else { 3057 int tail_off = int_cnt2*(1<<scale2); 3058 movdqu(vec, Address(str2, tail_off-16)); 3059 psrldq(vec, 16-tail_off); 3060 } 3061 } 3062 } else { // not constant substring 3063 cmpl(cnt2, stride); 3064 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3065 3066 // We can read beyond string if srt+16 does not cross page boundary 3067 // since heaps are aligned and mapped by pages. 3068 assert(os::vm_page_size() < (int)G, "default page should be small"); 3069 movl(result, str2); // We need only low 32 bits 3070 andl(result, ((int)os::vm_page_size()-1)); 3071 cmpl(result, ((int)os::vm_page_size()-16)); 3072 jccb(Assembler::belowEqual, CHECK_STR); 3073 3074 // Move small strings to stack to allow load 16 bytes into vec. 3075 subptr(rsp, 16); 3076 int stk_offset = wordSize-(1<<scale2); 3077 push(cnt2); 3078 3079 bind(COPY_SUBSTR); 3080 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3081 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3082 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3083 } else if (ae == StrIntrinsicNode::UU) { 3084 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3085 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3086 } 3087 decrement(cnt2); 3088 jccb(Assembler::notZero, COPY_SUBSTR); 3089 3090 pop(cnt2); 3091 movptr(str2, rsp); // New substring address 3092 } // non constant 3093 3094 bind(CHECK_STR); 3095 cmpl(cnt1, stride); 3096 jccb(Assembler::aboveEqual, BIG_STRINGS); 3097 3098 // Check cross page boundary. 
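// The check below relies on a simple observation (4K pages used only as an
// example): a 16-byte load starting at str1 stays within str1's page iff
// (str1 & (page_size-1)) <= page_size - 16. E.g. with 4096-byte pages an
// address at page offset 0xFF8 fails the test (0xFF8 > 0xFF0), so the short
// string is first copied to the stack, where the full 16-byte load is safe.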
3099 movl(result, str1); // We need only low 32 bits 3100 andl(result, ((int)os::vm_page_size()-1)); 3101 cmpl(result, ((int)os::vm_page_size()-16)); 3102 jccb(Assembler::belowEqual, BIG_STRINGS); 3103 3104 subptr(rsp, 16); 3105 int stk_offset = -(1<<scale1); 3106 if (int_cnt2 < 0) { // not constant 3107 push(cnt2); 3108 stk_offset += wordSize; 3109 } 3110 movl(cnt2, cnt1); 3111 3112 bind(COPY_STR); 3113 if (ae == StrIntrinsicNode::LL) { 3114 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3115 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3116 } else { 3117 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3118 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3119 } 3120 decrement(cnt2); 3121 jccb(Assembler::notZero, COPY_STR); 3122 3123 if (int_cnt2 < 0) { // not constant 3124 pop(cnt2); 3125 } 3126 movptr(str1, rsp); // New string address 3127 3128 bind(BIG_STRINGS); 3129 // Load substring. 3130 if (int_cnt2 < 0) { // -1 3131 if (ae == StrIntrinsicNode::UL) { 3132 pmovzxbw(vec, Address(str2, 0)); 3133 } else { 3134 movdqu(vec, Address(str2, 0)); 3135 } 3136 push(cnt2); // substr count 3137 push(str2); // substr addr 3138 push(str1); // string addr 3139 } else { 3140 // Small (< 8 chars) constant substrings are loaded already. 3141 movl(cnt2, int_cnt2); 3142 } 3143 push(tmp); // original SP 3144 3145 } // Finished loading 3146 3147 //======================================================== 3148 // Start search 3149 // 3150 3151 movptr(result, str1); // string addr 3152 3153 if (int_cnt2 < 0) { // Only for non constant substring 3154 jmpb(SCAN_TO_SUBSTR); 3155 3156 // SP saved at sp+0 3157 // String saved at sp+1*wordSize 3158 // Substr saved at sp+2*wordSize 3159 // Substr count saved at sp+3*wordSize 3160 3161 // Reload substr for rescan, this code 3162 // is executed only for large substrings (> 8 chars) 3163 bind(RELOAD_SUBSTR); 3164 movptr(str2, Address(rsp, 2*wordSize)); 3165 movl(cnt2, Address(rsp, 3*wordSize)); 3166 if (ae == StrIntrinsicNode::UL) { 3167 pmovzxbw(vec, Address(str2, 0)); 3168 } else { 3169 movdqu(vec, Address(str2, 0)); 3170 } 3171 // We came here after the beginning of the substring was 3172 // matched but the rest of it was not so we need to search 3173 // again. Start from the next element after the previous match. 3174 subptr(str1, result); // Restore counter 3175 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3176 shrl(str1, 1); 3177 } 3178 addl(cnt1, str1); 3179 decrementl(cnt1); // Shift to next element 3180 cmpl(cnt1, cnt2); 3181 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3182 3183 addptr(result, (1<<scale1)); 3184 } // non constant 3185 3186 // Scan string for start of substr in 16-byte vectors 3187 bind(SCAN_TO_SUBSTR); 3188 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3189 pcmpestri(vec, Address(result, 0), mode); 3190 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3191 subl(cnt1, stride); 3192 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3193 cmpl(cnt1, cnt2); 3194 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3195 addptr(result, 16); 3196 3197 bind(ADJUST_STR); 3198 cmpl(cnt1, stride); // Do not read beyond string 3199 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3200 // Back-up string to avoid reading beyond string. 
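// The lea below moves the scan window back so that the next 16-byte load ends
// exactly at the last element of the string: result += cnt1*elem_size - 16,
// then cnt1 is reset to a full stride. For example (UU, 2-byte chars), with
// cnt1 == 5 chars remaining the window moves back by 6 bytes and the final
// load covers the last 8 chars of the string.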
3201 lea(result, Address(result, cnt1, scale1, -16)); 3202 movl(cnt1, stride); 3203 jmpb(SCAN_TO_SUBSTR); 3204 3205 // Found a potential substr 3206 bind(FOUND_CANDIDATE); 3207 // After pcmpestri tmp(rcx) contains matched element index 3208 3209 // Make sure string is still long enough 3210 subl(cnt1, tmp); 3211 cmpl(cnt1, cnt2); 3212 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3213 // Left less then substring. 3214 3215 bind(RET_NOT_FOUND); 3216 movl(result, -1); 3217 jmp(CLEANUP); 3218 3219 bind(FOUND_SUBSTR); 3220 // Compute start addr of substr 3221 lea(result, Address(result, tmp, scale1)); 3222 if (int_cnt2 > 0) { // Constant substring 3223 // Repeat search for small substring (< 8 chars) 3224 // from new point without reloading substring. 3225 // Have to check that we don't read beyond string. 3226 cmpl(tmp, stride-int_cnt2); 3227 jccb(Assembler::greater, ADJUST_STR); 3228 // Fall through if matched whole substring. 3229 } else { // non constant 3230 assert(int_cnt2 == -1, "should be != 0"); 3231 3232 addl(tmp, cnt2); 3233 // Found result if we matched whole substring. 3234 cmpl(tmp, stride); 3235 jcc(Assembler::lessEqual, RET_FOUND); 3236 3237 // Repeat search for small substring (<= 8 chars) 3238 // from new point 'str1' without reloading substring. 3239 cmpl(cnt2, stride); 3240 // Have to check that we don't read beyond string. 3241 jccb(Assembler::lessEqual, ADJUST_STR); 3242 3243 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3244 // Compare the rest of substring (> 8 chars). 3245 movptr(str1, result); 3246 3247 cmpl(tmp, cnt2); 3248 // First 8 chars are already matched. 3249 jccb(Assembler::equal, CHECK_NEXT); 3250 3251 bind(SCAN_SUBSTR); 3252 pcmpestri(vec, Address(str1, 0), mode); 3253 // Need to reload strings pointers if not matched whole vector 3254 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3255 3256 bind(CHECK_NEXT); 3257 subl(cnt2, stride); 3258 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3259 addptr(str1, 16); 3260 if (ae == StrIntrinsicNode::UL) { 3261 addptr(str2, 8); 3262 } else { 3263 addptr(str2, 16); 3264 } 3265 subl(cnt1, stride); 3266 cmpl(cnt2, stride); // Do not read beyond substring 3267 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3268 // Back-up strings to avoid reading beyond substring. 
3269 3270 if (ae == StrIntrinsicNode::UL) { 3271 lea(str2, Address(str2, cnt2, scale2, -8)); 3272 lea(str1, Address(str1, cnt2, scale1, -16)); 3273 } else { 3274 lea(str2, Address(str2, cnt2, scale2, -16)); 3275 lea(str1, Address(str1, cnt2, scale1, -16)); 3276 } 3277 subl(cnt1, cnt2); 3278 movl(cnt2, stride); 3279 addl(cnt1, stride); 3280 bind(CONT_SCAN_SUBSTR); 3281 if (ae == StrIntrinsicNode::UL) { 3282 pmovzxbw(vec, Address(str2, 0)); 3283 } else { 3284 movdqu(vec, Address(str2, 0)); 3285 } 3286 jmp(SCAN_SUBSTR); 3287 3288 bind(RET_FOUND_LONG); 3289 movptr(str1, Address(rsp, wordSize)); 3290 } // non constant 3291 3292 bind(RET_FOUND); 3293 // Compute substr offset 3294 subptr(result, str1); 3295 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3296 shrl(result, 1); // index 3297 } 3298 bind(CLEANUP); 3299 pop(rsp); // restore SP 3300 3301 } // string_indexof 3302 3303 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3304 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3305 ShortBranchVerifier sbv(this); 3306 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3307 3308 int stride = 8; 3309 3310 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3311 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3312 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3313 FOUND_SEQ_CHAR, DONE_LABEL; 3314 3315 movptr(result, str1); 3316 if (UseAVX >= 2) { 3317 cmpl(cnt1, stride); 3318 jcc(Assembler::less, SCAN_TO_CHAR); 3319 cmpl(cnt1, 2*stride); 3320 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3321 movdl(vec1, ch); 3322 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3323 vpxor(vec2, vec2); 3324 movl(tmp, cnt1); 3325 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3326 andl(cnt1,0x0000000F); //tail count (in chars) 3327 3328 bind(SCAN_TO_16_CHAR_LOOP); 3329 vmovdqu(vec3, Address(result, 0)); 3330 vpcmpeqw(vec3, vec3, vec1, 1); 3331 vptest(vec2, vec3); 3332 jcc(Assembler::carryClear, FOUND_CHAR); 3333 addptr(result, 32); 3334 subl(tmp, 2*stride); 3335 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3336 jmp(SCAN_TO_8_CHAR); 3337 bind(SCAN_TO_8_CHAR_INIT); 3338 movdl(vec1, ch); 3339 pshuflw(vec1, vec1, 0x00); 3340 pshufd(vec1, vec1, 0); 3341 pxor(vec2, vec2); 3342 } 3343 bind(SCAN_TO_8_CHAR); 3344 cmpl(cnt1, stride); 3345 jcc(Assembler::less, SCAN_TO_CHAR); 3346 if (UseAVX < 2) { 3347 movdl(vec1, ch); 3348 pshuflw(vec1, vec1, 0x00); 3349 pshufd(vec1, vec1, 0); 3350 pxor(vec2, vec2); 3351 } 3352 movl(tmp, cnt1); 3353 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3354 andl(cnt1,0x00000007); //tail count (in chars) 3355 3356 bind(SCAN_TO_8_CHAR_LOOP); 3357 movdqu(vec3, Address(result, 0)); 3358 pcmpeqw(vec3, vec1); 3359 ptest(vec2, vec3); 3360 jcc(Assembler::carryClear, FOUND_CHAR); 3361 addptr(result, 16); 3362 subl(tmp, stride); 3363 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3364 bind(SCAN_TO_CHAR); 3365 testl(cnt1, cnt1); 3366 jcc(Assembler::zero, RET_NOT_FOUND); 3367 bind(SCAN_TO_CHAR_LOOP); 3368 load_unsigned_short(tmp, Address(result, 0)); 3369 cmpl(ch, tmp); 3370 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3371 addptr(result, 2); 3372 subl(cnt1, 1); 3373 jccb(Assembler::zero, RET_NOT_FOUND); 3374 jmp(SCAN_TO_CHAR_LOOP); 3375 3376 bind(RET_NOT_FOUND); 3377 movl(result, -1); 3378 jmpb(DONE_LABEL); 3379 3380 bind(FOUND_CHAR); 3381 if (UseAVX >= 2) { 3382 vpmovmskb(tmp, vec3); 3383 } else { 3384 pmovmskb(tmp, vec3); 3385 } 3386 bsfl(ch, tmp); 3387 addptr(result, ch); 3388 3389 bind(FOUND_SEQ_CHAR); 3390 
subptr(result, str1); 3391 shrl(result, 1); 3392 3393 bind(DONE_LABEL); 3394 } // string_indexof_char 3395 3396 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3397 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3398 ShortBranchVerifier sbv(this); 3399 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3400 3401 int stride = 16; 3402 3403 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3404 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3405 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3406 FOUND_SEQ_CHAR, DONE_LABEL; 3407 3408 movptr(result, str1); 3409 if (UseAVX >= 2) { 3410 cmpl(cnt1, stride); 3411 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3412 cmpl(cnt1, stride*2); 3413 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3414 movdl(vec1, ch); 3415 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3416 vpxor(vec2, vec2); 3417 movl(tmp, cnt1); 3418 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3419 andl(cnt1,0x0000001F); //tail count (in chars) 3420 3421 bind(SCAN_TO_32_CHAR_LOOP); 3422 vmovdqu(vec3, Address(result, 0)); 3423 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3424 vptest(vec2, vec3); 3425 jcc(Assembler::carryClear, FOUND_CHAR); 3426 addptr(result, 32); 3427 subl(tmp, stride*2); 3428 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3429 jmp(SCAN_TO_16_CHAR); 3430 3431 bind(SCAN_TO_16_CHAR_INIT); 3432 movdl(vec1, ch); 3433 pxor(vec2, vec2); 3434 pshufb(vec1, vec2); 3435 } 3436 3437 bind(SCAN_TO_16_CHAR); 3438 cmpl(cnt1, stride); 3439 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3440 if (UseAVX < 2) { 3441 movdl(vec1, ch); 3442 pxor(vec2, vec2); 3443 pshufb(vec1, vec2); 3444 } 3445 movl(tmp, cnt1); 3446 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3447 andl(cnt1,0x0000000F); //tail count (in bytes) 3448 3449 bind(SCAN_TO_16_CHAR_LOOP); 3450 movdqu(vec3, Address(result, 0)); 3451 pcmpeqb(vec3, vec1); 3452 ptest(vec2, vec3); 3453 jcc(Assembler::carryClear, FOUND_CHAR); 3454 addptr(result, 16); 3455 subl(tmp, stride); 3456 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
3457 3458 bind(SCAN_TO_CHAR_INIT); 3459 testl(cnt1, cnt1); 3460 jcc(Assembler::zero, RET_NOT_FOUND); 3461 bind(SCAN_TO_CHAR_LOOP); 3462 load_unsigned_byte(tmp, Address(result, 0)); 3463 cmpl(ch, tmp); 3464 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3465 addptr(result, 1); 3466 subl(cnt1, 1); 3467 jccb(Assembler::zero, RET_NOT_FOUND); 3468 jmp(SCAN_TO_CHAR_LOOP); 3469 3470 bind(RET_NOT_FOUND); 3471 movl(result, -1); 3472 jmpb(DONE_LABEL); 3473 3474 bind(FOUND_CHAR); 3475 if (UseAVX >= 2) { 3476 vpmovmskb(tmp, vec3); 3477 } else { 3478 pmovmskb(tmp, vec3); 3479 } 3480 bsfl(ch, tmp); 3481 addptr(result, ch); 3482 3483 bind(FOUND_SEQ_CHAR); 3484 subptr(result, str1); 3485 3486 bind(DONE_LABEL); 3487 } // stringL_indexof_char 3488 3489 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3490 switch (eltype) { 3491 case T_BOOLEAN: return sizeof(jboolean); 3492 case T_BYTE: return sizeof(jbyte); 3493 case T_SHORT: return sizeof(jshort); 3494 case T_CHAR: return sizeof(jchar); 3495 case T_INT: return sizeof(jint); 3496 default: 3497 ShouldNotReachHere(); 3498 return -1; 3499 } 3500 } 3501 3502 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3503 switch (eltype) { 3504 // T_BOOLEAN used as surrogate for unsigned byte 3505 case T_BOOLEAN: movzbl(dst, src); break; 3506 case T_BYTE: movsbl(dst, src); break; 3507 case T_SHORT: movswl(dst, src); break; 3508 case T_CHAR: movzwl(dst, src); break; 3509 case T_INT: movl(dst, src); break; 3510 default: 3511 ShouldNotReachHere(); 3512 } 3513 } 3514 3515 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3516 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3517 } 3518 3519 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3520 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3521 } 3522 3523 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3524 const int vlen = Assembler::AVX_256bit; 3525 switch (eltype) { 3526 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3527 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3528 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3529 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3530 case T_INT: 3531 // do nothing 3532 break; 3533 default: 3534 ShouldNotReachHere(); 3535 } 3536 } 3537 3538 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3539 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3540 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3541 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3542 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3543 BasicType eltype) { 3544 ShortBranchVerifier sbv(this); 3545 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3546 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3547 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3548 3549 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3550 SHORT_UNROLLED_LOOP_EXIT, 3551 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3552 UNROLLED_VECTOR_LOOP_BEGIN, 3553 END; 3554 switch (eltype) { 3555 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3556 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3557 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3558 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3559 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3560 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3561 } 3562 3563 // For "renaming" for readibility of the code 3564 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3565 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3566 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3567 3568 const int elsize = arrays_hashcode_elsize(eltype); 3569 3570 /* 3571 if (cnt1 >= 2) { 3572 if (cnt1 >= 32) { 3573 UNROLLED VECTOR LOOP 3574 } 3575 UNROLLED SCALAR LOOP 3576 } 3577 SINGLE SCALAR 3578 */ 3579 3580 cmpl(cnt1, 32); 3581 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3582 3583 // cnt1 >= 32 && generate_vectorized_loop 3584 xorl(index, index); 3585 3586 // vresult = IntVector.zero(I256); 3587 for (int idx = 0; idx < 4; idx++) { 3588 vpxor(vresult[idx], vresult[idx]); 3589 } 3590 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3591 Register bound = tmp2; 3592 Register next = tmp3; 3593 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3594 movl(next, Address(tmp2, 0)); 3595 movdl(vnext, next); 3596 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3597 3598 // index = 0; 3599 // bound = cnt1 & ~(32 - 1); 3600 movl(bound, cnt1); 3601 andl(bound, ~(32 - 1)); 3602 // for (; index < bound; index += 32) { 3603 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3604 // result *= next; 3605 imull(result, next); 3606 // loop fission to upfront the cost of fetching from memory, OOO execution 3607 // can then hopefully do a better job of prefetching 3608 for (int idx = 0; idx < 4; idx++) { 3609 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3610 } 3611 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3612 for (int idx = 0; idx < 4; idx++) { 3613 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3614 arrays_hashcode_elvcast(vtmp[idx], eltype); 3615 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3616 } 3617 // index += 32; 3618 addl(index, 32); 3619 // index < bound; 3620 cmpl(index, bound); 3621 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3622 // } 3623 3624 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3625 subl(cnt1, bound); 3626 // release bound 3627 3628 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3629 for (int idx = 0; idx < 4; idx++) { 3630 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3631 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3632 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3633 } 3634 // result += vresult.reduceLanes(ADD); 3635 for (int idx = 0; idx < 4; idx++) { 3636 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3637 } 3638 3639 // } else if (cnt1 < 32) { 3640 3641 bind(SHORT_UNROLLED_BEGIN); 3642 // int i = 1; 3643 movl(index, 1); 3644 cmpl(index, cnt1); 3645 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3646 3647 // for (; i < cnt1 ; i += 2) { 3648 bind(SHORT_UNROLLED_LOOP_BEGIN); 3649 movl(tmp3, 961); 3650 imull(result, tmp3); 3651 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3652 movl(tmp3, tmp2); 3653 shll(tmp3, 5); 3654 subl(tmp3, tmp2); 3655 addl(result, tmp3); 3656 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3657 addl(result, tmp3); 3658 addl(index, 2); 3659 cmpl(index, cnt1); 3660 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3661 3662 // } 3663 // if (i >= cnt1) { 3664 bind(SHORT_UNROLLED_LOOP_EXIT); 3665 jccb(Assembler::greater, END); 3666 movl(tmp2, result); 3667 shll(result, 5); 3668 subl(result, tmp2); 3669 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3670 addl(result, tmp3); 3671 // } 3672 bind(END); 3673 3674 BLOCK_COMMENT("} // arrays_hashcode"); 3675 3676 } // arrays_hashcode 3677 3678 // helper function for string_compare 3679 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3680 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3681 Address::ScaleFactor scale2, Register index, int ae) { 3682 if (ae == StrIntrinsicNode::LL) { 3683 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3684 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3685 } else if (ae == StrIntrinsicNode::UU) { 3686 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3687 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3688 } else { 3689 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3690 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3691 } 3692 } 3693 3694 // Compare strings, used for char[] and byte[]. 3695 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3696 Register cnt1, Register cnt2, Register result, 3697 XMMRegister vec1, int ae, KRegister mask) { 3698 ShortBranchVerifier sbv(this); 3699 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3700 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3701 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3702 int stride2x2 = 0x40; 3703 Address::ScaleFactor scale = Address::no_scale; 3704 Address::ScaleFactor scale1 = Address::no_scale; 3705 Address::ScaleFactor scale2 = Address::no_scale; 3706 3707 if (ae != StrIntrinsicNode::LL) { 3708 stride2x2 = 0x20; 3709 } 3710 3711 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3712 shrl(cnt2, 1); 3713 } 3714 // Compute the minimum of the string lengths and the 3715 // difference of the string lengths (stack). 3716 // Do the conditional move stuff 3717 movl(result, cnt1); 3718 subl(cnt1, cnt2); 3719 push(cnt1); 3720 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3721 3722 // Is the minimum length zero? 
3723 testl(cnt2, cnt2); 3724 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3725 if (ae == StrIntrinsicNode::LL) { 3726 // Load first bytes 3727 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3728 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3729 } else if (ae == StrIntrinsicNode::UU) { 3730 // Load first characters 3731 load_unsigned_short(result, Address(str1, 0)); 3732 load_unsigned_short(cnt1, Address(str2, 0)); 3733 } else { 3734 load_unsigned_byte(result, Address(str1, 0)); 3735 load_unsigned_short(cnt1, Address(str2, 0)); 3736 } 3737 subl(result, cnt1); 3738 jcc(Assembler::notZero, POP_LABEL); 3739 3740 if (ae == StrIntrinsicNode::UU) { 3741 // Divide length by 2 to get number of chars 3742 shrl(cnt2, 1); 3743 } 3744 cmpl(cnt2, 1); 3745 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3746 3747 // Check if the strings start at the same location and setup scale and stride 3748 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3749 cmpptr(str1, str2); 3750 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3751 if (ae == StrIntrinsicNode::LL) { 3752 scale = Address::times_1; 3753 stride = 16; 3754 } else { 3755 scale = Address::times_2; 3756 stride = 8; 3757 } 3758 } else { 3759 scale1 = Address::times_1; 3760 scale2 = Address::times_2; 3761 // scale not used 3762 stride = 8; 3763 } 3764 3765 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3766 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3767 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3768 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3769 Label COMPARE_TAIL_LONG; 3770 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3771 3772 int pcmpmask = 0x19; 3773 if (ae == StrIntrinsicNode::LL) { 3774 pcmpmask &= ~0x01; 3775 } 3776 3777 // Setup to compare 16-chars (32-bytes) vectors, 3778 // start from first character again because it has aligned address. 3779 if (ae == StrIntrinsicNode::LL) { 3780 stride2 = 32; 3781 } else { 3782 stride2 = 16; 3783 } 3784 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3785 adr_stride = stride << scale; 3786 } else { 3787 adr_stride1 = 8; //stride << scale1; 3788 adr_stride2 = 16; //stride << scale2; 3789 } 3790 3791 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3792 // rax and rdx are used by pcmpestri as elements counters 3793 movl(result, cnt2); 3794 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3795 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3796 3797 // fast path : compare first 2 8-char vectors. 
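// pcmpestri imm8 used below, broken down bit by bit (standard SSE4.2 encoding):
//   0x19 == 0b0011001: bits 1:0 = 01 unsigned words (cleared above to 00,
//   unsigned bytes, for LL), bits 3:2 = 10 "equal each" (element-wise compare),
//   bits 5:4 = 01 negative polarity (result bits set where elements differ),
//   bit 6 = 0 so rcx receives the least significant mismatching index.
// CF is set when any valid element mismatches, hence the jcc(below, ...) checks.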
3798 bind(COMPARE_16_CHARS); 3799 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3800 movdqu(vec1, Address(str1, 0)); 3801 } else { 3802 pmovzxbw(vec1, Address(str1, 0)); 3803 } 3804 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3805 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3806 3807 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3808 movdqu(vec1, Address(str1, adr_stride)); 3809 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3810 } else { 3811 pmovzxbw(vec1, Address(str1, adr_stride1)); 3812 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3813 } 3814 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3815 addl(cnt1, stride); 3816 3817 // Compare the characters at index in cnt1 3818 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3819 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3820 subl(result, cnt2); 3821 jmp(POP_LABEL); 3822 3823 // Setup the registers to start vector comparison loop 3824 bind(COMPARE_WIDE_VECTORS); 3825 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3826 lea(str1, Address(str1, result, scale)); 3827 lea(str2, Address(str2, result, scale)); 3828 } else { 3829 lea(str1, Address(str1, result, scale1)); 3830 lea(str2, Address(str2, result, scale2)); 3831 } 3832 subl(result, stride2); 3833 subl(cnt2, stride2); 3834 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3835 negptr(result); 3836 3837 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3838 bind(COMPARE_WIDE_VECTORS_LOOP); 3839 3840 #ifdef _LP64 3841 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3842 cmpl(cnt2, stride2x2); 3843 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3844 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3845 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3846 3847 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3848 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3849 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3850 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3851 } else { 3852 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3853 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3854 } 3855 kortestql(mask, mask); 3856 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3857 addptr(result, stride2x2); // update since we already compared at this addr 3858 subl(cnt2, stride2x2); // and sub the size too 3859 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3860 3861 vpxor(vec1, vec1); 3862 jmpb(COMPARE_WIDE_TAIL); 3863 }//if (VM_Version::supports_avx512vlbw()) 3864 #endif // _LP64 3865 3866 3867 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3868 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3869 vmovdqu(vec1, Address(str1, result, scale)); 3870 vpxor(vec1, Address(str2, result, scale)); 3871 } else { 3872 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3873 vpxor(vec1, Address(str2, result, scale2)); 3874 } 3875 vptest(vec1, vec1); 3876 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3877 addptr(result, stride2); 3878 subl(cnt2, stride2); 3879 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3880 // clean upper bits of YMM registers 
3881 vpxor(vec1, vec1); 3882 3883 // compare wide vectors tail 3884 bind(COMPARE_WIDE_TAIL); 3885 testptr(result, result); 3886 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3887 3888 movl(result, stride2); 3889 movl(cnt2, result); 3890 negptr(result); 3891 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3892 3893 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3894 bind(VECTOR_NOT_EQUAL); 3895 // clean upper bits of YMM registers 3896 vpxor(vec1, vec1); 3897 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3898 lea(str1, Address(str1, result, scale)); 3899 lea(str2, Address(str2, result, scale)); 3900 } else { 3901 lea(str1, Address(str1, result, scale1)); 3902 lea(str2, Address(str2, result, scale2)); 3903 } 3904 jmp(COMPARE_16_CHARS); 3905 3906 // Compare tail chars, length between 1 and 15 chars 3907 bind(COMPARE_TAIL_LONG); 3908 movl(cnt2, result); 3909 cmpl(cnt2, stride); 3910 jcc(Assembler::less, COMPARE_SMALL_STR); 3911 3912 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3913 movdqu(vec1, Address(str1, 0)); 3914 } else { 3915 pmovzxbw(vec1, Address(str1, 0)); 3916 } 3917 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3918 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3919 subptr(cnt2, stride); 3920 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3921 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3922 lea(str1, Address(str1, result, scale)); 3923 lea(str2, Address(str2, result, scale)); 3924 } else { 3925 lea(str1, Address(str1, result, scale1)); 3926 lea(str2, Address(str2, result, scale2)); 3927 } 3928 negptr(cnt2); 3929 jmpb(WHILE_HEAD_LABEL); 3930 3931 bind(COMPARE_SMALL_STR); 3932 } else if (UseSSE42Intrinsics) { 3933 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3934 int pcmpmask = 0x19; 3935 // Setup to compare 8-char (16-byte) vectors, 3936 // start from first character again because it has aligned address.
3937 movl(result, cnt2); 3938 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3939 if (ae == StrIntrinsicNode::LL) { 3940 pcmpmask &= ~0x01; 3941 } 3942 jcc(Assembler::zero, COMPARE_TAIL); 3943 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3944 lea(str1, Address(str1, result, scale)); 3945 lea(str2, Address(str2, result, scale)); 3946 } else { 3947 lea(str1, Address(str1, result, scale1)); 3948 lea(str2, Address(str2, result, scale2)); 3949 } 3950 negptr(result); 3951 3952 // pcmpestri 3953 // inputs: 3954 // vec1- substring 3955 // rax - negative string length (elements count) 3956 // mem - scanned string 3957 // rdx - string length (elements count) 3958 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3959 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3960 // outputs: 3961 // rcx - first mismatched element index 3962 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3963 3964 bind(COMPARE_WIDE_VECTORS); 3965 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3966 movdqu(vec1, Address(str1, result, scale)); 3967 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3968 } else { 3969 pmovzxbw(vec1, Address(str1, result, scale1)); 3970 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3971 } 3972 // After pcmpestri cnt1(rcx) contains mismatched element index 3973 3974 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3975 addptr(result, stride); 3976 subptr(cnt2, stride); 3977 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3978 3979 // compare wide vectors tail 3980 testptr(result, result); 3981 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3982 3983 movl(cnt2, stride); 3984 movl(result, stride); 3985 negptr(result); 3986 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3987 movdqu(vec1, Address(str1, result, scale)); 3988 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3989 } else { 3990 pmovzxbw(vec1, Address(str1, result, scale1)); 3991 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3992 } 3993 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3994 3995 // Mismatched characters in the vectors 3996 bind(VECTOR_NOT_EQUAL); 3997 addptr(cnt1, result); 3998 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3999 subl(result, cnt2); 4000 jmpb(POP_LABEL); 4001 4002 bind(COMPARE_TAIL); // limit is zero 4003 movl(cnt2, result); 4004 // Fallthru to tail compare 4005 } 4006 // Shift str2 and str1 to the end of the arrays, negate min 4007 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4008 lea(str1, Address(str1, cnt2, scale)); 4009 lea(str2, Address(str2, cnt2, scale)); 4010 } else { 4011 lea(str1, Address(str1, cnt2, scale1)); 4012 lea(str2, Address(str2, cnt2, scale2)); 4013 } 4014 decrementl(cnt2); // first character was compared already 4015 negptr(cnt2); 4016 4017 // Compare the rest of the elements 4018 bind(WHILE_HEAD_LABEL); 4019 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4020 subl(result, cnt1); 4021 jccb(Assembler::notZero, POP_LABEL); 4022 increment(cnt2); 4023 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4024 4025 // Strings are equal up to min length. Return the length difference. 
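// The value popped below is the (cnt1 - cnt2) length difference pushed at the
// start of string_compare. For UU it is still a byte difference, so it is
// shifted right arithmetically to yield a char difference (e.g. "abc" vs.
// "abcd" gives (6 - 8) >> 1 == -1).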
4026 bind(LENGTH_DIFF_LABEL); 4027 pop(result); 4028 if (ae == StrIntrinsicNode::UU) { 4029 // Divide diff by 2 to get number of chars 4030 sarl(result, 1); 4031 } 4032 jmpb(DONE_LABEL); 4033 4034 #ifdef _LP64 4035 if (VM_Version::supports_avx512vlbw()) { 4036 4037 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4038 4039 kmovql(cnt1, mask); 4040 notq(cnt1); 4041 bsfq(cnt2, cnt1); 4042 if (ae != StrIntrinsicNode::LL) { 4043 // Divide diff by 2 to get number of chars 4044 sarl(cnt2, 1); 4045 } 4046 addq(result, cnt2); 4047 if (ae == StrIntrinsicNode::LL) { 4048 load_unsigned_byte(cnt1, Address(str2, result)); 4049 load_unsigned_byte(result, Address(str1, result)); 4050 } else if (ae == StrIntrinsicNode::UU) { 4051 load_unsigned_short(cnt1, Address(str2, result, scale)); 4052 load_unsigned_short(result, Address(str1, result, scale)); 4053 } else { 4054 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4055 load_unsigned_byte(result, Address(str1, result, scale1)); 4056 } 4057 subl(result, cnt1); 4058 jmpb(POP_LABEL); 4059 }//if (VM_Version::supports_avx512vlbw()) 4060 #endif // _LP64 4061 4062 // Discard the stored length difference 4063 bind(POP_LABEL); 4064 pop(cnt1); 4065 4066 // That's it 4067 bind(DONE_LABEL); 4068 if(ae == StrIntrinsicNode::UL) { 4069 negl(result); 4070 } 4071 4072 } 4073 4074 // Search for Non-ASCII character (Negative byte value) in a byte array, 4075 // return the index of the first such character, otherwise the length 4076 // of the array segment searched. 4077 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4078 // @IntrinsicCandidate 4079 // public static int countPositives(byte[] ba, int off, int len) { 4080 // for (int i = off; i < off + len; i++) { 4081 // if (ba[i] < 0) { 4082 // return i - off; 4083 // } 4084 // } 4085 // return len; 4086 // } 4087 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4088 Register result, Register tmp1, 4089 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4090 // rsi: byte array 4091 // rcx: len 4092 // rax: result 4093 ShortBranchVerifier sbv(this); 4094 assert_different_registers(ary1, len, result, tmp1); 4095 assert_different_registers(vec1, vec2); 4096 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4097 4098 movl(result, len); // copy 4099 // len == 0 4100 testl(len, len); 4101 jcc(Assembler::zero, DONE); 4102 4103 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4104 VM_Version::supports_avx512vlbw() && 4105 VM_Version::supports_bmi2()) { 4106 4107 Label test_64_loop, test_tail, BREAK_LOOP; 4108 movl(tmp1, len); 4109 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4110 4111 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4112 andl(len, 0xffffffc0); // vector count (in chars) 4113 jccb(Assembler::zero, test_tail); 4114 4115 lea(ary1, Address(ary1, len, Address::times_1)); 4116 negptr(len); 4117 4118 bind(test_64_loop); 4119 // Check whether our 64 elements of size byte contain negatives 4120 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4121 kortestql(mask1, mask1); 4122 jcc(Assembler::notZero, BREAK_LOOP); 4123 4124 addptr(len, 64); 4125 jccb(Assembler::notZero, test_64_loop); 4126 4127 bind(test_tail); 4128 // bail out when there is nothing to be done 4129 testl(tmp1, -1); 4130 jcc(Assembler::zero, DONE); 4131 4132 4133 // check the tail for absense of negatives 4134 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4135 #ifdef _LP64 4136 { 4137 
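// A mask with exactly tmp1 low bits set is built as ~(~0 << tmp1); e.g.
// tmp1 == 5 gives ~(0x...FFE0) == 0x1F. kmovql then moves it into mask2 so
// that only the first tmp1 tail bytes participate in the masked compare
// below (tmp1 is in [1, 63] here, the zero case bailed out above).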
Register tmp3_aliased = len; 4138 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4139 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4140 notq(tmp3_aliased); 4141 kmovql(mask2, tmp3_aliased); 4142 } 4143 #else 4144 Label k_init; 4145 jmp(k_init); 4146 4147 // On a 32-bit VM we cannot read 64 bits from a general purpose register, so the 4148 // data required to compose the 64 1's is moved into the instruction stream instead. 4149 // We emit a 64-byte-wide series of the elements 0..63, which is later used as a 4150 // compare target against the tail count held in the tmp1 register. 4151 // The result is a k register with tmp1 consecutive bits set to 1, 4152 // counting from the least significant bit. 4153 address tmp = pc(); 4154 emit_int64(0x0706050403020100); 4155 emit_int64(0x0F0E0D0C0B0A0908); 4156 emit_int64(0x1716151413121110); 4157 emit_int64(0x1F1E1D1C1B1A1918); 4158 emit_int64(0x2726252423222120); 4159 emit_int64(0x2F2E2D2C2B2A2928); 4160 emit_int64(0x3736353433323130); 4161 emit_int64(0x3F3E3D3C3B3A3938); 4162 4163 bind(k_init); 4164 lea(len, InternalAddress(tmp)); 4165 // create mask to test for negative byte inside a vector 4166 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4167 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4168 4169 #endif 4170 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4171 ktestq(mask1, mask2); 4172 jcc(Assembler::zero, DONE); 4173 4174 // do a full check for negative bytes in the tail 4175 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len 4176 // ary1 already points to the right place 4177 jmpb(TAIL_START); 4178 4179 bind(BREAK_LOOP); 4180 // At least one byte in the last 64-byte block was negative. 4181 // Set up to look at the last 64 bytes as if they were a tail 4182 lea(ary1, Address(ary1, len, Address::times_1)); 4183 addptr(result, len); 4184 // Ignore the very last byte: if all others are positive, 4185 // it must be negative, so we can skip right to the 2+1 byte 4186 // end comparison at this point 4187 orl(result, 63); 4188 movl(len, 63); 4189 // Fallthru to tail compare 4190 } else { 4191 4192 if (UseAVX >= 2 && UseSSE >= 2) { 4193 // With AVX2, use 32-byte vector compare 4194 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4195 4196 // Compare 32-byte vectors 4197 testl(len, 0xffffffe0); // vector count (in bytes) 4198 jccb(Assembler::zero, TAIL_START); 4199 4200 andl(len, 0xffffffe0); 4201 lea(ary1, Address(ary1, len, Address::times_1)); 4202 negptr(len); 4203 4204 movl(tmp1, 0x80808080); // create mask to test for negative bytes in the vector 4205 movdl(vec2, tmp1); 4206 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4207 4208 bind(COMPARE_WIDE_VECTORS); 4209 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4210 vptest(vec1, vec2); 4211 jccb(Assembler::notZero, BREAK_LOOP); 4212 addptr(len, 32); 4213 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4214 4215 testl(result, 0x0000001f); // any bytes remaining? 4216 jcc(Assembler::zero, DONE); 4217 4218 // Quick test using the already prepared vector mask 4219 movl(len, result); 4220 andl(len, 0x0000001f); 4221 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4222 vptest(vec1, vec2); 4223 jcc(Assembler::zero, DONE); 4224 // There are negative bytes, jump to the tail to determine exactly where 4225 jmpb(TAIL_START); 4226 4227 bind(BREAK_LOOP); 4228 // At least one byte in the last 32-byte vector is negative.
4229 // Set up to look at the last 32 bytes as if they were a tail 4230 lea(ary1, Address(ary1, len, Address::times_1)); 4231 addptr(result, len); 4232 // Ignore the very last byte: if all others are positive, 4233 // it must be negative, so we can skip right to the 2+1 byte 4234 // end comparison at this point 4235 orl(result, 31); 4236 movl(len, 31); 4237 // Fallthru to tail compare 4238 } else if (UseSSE42Intrinsics) { 4239 // With SSE4.2, use double quad vector compare 4240 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4241 4242 // Compare 16-byte vectors 4243 testl(len, 0xfffffff0); // vector count (in bytes) 4244 jcc(Assembler::zero, TAIL_START); 4245 4246 andl(len, 0xfffffff0); 4247 lea(ary1, Address(ary1, len, Address::times_1)); 4248 negptr(len); 4249 4250 movl(tmp1, 0x80808080); 4251 movdl(vec2, tmp1); 4252 pshufd(vec2, vec2, 0); 4253 4254 bind(COMPARE_WIDE_VECTORS); 4255 movdqu(vec1, Address(ary1, len, Address::times_1)); 4256 ptest(vec1, vec2); 4257 jccb(Assembler::notZero, BREAK_LOOP); 4258 addptr(len, 16); 4259 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4260 4261 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4262 jcc(Assembler::zero, DONE); 4263 4264 // Quick test using the already prepared vector mask 4265 movl(len, result); 4266 andl(len, 0x0000000f); // tail count (in bytes) 4267 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4268 ptest(vec1, vec2); 4269 jcc(Assembler::zero, DONE); 4270 jmpb(TAIL_START); 4271 4272 bind(BREAK_LOOP); 4273 // At least one byte in the last 16-byte vector is negative. 4274 // Set up and look at the last 16 bytes as if they were a tail 4275 lea(ary1, Address(ary1, len, Address::times_1)); 4276 addptr(result, len); 4277 // Ignore the very last byte: if all others are positive, 4278 // it must be negative, so we can skip right to the 2+1 byte 4279 // end comparison at this point 4280 orl(result, 15); 4281 movl(len, 15); 4282 // Fallthru to tail compare 4283 } 4284 } 4285 4286 bind(TAIL_START); 4287 // Compare 4-byte vectors 4288 andl(len, 0xfffffffc); // vector count (in bytes) 4289 jccb(Assembler::zero, COMPARE_CHAR); 4290 4291 lea(ary1, Address(ary1, len, Address::times_1)); 4292 negptr(len); 4293 4294 bind(COMPARE_VECTORS); 4295 movl(tmp1, Address(ary1, len, Address::times_1)); 4296 andl(tmp1, 0x80808080); 4297 jccb(Assembler::notZero, TAIL_ADJUST); 4298 addptr(len, 4); 4299 jccb(Assembler::notZero, COMPARE_VECTORS); 4300 4301 // Compare trailing char (final 2-3 bytes), if any 4302 bind(COMPARE_CHAR); 4303 4304 testl(result, 0x2); // tail char 4305 jccb(Assembler::zero, COMPARE_BYTE); 4306 load_unsigned_short(tmp1, Address(ary1, 0)); 4307 andl(tmp1, 0x00008080); 4308 jccb(Assembler::notZero, CHAR_ADJUST); 4309 lea(ary1, Address(ary1, 2)); 4310 4311 bind(COMPARE_BYTE); 4312 testl(result, 0x1); // tail byte 4313 jccb(Assembler::zero, DONE); 4314 load_unsigned_byte(tmp1, Address(ary1, 0)); 4315 testl(tmp1, 0x00000080); 4316 jccb(Assembler::zero, DONE); 4317 subptr(result, 1); 4318 jmpb(DONE); 4319 4320 bind(TAIL_ADJUST); 4321 // there are negative bits in the last 4 byte block. 4322 // Adjust result and check the next three bytes 4323 addptr(result, len); 4324 orl(result, 3); 4325 lea(ary1, Address(ary1, len, Address::times_1)); 4326 jmpb(COMPARE_CHAR); 4327 4328 bind(CHAR_ADJUST); 4329 // We are looking at a char + optional byte tail, and found that one 4330 // of the bytes in the char is negative. Adjust the result, check the 4331 // first byte and readjust if needed. 
4332 andl(result, 0xfffffffc); 4333 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4334 jccb(Assembler::notZero, DONE); 4335 addptr(result, 1); 4336 4337 // That's it 4338 bind(DONE); 4339 if (UseAVX >= 2 && UseSSE >= 2) { 4340 // clean upper bits of YMM registers 4341 vpxor(vec1, vec1); 4342 vpxor(vec2, vec2); 4343 } 4344 } 4345 4346 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4347 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4348 Register limit, Register result, Register chr, 4349 XMMRegister vec1, XMMRegister vec2, bool is_char, 4350 KRegister mask, bool expand_ary2) { 4351 // for expand_ary2, limit is the (smaller) size of the second array. 4352 ShortBranchVerifier sbv(this); 4353 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4354 4355 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4356 "Expansion only implemented for AVX2"); 4357 4358 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4359 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4360 4361 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4362 int scaleIncr = expand_ary2 ? 8 : 16; 4363 4364 if (is_array_equ) { 4365 // Check the input args 4366 cmpoop(ary1, ary2); 4367 jcc(Assembler::equal, TRUE_LABEL); 4368 4369 // Need additional checks for arrays_equals. 4370 testptr(ary1, ary1); 4371 jcc(Assembler::zero, FALSE_LABEL); 4372 testptr(ary2, ary2); 4373 jcc(Assembler::zero, FALSE_LABEL); 4374 4375 // Check the lengths 4376 movl(limit, Address(ary1, length_offset)); 4377 cmpl(limit, Address(ary2, length_offset)); 4378 jcc(Assembler::notEqual, FALSE_LABEL); 4379 } 4380 4381 // count == 0 4382 testl(limit, limit); 4383 jcc(Assembler::zero, TRUE_LABEL); 4384 4385 if (is_array_equ) { 4386 // Load array address 4387 lea(ary1, Address(ary1, base_offset)); 4388 lea(ary2, Address(ary2, base_offset)); 4389 } 4390 4391 if (is_array_equ && is_char) { 4392 // arrays_equals when used for char[]. 
4393 shll(limit, 1); // byte count != 0 4394 } 4395 movl(result, limit); // copy 4396 4397 if (UseAVX >= 2) { 4398 // With AVX2, use 32-byte vector compare 4399 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4400 4401 // Compare 32-byte vectors 4402 if (expand_ary2) { 4403 andl(result, 0x0000000f); // tail count (in bytes) 4404 andl(limit, 0xfffffff0); // vector count (in bytes) 4405 jcc(Assembler::zero, COMPARE_TAIL); 4406 } else { 4407 andl(result, 0x0000001f); // tail count (in bytes) 4408 andl(limit, 0xffffffe0); // vector count (in bytes) 4409 jcc(Assembler::zero, COMPARE_TAIL_16); 4410 } 4411 4412 lea(ary1, Address(ary1, limit, scaleFactor)); 4413 lea(ary2, Address(ary2, limit, Address::times_1)); 4414 negptr(limit); 4415 4416 #ifdef _LP64 4417 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4418 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4419 4420 cmpl(limit, -64); 4421 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4422 4423 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4424 4425 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4426 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4427 kortestql(mask, mask); 4428 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4429 addptr(limit, 64); // update since we already compared at this addr 4430 cmpl(limit, -64); 4431 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4432 4433 // At this point we may still need to compare -limit+result bytes. 4434 // We could execute the next two instruction and just continue via non-wide path: 4435 // cmpl(limit, 0); 4436 // jcc(Assembler::equal, COMPARE_TAIL); // true 4437 // But since we stopped at the points ary{1,2}+limit which are 4438 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4439 // (|limit| <= 32 and result < 32), 4440 // we may just compare the last 64 bytes. 
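// Worked example (sizes chosen for illustration): for 100-byte arrays the
// loop above compares bytes [0, 64) in one iteration and stops with
// limit == -32 and result == 4, i.e. 36 bytes left; the single 64-byte
// compare below then covers bytes [36, 100), re-checking some already
// verified bytes but never reading past the arrays.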
4441 // 4442 addptr(result, -64); // it is safe, bc we just came from this area 4443 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4444 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4445 kortestql(mask, mask); 4446 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4447 4448 jmp(TRUE_LABEL); 4449 4450 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4451 4452 }//if (VM_Version::supports_avx512vlbw()) 4453 #endif //_LP64 4454 bind(COMPARE_WIDE_VECTORS); 4455 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4456 if (expand_ary2) { 4457 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4458 } else { 4459 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4460 } 4461 vpxor(vec1, vec2); 4462 4463 vptest(vec1, vec1); 4464 jcc(Assembler::notZero, FALSE_LABEL); 4465 addptr(limit, scaleIncr * 2); 4466 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4467 4468 testl(result, result); 4469 jcc(Assembler::zero, TRUE_LABEL); 4470 4471 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4472 if (expand_ary2) { 4473 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4474 } else { 4475 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4476 } 4477 vpxor(vec1, vec2); 4478 4479 vptest(vec1, vec1); 4480 jcc(Assembler::notZero, FALSE_LABEL); 4481 jmp(TRUE_LABEL); 4482 4483 bind(COMPARE_TAIL_16); // limit is zero 4484 movl(limit, result); 4485 4486 // Compare 16-byte chunks 4487 andl(result, 0x0000000f); // tail count (in bytes) 4488 andl(limit, 0xfffffff0); // vector count (in bytes) 4489 jcc(Assembler::zero, COMPARE_TAIL); 4490 4491 lea(ary1, Address(ary1, limit, scaleFactor)); 4492 lea(ary2, Address(ary2, limit, Address::times_1)); 4493 negptr(limit); 4494 4495 bind(COMPARE_WIDE_VECTORS_16); 4496 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4497 if (expand_ary2) { 4498 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4499 } else { 4500 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4501 } 4502 pxor(vec1, vec2); 4503 4504 ptest(vec1, vec1); 4505 jcc(Assembler::notZero, FALSE_LABEL); 4506 addptr(limit, scaleIncr); 4507 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4508 4509 bind(COMPARE_TAIL); // limit is zero 4510 movl(limit, result); 4511 // Fallthru to tail compare 4512 } else if (UseSSE42Intrinsics) { 4513 // With SSE4.2, use double quad vector compare 4514 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4515 4516 // Compare 16-byte vectors 4517 andl(result, 0x0000000f); // tail count (in bytes) 4518 andl(limit, 0xfffffff0); // vector count (in bytes) 4519 jcc(Assembler::zero, COMPARE_TAIL); 4520 4521 lea(ary1, Address(ary1, limit, Address::times_1)); 4522 lea(ary2, Address(ary2, limit, Address::times_1)); 4523 negptr(limit); 4524 4525 bind(COMPARE_WIDE_VECTORS); 4526 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4527 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4528 pxor(vec1, vec2); 4529 4530 ptest(vec1, vec1); 4531 jcc(Assembler::notZero, FALSE_LABEL); 4532 addptr(limit, 16); 4533 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4534 4535 testl(result, result); 4536 jcc(Assembler::zero, TRUE_LABEL); 4537 4538 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4539 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4540 pxor(vec1, vec2); 4541 4542 ptest(vec1, vec1); 4543 jccb(Assembler::notZero, FALSE_LABEL); 4544 jmpb(TRUE_LABEL); 4545 4546 bind(COMPARE_TAIL); // limit is zero 4547 
movl(limit, result); 4548 // Fallthru to tail compare 4549 } 4550 4551 // Compare 4-byte vectors 4552 if (expand_ary2) { 4553 testl(result, result); 4554 jccb(Assembler::zero, TRUE_LABEL); 4555 } else { 4556 andl(limit, 0xfffffffc); // vector count (in bytes) 4557 jccb(Assembler::zero, COMPARE_CHAR); 4558 } 4559 4560 lea(ary1, Address(ary1, limit, scaleFactor)); 4561 lea(ary2, Address(ary2, limit, Address::times_1)); 4562 negptr(limit); 4563 4564 bind(COMPARE_VECTORS); 4565 if (expand_ary2) { 4566 // There are no "vector" operations for bytes to shorts 4567 movzbl(chr, Address(ary2, limit, Address::times_1)); 4568 cmpw(Address(ary1, limit, Address::times_2), chr); 4569 jccb(Assembler::notEqual, FALSE_LABEL); 4570 addptr(limit, 1); 4571 jcc(Assembler::notZero, COMPARE_VECTORS); 4572 jmp(TRUE_LABEL); 4573 } else { 4574 movl(chr, Address(ary1, limit, Address::times_1)); 4575 cmpl(chr, Address(ary2, limit, Address::times_1)); 4576 jccb(Assembler::notEqual, FALSE_LABEL); 4577 addptr(limit, 4); 4578 jcc(Assembler::notZero, COMPARE_VECTORS); 4579 } 4580 4581 // Compare trailing char (final 2 bytes), if any 4582 bind(COMPARE_CHAR); 4583 testl(result, 0x2); // tail char 4584 jccb(Assembler::zero, COMPARE_BYTE); 4585 load_unsigned_short(chr, Address(ary1, 0)); 4586 load_unsigned_short(limit, Address(ary2, 0)); 4587 cmpl(chr, limit); 4588 jccb(Assembler::notEqual, FALSE_LABEL); 4589 4590 if (is_array_equ && is_char) { 4591 bind(COMPARE_BYTE); 4592 } else { 4593 lea(ary1, Address(ary1, 2)); 4594 lea(ary2, Address(ary2, 2)); 4595 4596 bind(COMPARE_BYTE); 4597 testl(result, 0x1); // tail byte 4598 jccb(Assembler::zero, TRUE_LABEL); 4599 load_unsigned_byte(chr, Address(ary1, 0)); 4600 load_unsigned_byte(limit, Address(ary2, 0)); 4601 cmpl(chr, limit); 4602 jccb(Assembler::notEqual, FALSE_LABEL); 4603 } 4604 bind(TRUE_LABEL); 4605 movl(result, 1); // return true 4606 jmpb(DONE); 4607 4608 bind(FALSE_LABEL); 4609 xorl(result, result); // return false 4610 4611 // That's it 4612 bind(DONE); 4613 if (UseAVX >= 2) { 4614 // clean upper bits of YMM registers 4615 vpxor(vec1, vec1); 4616 vpxor(vec2, vec2); 4617 } 4618 } 4619 4620 #ifdef _LP64 4621 4622 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4623 #define __ masm. 
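// Note on this slow path: cvttss2si/cvttsd2si produce the x86 "integer
// indefinite" value (0x80000000, or 0x8000000000000000 for 64-bit results)
// when the input is NaN or out of range. convertF2I() below compares against
// that value and only then enters this stub, which spills the original
// floating-point source to the stack and calls the matching fixup routine
// (f2i/f2l/d2i/d2l); the Java-style saturated result (0 for NaN, MIN/MAX on
// overflow) is then popped back into dst.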
4624 Register dst = stub.data<0>(); 4625 XMMRegister src = stub.data<1>(); 4626 address target = stub.data<2>(); 4627 __ bind(stub.entry()); 4628 __ subptr(rsp, 8); 4629 __ movdbl(Address(rsp), src); 4630 __ call(RuntimeAddress(target)); 4631 __ pop(dst); 4632 __ jmp(stub.continuation()); 4633 #undef __ 4634 } 4635 4636 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4637 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4638 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4639 4640 address slowpath_target; 4641 if (dst_bt == T_INT) { 4642 if (src_bt == T_FLOAT) { 4643 cvttss2sil(dst, src); 4644 cmpl(dst, 0x80000000); 4645 slowpath_target = StubRoutines::x86::f2i_fixup(); 4646 } else { 4647 cvttsd2sil(dst, src); 4648 cmpl(dst, 0x80000000); 4649 slowpath_target = StubRoutines::x86::d2i_fixup(); 4650 } 4651 } else { 4652 if (src_bt == T_FLOAT) { 4653 cvttss2siq(dst, src); 4654 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4655 slowpath_target = StubRoutines::x86::f2l_fixup(); 4656 } else { 4657 cvttsd2siq(dst, src); 4658 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4659 slowpath_target = StubRoutines::x86::d2l_fixup(); 4660 } 4661 } 4662 4663 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4664 jcc(Assembler::equal, stub->entry()); 4665 bind(stub->continuation()); 4666 } 4667 4668 #endif // _LP64 4669 4670 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4671 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4672 switch(ideal_opc) { 4673 case Op_LShiftVS: 4674 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4675 case Op_LShiftVI: 4676 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4677 case Op_LShiftVL: 4678 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4679 case Op_RShiftVS: 4680 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4681 case Op_RShiftVI: 4682 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4683 case Op_RShiftVL: 4684 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4685 case Op_URShiftVS: 4686 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4687 case Op_URShiftVI: 4688 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4689 case Op_URShiftVL: 4690 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4691 case Op_RotateRightV: 4692 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4693 case Op_RotateLeftV: 4694 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4695 default: 4696 fatal("Unsupported masked operation"); break; 4697 } 4698 } 4699 4700 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4701 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4702 bool is_varshift) { 4703 switch (ideal_opc) { 4704 case Op_AddVB: 4705 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4706 case Op_AddVS: 4707 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4708 case Op_AddVI: 4709 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4710 case Op_AddVL: 4711 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_AddVF: 4713 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_AddVD: 4715 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4716 case Op_SubVB: 4717 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4718 case Op_SubVS: 4719 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_SubVI: 4721 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_SubVL: 4723 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4724 case Op_SubVF: 4725 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4726 case Op_SubVD: 4727 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4728 case Op_MulVS: 4729 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4730 case Op_MulVI: 4731 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4732 case Op_MulVL: 4733 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4734 case Op_MulVF: 4735 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4736 case Op_MulVD: 4737 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_DivVF: 4739 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_DivVD: 4741 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4742 case Op_SqrtVF: 4743 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_SqrtVD: 4745 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_AbsVB: 4747 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4748 case Op_AbsVS: 4749 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4750 case Op_AbsVI: 4751 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4752 case Op_AbsVL: 4753 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4754 case Op_FmaVF: 4755 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4756 case Op_FmaVD: 4757 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4758 case Op_VectorRearrange: 4759 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4760 case Op_LShiftVS: 4761 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4762 case Op_LShiftVI: 4763 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4764 case Op_LShiftVL: 4765 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4766 case Op_RShiftVS: 4767 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4768 case Op_RShiftVI: 4769 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4770 case Op_RShiftVL: 4771 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4772 case Op_URShiftVS: 4773 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4774 case Op_URShiftVI: 4775 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4776 case Op_URShiftVL: 4777 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4778 case Op_RotateLeftV: 4779 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4780 case Op_RotateRightV: 4781 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4782 case Op_MaxV: 4783 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4784 case Op_MinV: 4785 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4786 case Op_XorV: 4787 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4788 case Op_OrV: 4789 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4790 case Op_AndV: 4791 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4792 default: 4793 fatal("Unsupported masked operation"); break; 4794 } 4795 } 4796 4797 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4798 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4799 switch (ideal_opc) { 4800 case Op_AddVB: 
4801 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4802 case Op_AddVS: 4803 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4804 case Op_AddVI: 4805 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4806 case Op_AddVL: 4807 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4808 case Op_AddVF: 4809 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4810 case Op_AddVD: 4811 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4812 case Op_SubVB: 4813 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4814 case Op_SubVS: 4815 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4816 case Op_SubVI: 4817 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4818 case Op_SubVL: 4819 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4820 case Op_SubVF: 4821 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4822 case Op_SubVD: 4823 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4824 case Op_MulVS: 4825 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_MulVI: 4827 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_MulVL: 4829 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4830 case Op_MulVF: 4831 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4832 case Op_MulVD: 4833 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_DivVF: 4835 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4836 case Op_DivVD: 4837 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_FmaVF: 4839 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_FmaVD: 4841 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_MaxV: 4843 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_MinV: 4845 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_XorV: 4847 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_OrV: 4849 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_AndV: 4851 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4852 default: 4853 fatal("Unsupported masked operation"); break; 4854 } 4855 } 4856 4857 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4858 KRegister src1, KRegister src2) { 4859 BasicType etype = T_ILLEGAL; 4860 switch(mask_len) { 4861 case 2: 4862 case 4: 4863 case 8: etype = T_BYTE; break; 4864 case 16: etype = T_SHORT; break; 4865 case 32: etype = T_INT; break; 4866 case 64: etype = T_LONG; break; 4867 default: fatal("Unsupported type"); break; 4868 } 4869 assert(etype != T_ILLEGAL, ""); 4870 switch(ideal_opc) { 4871 case Op_AndVMask: 4872 kand(etype, dst, src1, src2); break; 4873 case Op_OrVMask: 4874 kor(etype, dst, src1, src2); break; 4875 case Op_XorVMask: 4876 kxor(etype, dst, src1, src2); break; 4877 default: 4878 fatal("Unsupported masked operation"); break; 4879 } 4880 } 4881 4882 /* 4883 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4884 * If src is NaN, the result is 0. 4885 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4886 * the result is equal to the value of Integer.MIN_VALUE. 4887 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4888 * the result is equal to the value of Integer.MAX_VALUE. 
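 * For example, per the Java cast semantics this implements:
 *   (int)Float.NaN               == 0
 *   (int)Float.NEGATIVE_INFINITY == Integer.MIN_VALUE
 *   (int)1.0e30f                 == Integer.MAX_VALUE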
4889 */ 4890 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4891 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4892 Register rscratch, AddressLiteral float_sign_flip, 4893 int vec_enc) { 4894 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4895 Label done; 4896 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4897 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4898 vptest(xtmp2, xtmp2, vec_enc); 4899 jccb(Assembler::equal, done); 4900 4901 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4902 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4903 4904 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4905 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4906 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4907 4908 // Recompute the mask for remaining special value. 4909 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4910 // Extract SRC values corresponding to TRUE mask lanes. 4911 vpand(xtmp4, xtmp2, src, vec_enc); 4912 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4913 // values are set. 4914 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4915 4916 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4917 bind(done); 4918 } 4919 4920 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4921 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4922 Register rscratch, AddressLiteral float_sign_flip, 4923 int vec_enc) { 4924 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4925 Label done; 4926 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4927 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4928 kortestwl(ktmp1, ktmp1); 4929 jccb(Assembler::equal, done); 4930 4931 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4932 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4933 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4934 4935 kxorwl(ktmp1, ktmp1, ktmp2); 4936 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4937 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4938 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4939 bind(done); 4940 } 4941 4942 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4943 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4944 Register rscratch, AddressLiteral double_sign_flip, 4945 int vec_enc) { 4946 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4947 4948 Label done; 4949 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4950 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4951 kortestwl(ktmp1, ktmp1); 4952 jccb(Assembler::equal, done); 4953 4954 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4955 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4956 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4957 4958 kxorwl(ktmp1, ktmp1, ktmp2); 4959 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4960 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4961 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4962 bind(done); 4963 } 4964 4965 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4966 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4967 Register rscratch, AddressLiteral float_sign_flip, 4968 int vec_enc) { 4969 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4970 Label done; 4971 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
rscratch); 4972 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4973 kortestwl(ktmp1, ktmp1); 4974 jccb(Assembler::equal, done); 4975 4976 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4977 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4978 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4979 4980 kxorwl(ktmp1, ktmp1, ktmp2); 4981 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4982 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4983 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4984 bind(done); 4985 } 4986 4987 /* 4988 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4989 * If src is NaN, the result is 0. 4990 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4991 * the result is equal to the value of Long.MIN_VALUE. 4992 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4993 * the result is equal to the value of Long.MAX_VALUE. 4994 */ 4995 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4996 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4997 Register rscratch, AddressLiteral double_sign_flip, 4998 int vec_enc) { 4999 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5000 5001 Label done; 5002 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5003 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5004 kortestwl(ktmp1, ktmp1); 5005 jccb(Assembler::equal, done); 5006 5007 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5008 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5009 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5010 5011 kxorwl(ktmp1, ktmp1, ktmp2); 5012 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5013 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5014 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5015 bind(done); 5016 } 5017 5018 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5019 XMMRegister xtmp, int index, int vec_enc) { 5020 assert(vec_enc < Assembler::AVX_512bit, ""); 5021 if (vec_enc == Assembler::AVX_256bit) { 5022 vextractf128_high(xtmp, src); 5023 vshufps(dst, src, xtmp, index, vec_enc); 5024 } else { 5025 vshufps(dst, src, zero, index, vec_enc); 5026 } 5027 } 5028 5029 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5030 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5031 AddressLiteral float_sign_flip, int src_vec_enc) { 5032 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5033 5034 Label done; 5035 // Compare the destination lanes with float_sign_flip 5036 // value to get mask for all special values. 5037 movdqu(xtmp1, float_sign_flip, rscratch); 5038 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5039 ptest(xtmp2, xtmp2); 5040 jccb(Assembler::equal, done); 5041 5042 // Flip float_sign_flip to get max integer value. 5043 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5044 pxor(xtmp1, xtmp4); 5045 5046 // Set detination lanes corresponding to unordered source lanes as zero. 5047 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5048 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5049 5050 // Shuffle mask vector and pack lower doubles word from each quadword lane. 
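  // The 0x88 immediate below is 0b10'00'10'00: vshufps selects doublewords
  // {0, 2} from each source, i.e. the low doubleword of every quadword lane,
  // which packs the 64-bit compare mask down to 32-bit lanes.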
5051 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5052 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5053 5054 // Recompute the mask for remaining special value. 5055 pxor(xtmp2, xtmp3); 5056 // Extract mask corresponding to non-negative source lanes. 5057 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5058 5059 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5060 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5061 pand(xtmp3, xtmp2); 5062 5063 // Replace destination lanes holding special value(0x80000000) with max int 5064 // if corresponding source lane holds a +ve value. 5065 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5066 bind(done); 5067 } 5068 5069 5070 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5071 XMMRegister xtmp, Register rscratch, int vec_enc) { 5072 switch(to_elem_bt) { 5073 case T_SHORT: 5074 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5075 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5076 vpackusdw(dst, dst, zero, vec_enc); 5077 if (vec_enc == Assembler::AVX_256bit) { 5078 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5079 } 5080 break; 5081 case T_BYTE: 5082 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5083 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5084 vpackusdw(dst, dst, zero, vec_enc); 5085 if (vec_enc == Assembler::AVX_256bit) { 5086 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5087 } 5088 vpackuswb(dst, dst, zero, vec_enc); 5089 break; 5090 default: assert(false, "%s", type2name(to_elem_bt)); 5091 } 5092 } 5093 5094 /* 5095 * Algorithm for vector D2L and F2I conversions:- 5096 * a) Perform vector D2L/F2I cast. 5097 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5098 * It signifies that source value could be any of the special floating point 5099 * values(NaN,-Inf,Inf,Max,-Min). 5100 * c) Set destination to zero if source is NaN value. 5101 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
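 * Scalar illustration of steps b-d for F2I (reference only, not emitted code):
 *   int r = cvttss2si(f);                    // 0x80000000 for any special value
 *   if (r == 0x80000000) {
 *     r = isNaN(f) ? 0 : (f > 0.0f ? Integer.MAX_VALUE : Integer.MIN_VALUE);
 *   }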
5102 */ 5103 5104 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5105 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5106 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5107 int to_elem_sz = type2aelembytes(to_elem_bt); 5108 assert(to_elem_sz <= 4, ""); 5109 vcvttps2dq(dst, src, vec_enc); 5110 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5111 if (to_elem_sz < 4) { 5112 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5113 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5114 } 5115 } 5116 5117 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5118 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5119 Register rscratch, int vec_enc) { 5120 int to_elem_sz = type2aelembytes(to_elem_bt); 5121 assert(to_elem_sz <= 4, ""); 5122 vcvttps2dq(dst, src, vec_enc); 5123 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5124 switch(to_elem_bt) { 5125 case T_INT: 5126 break; 5127 case T_SHORT: 5128 evpmovdw(dst, dst, vec_enc); 5129 break; 5130 case T_BYTE: 5131 evpmovdb(dst, dst, vec_enc); 5132 break; 5133 default: assert(false, "%s", type2name(to_elem_bt)); 5134 } 5135 } 5136 5137 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5138 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5139 Register rscratch, int vec_enc) { 5140 evcvttps2qq(dst, src, vec_enc); 5141 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5142 } 5143 5144 // Handling for downcasting from double to integer or sub-word types on AVX2. 5145 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5146 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5147 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5148 int to_elem_sz = type2aelembytes(to_elem_bt); 5149 assert(to_elem_sz < 8, ""); 5150 vcvttpd2dq(dst, src, vec_enc); 5151 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5152 float_sign_flip, vec_enc); 5153 if (to_elem_sz < 4) { 5154 // xtmp4 holds all zero lanes. 
5155 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5156 } 5157 } 5158 5159 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5160 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5161 KRegister ktmp2, AddressLiteral sign_flip, 5162 Register rscratch, int vec_enc) { 5163 if (VM_Version::supports_avx512dq()) { 5164 evcvttpd2qq(dst, src, vec_enc); 5165 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5166 switch(to_elem_bt) { 5167 case T_LONG: 5168 break; 5169 case T_INT: 5170 evpmovsqd(dst, dst, vec_enc); 5171 break; 5172 case T_SHORT: 5173 evpmovsqd(dst, dst, vec_enc); 5174 evpmovdw(dst, dst, vec_enc); 5175 break; 5176 case T_BYTE: 5177 evpmovsqd(dst, dst, vec_enc); 5178 evpmovdb(dst, dst, vec_enc); 5179 break; 5180 default: assert(false, "%s", type2name(to_elem_bt)); 5181 } 5182 } else { 5183 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5184 vcvttpd2dq(dst, src, vec_enc); 5185 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5186 switch(to_elem_bt) { 5187 case T_INT: 5188 break; 5189 case T_SHORT: 5190 evpmovdw(dst, dst, vec_enc); 5191 break; 5192 case T_BYTE: 5193 evpmovdb(dst, dst, vec_enc); 5194 break; 5195 default: assert(false, "%s", type2name(to_elem_bt)); 5196 } 5197 } 5198 } 5199 5200 #ifdef _LP64 5201 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5202 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5203 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5204 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5205 // and re-instantiate original MXCSR.RC mode after that. 5206 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5207 5208 mov64(tmp, julong_cast(0.5L)); 5209 evpbroadcastq(xtmp1, tmp, vec_enc); 5210 vaddpd(xtmp1, src , xtmp1, vec_enc); 5211 evcvtpd2qq(dst, xtmp1, vec_enc); 5212 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5213 double_sign_flip, vec_enc);; 5214 5215 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5216 } 5217 5218 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5219 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5220 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5221 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5222 // and re-instantiate original MXCSR.RC mode after that. 
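  // Rationale (descriptive comment): Math.round(x) == floor(x + 0.5). Loading
  // new_mxcsr switches MXCSR.RC to round-toward-negative-infinity, so the
  // vcvtps2dq below rounds the biased sum like a floor; the standard MXCSR is
  // restored at the end of this routine.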
5223 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5224 5225 movl(tmp, jint_cast(0.5)); 5226 movq(xtmp1, tmp); 5227 vbroadcastss(xtmp1, xtmp1, vec_enc); 5228 vaddps(xtmp1, src , xtmp1, vec_enc); 5229 vcvtps2dq(dst, xtmp1, vec_enc); 5230 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5231 float_sign_flip, vec_enc); 5232 5233 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5234 } 5235 5236 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5237 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5238 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5239 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5240 // and re-instantiate original MXCSR.RC mode after that. 5241 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5242 5243 movl(tmp, jint_cast(0.5)); 5244 movq(xtmp1, tmp); 5245 vbroadcastss(xtmp1, xtmp1, vec_enc); 5246 vaddps(xtmp1, src , xtmp1, vec_enc); 5247 vcvtps2dq(dst, xtmp1, vec_enc); 5248 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5249 5250 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5251 } 5252 #endif // _LP64 5253 5254 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5255 BasicType from_elem_bt, BasicType to_elem_bt) { 5256 switch (from_elem_bt) { 5257 case T_BYTE: 5258 switch (to_elem_bt) { 5259 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5260 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5261 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5262 default: ShouldNotReachHere(); 5263 } 5264 break; 5265 case T_SHORT: 5266 switch (to_elem_bt) { 5267 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5268 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5269 default: ShouldNotReachHere(); 5270 } 5271 break; 5272 case T_INT: 5273 assert(to_elem_bt == T_LONG, ""); 5274 vpmovzxdq(dst, src, vlen_enc); 5275 break; 5276 default: 5277 ShouldNotReachHere(); 5278 } 5279 } 5280 5281 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5282 BasicType from_elem_bt, BasicType to_elem_bt) { 5283 switch (from_elem_bt) { 5284 case T_BYTE: 5285 switch (to_elem_bt) { 5286 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5287 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5288 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5289 default: ShouldNotReachHere(); 5290 } 5291 break; 5292 case T_SHORT: 5293 switch (to_elem_bt) { 5294 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5295 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5296 default: ShouldNotReachHere(); 5297 } 5298 break; 5299 case T_INT: 5300 assert(to_elem_bt == T_LONG, ""); 5301 vpmovsxdq(dst, src, vlen_enc); 5302 break; 5303 default: 5304 ShouldNotReachHere(); 5305 } 5306 } 5307 5308 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5309 BasicType dst_bt, BasicType src_bt, int vlen) { 5310 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5311 assert(vlen_enc != AVX_512bit, ""); 5312 5313 int dst_bt_size = type2aelembytes(dst_bt); 5314 int src_bt_size = type2aelembytes(src_bt); 5315 if (dst_bt_size > src_bt_size) { 5316 switch (dst_bt_size / src_bt_size) { 5317 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5318 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5319 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5320 default: ShouldNotReachHere(); 5321 } 5322 } else { 5323 assert(dst_bt_size < src_bt_size, ""); 5324 switch (src_bt_size / dst_bt_size) { 5325 case 2: { 5326 if (vlen_enc == AVX_128bit) { 5327 vpacksswb(dst, src, src, vlen_enc); 5328 } else { 5329 vpacksswb(dst, src, src, vlen_enc); 5330 vpermq(dst, dst, 0x08, vlen_enc); 5331 } 5332 break; 5333 } 5334 case 4: { 5335 if (vlen_enc == AVX_128bit) { 5336 vpackssdw(dst, src, src, vlen_enc); 5337 vpacksswb(dst, dst, dst, vlen_enc); 5338 } else { 5339 vpackssdw(dst, src, src, vlen_enc); 5340 vpermq(dst, dst, 0x08, vlen_enc); 5341 vpacksswb(dst, dst, dst, AVX_128bit); 5342 } 5343 break; 5344 } 5345 case 8: { 5346 if (vlen_enc == AVX_128bit) { 5347 vpshufd(dst, src, 0x08, vlen_enc); 5348 vpackssdw(dst, dst, dst, vlen_enc); 5349 vpacksswb(dst, dst, dst, vlen_enc); 5350 } else { 5351 vpshufd(dst, src, 0x08, vlen_enc); 5352 vpermq(dst, dst, 0x08, vlen_enc); 5353 vpackssdw(dst, dst, dst, AVX_128bit); 5354 vpacksswb(dst, dst, dst, AVX_128bit); 5355 } 5356 break; 5357 } 5358 default: ShouldNotReachHere(); 5359 } 5360 } 5361 } 5362 5363 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5364 bool merge, BasicType bt, int vlen_enc) { 5365 if (bt == T_INT) { 5366 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5367 } else { 5368 assert(bt == T_LONG, ""); 5369 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5370 } 5371 } 5372 5373 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5374 bool merge, BasicType bt, int vlen_enc) { 5375 if (bt == T_INT) { 5376 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5377 } else { 5378 assert(bt == T_LONG, ""); 5379 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5380 } 5381 } 5382 5383 #ifdef _LP64 5384 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5385 Register rtmp2, XMMRegister xtmp, int mask_len, 5386 int vec_enc) { 5387 int index = 0; 5388 int vindex = 0; 5389 mov64(rtmp1, 0x0101010101010101L); 5390 pdepq(rtmp1, src, rtmp1); 5391 if (mask_len > 8) { 5392 movq(rtmp2, src); 5393 vpxor(xtmp, xtmp, xtmp, vec_enc); 5394 movq(xtmp, rtmp1); 5395 } 5396 movq(dst, rtmp1); 5397 5398 mask_len -= 8; 5399 while (mask_len > 0) { 5400 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5401 index++; 5402 if ((index % 2) == 0) { 5403 pxor(xtmp, xtmp); 5404 } 5405 mov64(rtmp1, 0x0101010101010101L); 5406 shrq(rtmp2, 8); 5407 pdepq(rtmp1, rtmp2, rtmp1); 5408 pinsrq(xtmp, rtmp1, index % 2); 5409 vindex = index / 2; 5410 if (vindex) { 5411 // Write entire 16 byte vector when both 64 bit 5412 // lanes are update to save redundant instructions. 
5413 if (index % 2) { 5414 vinsertf128(dst, dst, xtmp, vindex); 5415 } 5416 } else { 5417 vmovdqu(dst, xtmp); 5418 } 5419 mask_len -= 8; 5420 } 5421 } 5422 5423 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5424 switch(opc) { 5425 case Op_VectorMaskTrueCount: 5426 popcntq(dst, tmp); 5427 break; 5428 case Op_VectorMaskLastTrue: 5429 if (VM_Version::supports_lzcnt()) { 5430 lzcntq(tmp, tmp); 5431 movl(dst, 63); 5432 subl(dst, tmp); 5433 } else { 5434 movl(dst, -1); 5435 bsrq(tmp, tmp); 5436 cmov32(Assembler::notZero, dst, tmp); 5437 } 5438 break; 5439 case Op_VectorMaskFirstTrue: 5440 if (VM_Version::supports_bmi1()) { 5441 if (masklen < 32) { 5442 orl(tmp, 1 << masklen); 5443 tzcntl(dst, tmp); 5444 } else if (masklen == 32) { 5445 tzcntl(dst, tmp); 5446 } else { 5447 assert(masklen == 64, ""); 5448 tzcntq(dst, tmp); 5449 } 5450 } else { 5451 if (masklen < 32) { 5452 orl(tmp, 1 << masklen); 5453 bsfl(dst, tmp); 5454 } else { 5455 assert(masklen == 32 || masklen == 64, ""); 5456 movl(dst, masklen); 5457 if (masklen == 32) { 5458 bsfl(tmp, tmp); 5459 } else { 5460 bsfq(tmp, tmp); 5461 } 5462 cmov32(Assembler::notZero, dst, tmp); 5463 } 5464 } 5465 break; 5466 case Op_VectorMaskToLong: 5467 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5468 break; 5469 default: assert(false, "Unhandled mask operation"); 5470 } 5471 } 5472 5473 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5474 int masklen, int masksize, int vec_enc) { 5475 assert(VM_Version::supports_popcnt(), ""); 5476 5477 if(VM_Version::supports_avx512bw()) { 5478 kmovql(tmp, mask); 5479 } else { 5480 assert(masklen <= 16, ""); 5481 kmovwl(tmp, mask); 5482 } 5483 5484 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5485 // operations needs to be clipped. 5486 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5487 andq(tmp, (1 << masklen) - 1); 5488 } 5489 5490 vector_mask_operation_helper(opc, dst, tmp, masklen); 5491 } 5492 5493 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5494 Register tmp, int masklen, BasicType bt, int vec_enc) { 5495 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5496 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5497 assert(VM_Version::supports_popcnt(), ""); 5498 5499 bool need_clip = false; 5500 switch(bt) { 5501 case T_BOOLEAN: 5502 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5503 vpxor(xtmp, xtmp, xtmp, vec_enc); 5504 vpsubb(xtmp, xtmp, mask, vec_enc); 5505 vpmovmskb(tmp, xtmp, vec_enc); 5506 need_clip = masklen < 16; 5507 break; 5508 case T_BYTE: 5509 vpmovmskb(tmp, mask, vec_enc); 5510 need_clip = masklen < 16; 5511 break; 5512 case T_SHORT: 5513 vpacksswb(xtmp, mask, mask, vec_enc); 5514 if (masklen >= 16) { 5515 vpermpd(xtmp, xtmp, 8, vec_enc); 5516 } 5517 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5518 need_clip = masklen < 16; 5519 break; 5520 case T_INT: 5521 case T_FLOAT: 5522 vmovmskps(tmp, mask, vec_enc); 5523 need_clip = masklen < 4; 5524 break; 5525 case T_LONG: 5526 case T_DOUBLE: 5527 vmovmskpd(tmp, mask, vec_enc); 5528 need_clip = masklen < 2; 5529 break; 5530 default: assert(false, "Unhandled type, %s", type2name(bt)); 5531 } 5532 5533 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5534 // operations needs to be clipped. 
5535 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5536 // need_clip implies masklen < 32 5537 andq(tmp, (1 << masklen) - 1); 5538 } 5539 5540 vector_mask_operation_helper(opc, dst, tmp, masklen); 5541 } 5542 5543 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5544 Register rtmp2, int mask_len) { 5545 kmov(rtmp1, src); 5546 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5547 mov64(rtmp2, -1L); 5548 pextq(rtmp2, rtmp2, rtmp1); 5549 kmov(dst, rtmp2); 5550 } 5551 5552 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5553 XMMRegister mask, Register rtmp, Register rscratch, 5554 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5555 int vec_enc) { 5556 assert(type2aelembytes(bt) >= 4, ""); 5557 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5558 address compress_perm_table = nullptr; 5559 address expand_perm_table = nullptr; 5560 if (type2aelembytes(bt) == 8) { 5561 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5562 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5563 vmovmskpd(rtmp, mask, vec_enc); 5564 } else { 5565 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5566 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5567 vmovmskps(rtmp, mask, vec_enc); 5568 } 5569 shlq(rtmp, 5); // for 32 byte permute row. 5570 if (opcode == Op_CompressV) { 5571 lea(rscratch, ExternalAddress(compress_perm_table)); 5572 } else { 5573 lea(rscratch, ExternalAddress(expand_perm_table)); 5574 } 5575 addptr(rtmp, rscratch); 5576 vmovdqu(permv, Address(rtmp)); 5577 vpermps(dst, permv, src, Assembler::AVX_256bit); 5578 vpxor(xtmp, xtmp, xtmp, vec_enc); 5579 // Blend the result with zero vector using permute mask, each column entry 5580 // in a permute table row contains either a valid permute index or a -1 (default) 5581 // value, this can potentially be used as a blending mask after 5582 // compressing/expanding the source vector lanes. 
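  // Illustration (assumed table layout, matching the description above): for an
  // 8-lane compress with mask 0b00000101 the permute row would be
  // {0, 2, -1, -1, -1, -1, -1, -1}; the -1 entries have their sign bit set, so
  // the vblendvps below selects the zero lane from xtmp for exactly those positions.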
5583 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5584 } 5585 5586 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5587 bool merge, BasicType bt, int vec_enc) { 5588 if (opcode == Op_CompressV) { 5589 switch(bt) { 5590 case T_BYTE: 5591 evpcompressb(dst, mask, src, merge, vec_enc); 5592 break; 5593 case T_CHAR: 5594 case T_SHORT: 5595 evpcompressw(dst, mask, src, merge, vec_enc); 5596 break; 5597 case T_INT: 5598 evpcompressd(dst, mask, src, merge, vec_enc); 5599 break; 5600 case T_FLOAT: 5601 evcompressps(dst, mask, src, merge, vec_enc); 5602 break; 5603 case T_LONG: 5604 evpcompressq(dst, mask, src, merge, vec_enc); 5605 break; 5606 case T_DOUBLE: 5607 evcompresspd(dst, mask, src, merge, vec_enc); 5608 break; 5609 default: 5610 fatal("Unsupported type %s", type2name(bt)); 5611 break; 5612 } 5613 } else { 5614 assert(opcode == Op_ExpandV, ""); 5615 switch(bt) { 5616 case T_BYTE: 5617 evpexpandb(dst, mask, src, merge, vec_enc); 5618 break; 5619 case T_CHAR: 5620 case T_SHORT: 5621 evpexpandw(dst, mask, src, merge, vec_enc); 5622 break; 5623 case T_INT: 5624 evpexpandd(dst, mask, src, merge, vec_enc); 5625 break; 5626 case T_FLOAT: 5627 evexpandps(dst, mask, src, merge, vec_enc); 5628 break; 5629 case T_LONG: 5630 evpexpandq(dst, mask, src, merge, vec_enc); 5631 break; 5632 case T_DOUBLE: 5633 evexpandpd(dst, mask, src, merge, vec_enc); 5634 break; 5635 default: 5636 fatal("Unsupported type %s", type2name(bt)); 5637 break; 5638 } 5639 } 5640 } 5641 #endif 5642 5643 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5644 KRegister ktmp1, int vec_enc) { 5645 if (opcode == Op_SignumVD) { 5646 vsubpd(dst, zero, one, vec_enc); 5647 // if src < 0 ? -1 : 1 5648 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5649 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5650 // if src == NaN, -0.0 or 0.0 return src. 5651 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5652 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5653 } else { 5654 assert(opcode == Op_SignumVF, ""); 5655 vsubps(dst, zero, one, vec_enc); 5656 // if src < 0 ? -1 : 1 5657 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5658 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5659 // if src == NaN, -0.0 or 0.0 return src. 5660 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5661 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5662 } 5663 } 5664 5665 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5666 XMMRegister xtmp1, int vec_enc) { 5667 if (opcode == Op_SignumVD) { 5668 vsubpd(dst, zero, one, vec_enc); 5669 // if src < 0 ? -1 : 1 5670 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5671 // if src == NaN, -0.0 or 0.0 return src. 5672 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5673 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5674 } else { 5675 assert(opcode == Op_SignumVF, ""); 5676 vsubps(dst, zero, one, vec_enc); 5677 // if src < 0 ? -1 : 1 5678 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5679 // if src == NaN, -0.0 or 0.0 return src. 
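    // EQ_UQ is "equal, unordered, quiet": it is true both for NaN lanes
    // (unordered compare) and for +/-0.0 lanes, so the blend below keeps the
    // original src value in exactly those lanes.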
5680 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5681 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5682 } 5683 } 5684 5685 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5686 if (VM_Version::supports_avx512bw()) { 5687 if (mask_len > 32) { 5688 kmovql(dst, src); 5689 } else { 5690 kmovdl(dst, src); 5691 if (mask_len != 32) { 5692 kshiftrdl(dst, dst, 32 - mask_len); 5693 } 5694 } 5695 } else { 5696 assert(mask_len <= 16, ""); 5697 kmovwl(dst, src); 5698 if (mask_len != 16) { 5699 kshiftrwl(dst, dst, 16 - mask_len); 5700 } 5701 } 5702 } 5703 5704 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5705 int lane_size = type2aelembytes(bt); 5706 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5707 if ((is_LP64 || lane_size < 8) && 5708 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5709 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5710 movptr(rtmp, imm32); 5711 switch(lane_size) { 5712 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5713 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5714 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5715 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5716 fatal("Unsupported lane size %d", lane_size); 5717 break; 5718 } 5719 } else { 5720 movptr(rtmp, imm32); 5721 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5722 switch(lane_size) { 5723 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5724 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5725 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5726 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5727 fatal("Unsupported lane size %d", lane_size); 5728 break; 5729 } 5730 } 5731 } 5732 5733 // 5734 // Following is lookup table based popcount computation algorithm:- 5735 // Index Bit set count 5736 // [ 0000 -> 0, 5737 // 0001 -> 1, 5738 // 0010 -> 1, 5739 // 0011 -> 2, 5740 // 0100 -> 1, 5741 // 0101 -> 2, 5742 // 0110 -> 2, 5743 // 0111 -> 3, 5744 // 1000 -> 1, 5745 // 1001 -> 2, 5746 // 1010 -> 3, 5747 // 1011 -> 3, 5748 // 1100 -> 2, 5749 // 1101 -> 3, 5750 // 1111 -> 4 ] 5751 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5752 // shuffle indices for lookup table access. 5753 // b. Right shift each byte of vector lane by 4 positions. 5754 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5755 // shuffle indices for lookup table access. 5756 // d. Add the bitset count of upper and lower 4 bits of each byte. 5757 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5758 // count of all the bytes of a quadword. 5759 // f. Perform step e. for upper 128bit vector lane. 5760 // g. Pack the bitset count of quadwords back to double word. 5761 // h. Unpacking and packing operations are not needed for 64bit vector lane. 
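// Scalar illustration of steps a-d for one byte (reference only, not emitted code):
//   static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//   uint8_t popcount8 = lut[b & 0x0F] + lut[(b >> 4) & 0x0F];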
5762 5763 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5764 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5765 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5766 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5767 vpsrlw(dst, src, 4, vec_enc); 5768 vpand(dst, dst, xtmp1, vec_enc); 5769 vpand(xtmp1, src, xtmp1, vec_enc); 5770 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5771 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5772 vpshufb(dst, xtmp2, dst, vec_enc); 5773 vpaddb(dst, dst, xtmp1, vec_enc); 5774 } 5775 5776 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5777 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5778 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5779 // Following code is as per steps e,f,g and h of above algorithm. 5780 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5781 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5782 vpsadbw(dst, dst, xtmp2, vec_enc); 5783 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5784 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5785 vpackuswb(dst, xtmp1, dst, vec_enc); 5786 } 5787 5788 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5789 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5790 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5791 // Add the popcount of upper and lower bytes of word. 5792 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5793 vpsrlw(dst, xtmp1, 8, vec_enc); 5794 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5795 vpaddw(dst, dst, xtmp1, vec_enc); 5796 } 5797 5798 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5799 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5800 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5801 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5802 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5803 } 5804 5805 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5806 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5807 switch(bt) { 5808 case T_LONG: 5809 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5810 break; 5811 case T_INT: 5812 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5813 break; 5814 case T_CHAR: 5815 case T_SHORT: 5816 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5817 break; 5818 case T_BYTE: 5819 case T_BOOLEAN: 5820 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5821 break; 5822 default: 5823 fatal("Unsupported type %s", type2name(bt)); 5824 break; 5825 } 5826 } 5827 5828 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5829 KRegister mask, bool merge, int vec_enc) { 5830 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5831 switch(bt) { 5832 case T_LONG: 5833 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5834 evpopcntq(dst, mask, src, merge, vec_enc); 5835 break; 5836 case T_INT: 5837 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5838 evpopcntd(dst, mask, src, merge, vec_enc); 5839 break; 5840 case T_CHAR: 5841 case T_SHORT: 5842 assert(VM_Version::supports_avx512_bitalg(), ""); 5843 evpopcntw(dst, mask, src, merge, vec_enc); 5844 break; 5845 case T_BYTE: 5846 case T_BOOLEAN: 5847 assert(VM_Version::supports_avx512_bitalg(), ""); 5848 evpopcntb(dst, mask, 
src, merge, vec_enc); 5849 break; 5850 default: 5851 fatal("Unsupported type %s", type2name(bt)); 5852 break; 5853 } 5854 } 5855 5856 #ifndef _LP64 5857 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5858 assert(VM_Version::supports_avx512bw(), ""); 5859 kmovdl(tmp, src); 5860 kunpckdql(dst, tmp, tmp); 5861 } 5862 #endif 5863 5864 // Bit reversal algorithm first reverses the bits of each byte followed by 5865 // a byte level reversal for multi-byte primitive types (short/int/long). 5866 // Algorithm performs a lookup table access to get reverse bit sequence 5867 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5868 // is obtained by swapping the reverse bit sequences of upper and lower 5869 // nibble of a byte. 5870 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5871 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5872 if (VM_Version::supports_avx512vlbw()) { 5873 5874 // Get the reverse bit sequence of lower nibble of each byte. 5875 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5876 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5877 evpandq(dst, xtmp2, src, vec_enc); 5878 vpshufb(dst, xtmp1, dst, vec_enc); 5879 vpsllq(dst, dst, 4, vec_enc); 5880 5881 // Get the reverse bit sequence of upper nibble of each byte. 5882 vpandn(xtmp2, xtmp2, src, vec_enc); 5883 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5884 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5885 5886 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5887 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5888 evporq(xtmp2, dst, xtmp2, vec_enc); 5889 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5890 5891 } else if(vec_enc == Assembler::AVX_512bit) { 5892 // Shift based bit reversal. 5893 assert(bt == T_LONG || bt == T_INT, ""); 5894 5895 // Swap lower and upper nibble of each byte. 5896 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5897 5898 // Swap two least and most significant bits of each nibble. 5899 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5900 5901 // Swap adjacent pair of bits. 5902 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5903 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5904 5905 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5906 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5907 } else { 5908 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5909 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5910 5911 // Get the reverse bit sequence of lower nibble of each byte. 5912 vpand(dst, xtmp2, src, vec_enc); 5913 vpshufb(dst, xtmp1, dst, vec_enc); 5914 vpsllq(dst, dst, 4, vec_enc); 5915 5916 // Get the reverse bit sequence of upper nibble of each byte. 5917 vpandn(xtmp2, xtmp2, src, vec_enc); 5918 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5919 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5920 5921 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5922 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
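    // Illustration for one byte with bits ABCDEFGH (A = MSB): the lookup of the
    // low nibble gives 0000HGFE, which shifted left by 4 becomes HGFE0000; the
    // lookup of the high nibble gives 0000DCBA; OR-ing the two yields HGFEDCBA,
    // the fully bit-reversed byte.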
5923 vpor(xtmp2, dst, xtmp2, vec_enc); 5924 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5925 } 5926 } 5927 5928 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5929 XMMRegister xtmp, Register rscratch) { 5930 assert(VM_Version::supports_gfni(), ""); 5931 assert(rscratch != noreg || always_reachable(mask), "missing"); 5932 5933 // Galois field instruction based bit reversal based on following algorithm. 5934 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5935 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5936 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5937 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5938 } 5939 5940 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5941 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5942 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5943 evpandq(dst, xtmp1, src, vec_enc); 5944 vpsllq(dst, dst, nbits, vec_enc); 5945 vpandn(xtmp1, xtmp1, src, vec_enc); 5946 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5947 evporq(dst, dst, xtmp1, vec_enc); 5948 } 5949 5950 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5951 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5952 // Shift based bit reversal. 5953 assert(VM_Version::supports_evex(), ""); 5954 switch(bt) { 5955 case T_LONG: 5956 // Swap upper and lower double word of each quad word. 5957 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5958 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5959 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5960 break; 5961 case T_INT: 5962 // Swap upper and lower word of each double word. 5963 evprord(xtmp1, k0, src, 16, true, vec_enc); 5964 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5965 break; 5966 case T_CHAR: 5967 case T_SHORT: 5968 // Swap upper and lower byte of each word. 5969 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5970 break; 5971 case T_BYTE: 5972 evmovdquq(dst, k0, src, true, vec_enc); 5973 break; 5974 default: 5975 fatal("Unsupported type %s", type2name(bt)); 5976 break; 5977 } 5978 } 5979 5980 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5981 if (bt == T_BYTE) { 5982 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5983 evmovdquq(dst, k0, src, true, vec_enc); 5984 } else { 5985 vmovdqu(dst, src); 5986 } 5987 return; 5988 } 5989 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5990 // pre-computed shuffle indices. 
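  // For instance, the T_INT permutation reverses the bytes within every 4-byte
  // lane, conceptually {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} per 128-bit
  // lane (shown for illustration only; the exact constants live in the
  // StubRoutines mask tables).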
5991 switch(bt) { 5992 case T_LONG: 5993 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5994 break; 5995 case T_INT: 5996 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5997 break; 5998 case T_CHAR: 5999 case T_SHORT: 6000 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6001 break; 6002 default: 6003 fatal("Unsupported type %s", type2name(bt)); 6004 break; 6005 } 6006 vpshufb(dst, src, dst, vec_enc); 6007 } 6008 6009 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6010 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6011 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6012 assert(is_integral_type(bt), ""); 6013 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6014 assert(VM_Version::supports_avx512cd(), ""); 6015 switch(bt) { 6016 case T_LONG: 6017 evplzcntq(dst, ktmp, src, merge, vec_enc); 6018 break; 6019 case T_INT: 6020 evplzcntd(dst, ktmp, src, merge, vec_enc); 6021 break; 6022 case T_SHORT: 6023 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6024 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6025 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6026 vpunpckhwd(dst, xtmp1, src, vec_enc); 6027 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6028 vpackusdw(dst, xtmp2, dst, vec_enc); 6029 break; 6030 case T_BYTE: 6031 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6032 // accessing the lookup table. 6033 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6034 // accessing the lookup table. 6035 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6036 assert(VM_Version::supports_avx512bw(), ""); 6037 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6038 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6039 vpand(xtmp2, dst, src, vec_enc); 6040 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6041 vpsrlw(xtmp3, src, 4, vec_enc); 6042 vpand(xtmp3, dst, xtmp3, vec_enc); 6043 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6044 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6045 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6046 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6047 break; 6048 default: 6049 fatal("Unsupported type %s", type2name(bt)); 6050 break; 6051 } 6052 } 6053 6054 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6055 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6056 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6057 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6058 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6059 // accessing the lookup table. 6060 vpand(dst, xtmp2, src, vec_enc); 6061 vpshufb(dst, xtmp1, dst, vec_enc); 6062 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6063 // accessing the lookup table. 6064 vpsrlw(xtmp3, src, 4, vec_enc); 6065 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6066 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6067 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
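  // Scalar illustration (reference only): with clz4[] giving the leading-zero
  // count of a nibble (so clz4[0] == 4),
  //   lzcnt8(b) = clz4[b >> 4] + ((b >> 4) == 0 ? clz4[b & 0x0F] : 0);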
6068 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6069 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6070 vpaddb(dst, dst, xtmp2, vec_enc); 6071 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6072 } 6073 6074 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6075 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6076 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6077 // Add zero counts of lower byte and upper byte of a word if 6078 // upper byte holds a zero value. 6079 vpsrlw(xtmp3, src, 8, vec_enc); 6080 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6081 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6082 vpsllw(xtmp2, dst, 8, vec_enc); 6083 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6084 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6085 vpsrlw(dst, dst, 8, vec_enc); 6086 } 6087 6088 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6089 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6090 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6091 // hence biased exponent can be used to compute leading zero count as per 6092 // following formula:- 6093 // LZCNT = 32 - (biased_exp - 127) 6094 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6095 6096 // Broadcast 0xFF 6097 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6098 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6099 6100 // Extract biased exponent. 6101 vcvtdq2ps(dst, src, vec_enc); 6102 vpsrld(dst, dst, 23, vec_enc); 6103 vpand(dst, dst, xtmp1, vec_enc); 6104 6105 // Broadcast 127. 6106 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6107 // Exponent = biased_exp - 127 6108 vpsubd(dst, dst, xtmp1, vec_enc); 6109 6110 // Exponent = Exponent + 1 6111 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6112 vpaddd(dst, dst, xtmp3, vec_enc); 6113 6114 // Replace -ve exponent with zero, exponent is -ve when src 6115 // lane contains a zero value. 6116 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6117 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6118 6119 // Rematerialize broadcast 32. 6120 vpslld(xtmp1, xtmp3, 5, vec_enc); 6121 // Exponent is 32 if corresponding source lane contains max_int value. 6122 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6123 // LZCNT = 32 - exponent 6124 vpsubd(dst, xtmp1, dst, vec_enc); 6125 6126 // Replace LZCNT with a value 1 if corresponding source lane 6127 // contains max_int value. 6128 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6129 6130 // Replace biased_exp with 0 if source lane value is less than zero. 6131 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6132 vblendvps(dst, dst, xtmp2, src, vec_enc); 6133 } 6134 6135 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6136 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6137 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6138 // Add zero counts of lower word and upper word of a double word if 6139 // upper word holds a zero value. 6140 vpsrld(xtmp3, src, 16, vec_enc); 6141 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6142 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6143 vpslld(xtmp2, dst, 16, vec_enc); 6144 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6145 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6146 vpsrld(dst, dst, 16, vec_enc); 6147 // Add zero counts of lower doubleword and upper doubleword of a 6148 // quadword if upper doubleword holds a zero value. 
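  // i.e. clz64(x) = (hi32(x) == 0) ? 32 + clz32(lo32(x)) : clz32(hi32(x)),
  // computed branch-free with a compare-and-blend, mirroring the word/byte
  // combining steps above.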
6149 vpsrlq(xtmp3, src, 32, vec_enc); 6150 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6151 vpsllq(xtmp2, dst, 32, vec_enc); 6152 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6153 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6154 vpsrlq(dst, dst, 32, vec_enc); 6155 } 6156 6157 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6158 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6159 Register rtmp, int vec_enc) { 6160 assert(is_integral_type(bt), "unexpected type"); 6161 assert(vec_enc < Assembler::AVX_512bit, ""); 6162 switch(bt) { 6163 case T_LONG: 6164 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6165 break; 6166 case T_INT: 6167 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6168 break; 6169 case T_SHORT: 6170 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6171 break; 6172 case T_BYTE: 6173 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6174 break; 6175 default: 6176 fatal("Unsupported type %s", type2name(bt)); 6177 break; 6178 } 6179 } 6180 6181 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6182 switch(bt) { 6183 case T_BYTE: 6184 vpsubb(dst, src1, src2, vec_enc); 6185 break; 6186 case T_SHORT: 6187 vpsubw(dst, src1, src2, vec_enc); 6188 break; 6189 case T_INT: 6190 vpsubd(dst, src1, src2, vec_enc); 6191 break; 6192 case T_LONG: 6193 vpsubq(dst, src1, src2, vec_enc); 6194 break; 6195 default: 6196 fatal("Unsupported type %s", type2name(bt)); 6197 break; 6198 } 6199 } 6200 6201 // Trailing zero count computation is based on leading zero count operation as per 6202 // following equation. All AVX3 targets support AVX512CD feature which offers 6203 // direct vector instruction to compute leading zero count. 
6204 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6205 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6206 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6207 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6208 assert(is_integral_type(bt), "");
6209 // xtmp = -1
6210 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6211 // xtmp = xtmp + src
6212 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6213 // xtmp = xtmp & ~src
6214 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6215 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6216 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6217 vpsub(bt, dst, xtmp4, dst, vec_enc);
6218 }
6219
6220 // Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
6221 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6222 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6223 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6224 assert(is_integral_type(bt), "");
6225 // xtmp = 0
6226 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6227 // xtmp = 0 - src
6228 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6229 // xtmp = xtmp | src
6230 vpor(xtmp3, xtmp3, src, vec_enc);
6231 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6232 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6233 vpsub(bt, dst, xtmp1, dst, vec_enc);
6234 }
6235
6236 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6237 Label done;
6238 Label neg_divisor_fastpath;
6239 cmpl(divisor, 0);
6240 jccb(Assembler::less, neg_divisor_fastpath);
6241 xorl(rdx, rdx);
6242 divl(divisor);
6243 jmpb(done);
6244 bind(neg_divisor_fastpath);
6245 // Fastpath for divisor < 0:
6246 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6247 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6248 movl(rdx, rax);
6249 subl(rdx, divisor);
6250 if (VM_Version::supports_bmi1()) {
6251 andnl(rax, rdx, rax);
6252 } else {
6253 notl(rdx);
6254 andl(rax, rdx);
6255 }
6256 shrl(rax, 31);
6257 bind(done);
6258 }
6259
6260 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6261 Label done;
6262 Label neg_divisor_fastpath;
6263 cmpl(divisor, 0);
6264 jccb(Assembler::less, neg_divisor_fastpath);
6265 xorl(rdx, rdx);
6266 divl(divisor);
6267 jmpb(done);
6268 bind(neg_divisor_fastpath);
6269 // Fastpath when divisor < 0:
6270 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6271 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6272 movl(rdx, rax);
6273 subl(rax, divisor);
6274 if (VM_Version::supports_bmi1()) {
6275 andnl(rax, rax, rdx);
6276 } else {
6277 notl(rax);
6278 andl(rax, rdx);
6279 }
6280 sarl(rax, 31);
6281 andl(rax, divisor);
6282 subl(rdx, rax);
6283 bind(done);
6284 }
6285
6286 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6287 Label done;
6288 Label neg_divisor_fastpath;
6289
6290 cmpl(divisor, 0);
6291 jccb(Assembler::less, neg_divisor_fastpath);
6292 xorl(rdx, rdx);
6293 divl(divisor);
6294 jmpb(done);
6295 bind(neg_divisor_fastpath);
6296 // Fastpath for divisor < 0:
6297 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6298 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6299 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6300 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6301 movl(rdx, rax); 6302 subl(rax, divisor); 6303 if (VM_Version::supports_bmi1()) { 6304 andnl(rax, rax, rdx); 6305 } else { 6306 notl(rax); 6307 andl(rax, rdx); 6308 } 6309 movl(tmp, rax); 6310 shrl(rax, 31); // quotient 6311 sarl(tmp, 31); 6312 andl(tmp, divisor); 6313 subl(rdx, tmp); // remainder 6314 bind(done); 6315 } 6316 6317 #ifdef _LP64 6318 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6319 XMMRegister xtmp2, Register rtmp) { 6320 if(VM_Version::supports_gfni()) { 6321 // Galois field instruction based bit reversal based on following algorithm. 6322 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6323 mov64(rtmp, 0x8040201008040201L); 6324 movq(xtmp1, src); 6325 movq(xtmp2, rtmp); 6326 gf2p8affineqb(xtmp1, xtmp2, 0); 6327 movq(dst, xtmp1); 6328 } else { 6329 // Swap even and odd numbered bits. 6330 movl(rtmp, src); 6331 andl(rtmp, 0x55555555); 6332 shll(rtmp, 1); 6333 movl(dst, src); 6334 andl(dst, 0xAAAAAAAA); 6335 shrl(dst, 1); 6336 orl(dst, rtmp); 6337 6338 // Swap LSB and MSB 2 bits of each nibble. 6339 movl(rtmp, dst); 6340 andl(rtmp, 0x33333333); 6341 shll(rtmp, 2); 6342 andl(dst, 0xCCCCCCCC); 6343 shrl(dst, 2); 6344 orl(dst, rtmp); 6345 6346 // Swap LSB and MSB 4 bits of each byte. 6347 movl(rtmp, dst); 6348 andl(rtmp, 0x0F0F0F0F); 6349 shll(rtmp, 4); 6350 andl(dst, 0xF0F0F0F0); 6351 shrl(dst, 4); 6352 orl(dst, rtmp); 6353 } 6354 bswapl(dst); 6355 } 6356 6357 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6358 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6359 if(VM_Version::supports_gfni()) { 6360 // Galois field instruction based bit reversal based on following algorithm. 6361 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6362 mov64(rtmp1, 0x8040201008040201L); 6363 movq(xtmp1, src); 6364 movq(xtmp2, rtmp1); 6365 gf2p8affineqb(xtmp1, xtmp2, 0); 6366 movq(dst, xtmp1); 6367 } else { 6368 // Swap even and odd numbered bits. 6369 movq(rtmp1, src); 6370 mov64(rtmp2, 0x5555555555555555L); 6371 andq(rtmp1, rtmp2); 6372 shlq(rtmp1, 1); 6373 movq(dst, src); 6374 notq(rtmp2); 6375 andq(dst, rtmp2); 6376 shrq(dst, 1); 6377 orq(dst, rtmp1); 6378 6379 // Swap LSB and MSB 2 bits of each nibble. 6380 movq(rtmp1, dst); 6381 mov64(rtmp2, 0x3333333333333333L); 6382 andq(rtmp1, rtmp2); 6383 shlq(rtmp1, 2); 6384 notq(rtmp2); 6385 andq(dst, rtmp2); 6386 shrq(dst, 2); 6387 orq(dst, rtmp1); 6388 6389 // Swap LSB and MSB 4 bits of each byte. 
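// (After this final swap each byte of dst is bit-reversed in place; the bswapq
// below then reverses the byte order, completing the full 64-bit bit reversal.
// For example, an input of 0x1 ends up as 0x8000000000000000.)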
6390 movq(rtmp1, dst);
6391 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6392 andq(rtmp1, rtmp2);
6393 shlq(rtmp1, 4);
6394 notq(rtmp2);
6395 andq(dst, rtmp2);
6396 shrq(dst, 4);
6397 orq(dst, rtmp1);
6398 }
6399 bswapq(dst);
6400 }
6401
6402 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6403 Label done;
6404 Label neg_divisor_fastpath;
6405 cmpq(divisor, 0);
6406 jccb(Assembler::less, neg_divisor_fastpath);
6407 xorl(rdx, rdx);
6408 divq(divisor);
6409 jmpb(done);
6410 bind(neg_divisor_fastpath);
6411 // Fastpath for divisor < 0:
6412 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6413 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6414 movq(rdx, rax);
6415 subq(rdx, divisor);
6416 if (VM_Version::supports_bmi1()) {
6417 andnq(rax, rdx, rax);
6418 } else {
6419 notq(rdx);
6420 andq(rax, rdx);
6421 }
6422 shrq(rax, 63);
6423 bind(done);
6424 }
6425
6426 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6427 Label done;
6428 Label neg_divisor_fastpath;
6429 cmpq(divisor, 0);
6430 jccb(Assembler::less, neg_divisor_fastpath);
6431 xorq(rdx, rdx);
6432 divq(divisor);
6433 jmp(done);
6434 bind(neg_divisor_fastpath);
6435 // Fastpath when divisor < 0:
6436 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6437 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6438 movq(rdx, rax);
6439 subq(rax, divisor);
6440 if (VM_Version::supports_bmi1()) {
6441 andnq(rax, rax, rdx);
6442 } else {
6443 notq(rax);
6444 andq(rax, rdx);
6445 }
6446 sarq(rax, 63);
6447 andq(rax, divisor);
6448 subq(rdx, rax);
6449 bind(done);
6450 }
6451
6452 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6453 Label done;
6454 Label neg_divisor_fastpath;
6455 cmpq(divisor, 0);
6456 jccb(Assembler::less, neg_divisor_fastpath);
6457 xorq(rdx, rdx);
6458 divq(divisor);
6459 jmp(done);
6460 bind(neg_divisor_fastpath);
6461 // Fastpath for divisor < 0:
6462 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6463 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6464 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6465 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6466 movq(rdx, rax);
6467 subq(rax, divisor);
6468 if (VM_Version::supports_bmi1()) {
6469 andnq(rax, rax, rdx);
6470 } else {
6471 notq(rax);
6472 andq(rax, rdx);
6473 }
6474 movq(tmp, rax);
6475 shrq(rax, 63); // quotient
6476 sarq(tmp, 63);
6477 andq(tmp, divisor);
6478 subq(rdx, tmp); // remainder
6479 bind(done);
6480 }
6481 #endif
6482
6483 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6484 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6485 int vlen_enc) {
6486 assert(VM_Version::supports_avx512bw(), "");
6487 // Byte shuffles are in-lane operations: the shuffle index for each destination byte
6488 // is taken from the lower 4 bits of the corresponding shuffle lane, so all shuffle
6489 // indices are effectively normalized to the range 0-15. Indices that differ only by a
6490 // multiple of 16 therefore select the same relative byte position within a 128 bit
6491 // lane; which 128 bit source lane that byte comes from is determined by the range
6492 // checks and lane broadcasts below.
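// For example, a (hypothetical) shuffle index of 37 normalizes to 5, and the range
// checks below place it in [32, 48), so the third 128 bit source lane is broadcast
// and byte 5 of that lane (source byte 37 overall) is written to the destination.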
6493 movl(rtmp, 16);
6494 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6495
6496 // Compute a mask for the shuffle vector by comparing indices against the expression INDEX < 16,
6497 // broadcast the first 128 bit lane of src across the entire vector, shuffle it using the
6498 // original shuffle indices, and move the shuffled bytes corresponding to a true
6499 // mask into the destination vector.
6500 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6501 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6502 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6503
6504 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6505 // and broadcasting second 128 bit lane.
6506 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6507 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6508 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6509 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6510 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6511
6512 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6513 // and broadcasting third 128 bit lane.
6514 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6515 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6516 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6517 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6518 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6519
6520 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6521 // and broadcasting fourth 128 bit lane.
6522 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6523 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6524 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6525 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6526 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6527 }
6528
6529 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6530 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6531 if (vlen_enc == AVX_128bit) {
6532 vpermilps(dst, src, shuffle, vlen_enc);
6533 } else if (bt == T_INT) {
6534 vpermd(dst, shuffle, src, vlen_enc);
6535 } else {
6536 assert(bt == T_FLOAT, "");
6537 vpermps(dst, shuffle, src, vlen_enc);
6538 }
6539 }
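// Note: for a 128 bit vector a single in-lane vpermilps permute is sufficient, which is
// why vector_rearrange_int_float above uses it for AVX_128bit; wider vectors need the
// cross-lane vpermd/vpermps forms to move elements between 128 bit lanes.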