/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame.
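  //
  // Illustrative sketch (not literal emitted code) of the two frame-setup
  // shapes produced below:
  //   with a stack bang:                 without a stack bang:
  //     <bang stack pages>                 sub  rsp, imm32      ; forced 4-byte immediate
  //     push rbp                           mov  [rsp + off], rbp
  //     sub  rsp, framesize                ...
  // The no-bang form keeps the first instruction long enough for
  // NativeJump::patch_verified_entry to patch over it.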
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
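    // Rough shape of what this emits (a sketch only; the details live in the
    // BarrierSetAssembler): the inline fast path compares the nmethod's guard
    // value against the currently expected value and, when armed, jumps to the
    // stub's entry(), which calls into the runtime and resumes at continuation().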
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
#else
  // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
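    // Note: testptr(reg, reg) sets ZF only when the register is zero; objReg is
    // known to be non-null here, so ZF ends up 0 and the check at DONE_LABEL
    // sends LM_MONITOR locking down the slow path.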
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
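  //
  // Roughly, as hedged C-like pseudo code (a sketch of what the instructions
  // below do, not emitted code):
  //   if (CAS(&m->_owner, nullptr, thread) succeeds)   ZF = 1;                     // locked
  //   else if (m->_owner == thread)                  { m->_recursions++; ZF = 1; } // recursive
  //   else                                             ZF = 0;                     // slow path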
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
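//
// Sketch of the inflated fast-unlock protocol emitted below, as hedged
// pseudo code (illustrative only, not the exact instruction sequence):
//   if (m->_recursions != 0) { m->_recursions--; ZF = 1; }      // recursive exit
//   else {
//     m->_owner = nullptr;                                      // release the lock
//     StoreLoad_fence();                                        // avoid stranding
//     if ((EntryList|cxq) == 0 || m->_succ != nullptr) ZF = 1;  // nobody to wake, or a
//                                                               // successor already exists
//     else { thread->_unlocked_inflated_monitor = m; ZF = 0; }  // let the slow path
//                                                               // try to reacquire
//   }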

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
520 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 521 #ifndef _LP64 522 get_thread(boxReg); 523 movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 524 #else // _LP64 525 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 526 #endif 527 528 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 529 jmpb (DONE_LABEL); 530 531 bind (LSuccess); 532 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 533 jmpb (DONE_LABEL); 534 535 if (LockingMode == LM_LEGACY) { 536 bind (Stacked); 537 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 538 lock(); 539 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 540 // Intentional fall-thru into DONE_LABEL 541 } 542 543 bind(DONE_LABEL); 544 545 // ZFlag == 1 count in fast path 546 // ZFlag == 0 count in slow path 547 jccb(Assembler::notZero, NO_COUNT); 548 549 bind(COUNT); 550 // Count monitors in fast path 551 #ifndef _LP64 552 get_thread(tmpReg); 553 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 554 #else // _LP64 555 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 556 #endif 557 558 xorl(tmpReg, tmpReg); // Set ZF == 1 559 560 bind(NO_COUNT); 561 } 562 563 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 564 Register t, Register thread) { 565 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 566 assert(rax_reg == rax, "Used for CAS"); 567 assert_different_registers(obj, box, rax_reg, t, thread); 568 569 // Handle inflated monitor. 570 Label inflated; 571 // Finish fast lock successfully. ZF value is irrelevant. 572 Label locked; 573 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 574 Label slow_path; 575 576 if (UseObjectMonitorTable) { 577 // Clear cache in case fast locking succeeds. 578 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 579 } 580 581 if (DiagnoseSyncOnValueBasedClasses != 0) { 582 load_klass(rax_reg, obj, t); 583 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 584 jcc(Assembler::notZero, slow_path); 585 } 586 587 const Register mark = t; 588 589 { // Lightweight Lock 590 591 Label push; 592 593 const Register top = UseObjectMonitorTable ? rax_reg : box; 594 595 // Load the mark. 596 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 597 598 // Prefetch top. 599 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 600 601 // Check for monitor (0b10). 602 testptr(mark, markWord::monitor_value); 603 jcc(Assembler::notZero, inflated); 604 605 // Check if lock-stack is full. 606 cmpl(top, LockStack::end_offset() - 1); 607 jcc(Assembler::greater, slow_path); 608 609 // Check if recursive. 610 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 611 jccb(Assembler::equal, push); 612 613 // Try to lock. Transition lock bits 0b01 => 0b00 614 movptr(rax_reg, mark); 615 orptr(rax_reg, markWord::unlocked_value); 616 andptr(mark, ~(int32_t)markWord::unlocked_value); 617 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 618 jcc(Assembler::notEqual, slow_path); 619 620 if (UseObjectMonitorTable) { 621 // Need to reload top, clobbered by CAS. 622 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 623 } 624 bind(push); 625 // After successful lock, push object on lock-stack. 
626 movptr(Address(thread, top), obj); 627 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 628 jmpb(locked); 629 } 630 631 { // Handle inflated monitor. 632 bind(inflated); 633 634 const Register monitor = t; 635 636 if (!UseObjectMonitorTable) { 637 assert(mark == monitor, "should be the same here"); 638 } else { 639 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 640 // Fetch ObjectMonitor* from the cache or take the slow-path. 641 Label monitor_found; 642 643 // Load cache address 644 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 645 646 const int num_unrolled = 2; 647 for (int i = 0; i < num_unrolled; i++) { 648 cmpptr(obj, Address(t)); 649 jccb(Assembler::equal, monitor_found); 650 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 651 } 652 653 Label loop; 654 655 // Search for obj in cache. 656 bind(loop); 657 658 // Check for match. 659 cmpptr(obj, Address(t)); 660 jccb(Assembler::equal, monitor_found); 661 662 // Search until null encountered, guaranteed _null_sentinel at end. 663 cmpptr(Address(t), 1); 664 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 665 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 666 jmpb(loop); 667 668 // Cache hit. 669 bind(monitor_found); 670 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 671 } 672 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 673 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 674 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 675 676 Label monitor_locked; 677 // Lock the monitor. 678 679 // CAS owner (null => current thread). 680 xorptr(rax_reg, rax_reg); 681 lock(); cmpxchgptr(thread, owner_address); 682 jccb(Assembler::equal, monitor_locked); 683 684 // Check if recursive. 685 cmpptr(thread, rax_reg); 686 jccb(Assembler::notEqual, slow_path); 687 688 // Recursive. 689 increment(recursions_address); 690 691 bind(monitor_locked); 692 if (UseObjectMonitorTable) { 693 // Cache the monitor for unlock 694 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 695 } 696 } 697 698 bind(locked); 699 increment(Address(thread, JavaThread::held_monitor_count_offset())); 700 // Set ZF = 1 701 xorl(rax_reg, rax_reg); 702 703 #ifdef ASSERT 704 // Check that locked label is reached with ZF set. 705 Label zf_correct; 706 Label zf_bad_zero; 707 jcc(Assembler::zero, zf_correct); 708 jmp(zf_bad_zero); 709 #endif 710 711 bind(slow_path); 712 #ifdef ASSERT 713 // Check that slow_path label is reached with ZF not set. 714 jcc(Assembler::notZero, zf_correct); 715 stop("Fast Lock ZF != 0"); 716 bind(zf_bad_zero); 717 stop("Fast Lock ZF != 1"); 718 bind(zf_correct); 719 #endif 720 // C2 uses the value of ZF to determine the continuation. 721 } 722 723 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 724 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 725 assert(reg_rax == rax, "Used for CAS"); 726 assert_different_registers(obj, reg_rax, t); 727 728 // Handle inflated monitor. 729 Label inflated, inflated_check_lock_stack; 730 // Finish fast unlock successfully. MUST jump with ZF == 1 731 Label unlocked, slow_path; 732 733 const Register mark = t; 734 const Register monitor = t; 735 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 736 const Register box = reg_rax; 737 738 Label dummy; 739 C2FastUnlockLightweightStub* stub = nullptr; 740 741 if (!Compile::current()->output()->in_scratch_emit_size()) { 742 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 743 Compile::current()->output()->add_stub(stub); 744 } 745 746 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 747 748 { // Lightweight Unlock 749 750 // Load top. 751 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 752 753 if (!UseObjectMonitorTable) { 754 // Prefetch mark. 755 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 756 } 757 758 // Check if obj is top of lock-stack. 759 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 760 // Top of lock stack was not obj. Must be monitor. 761 jcc(Assembler::notEqual, inflated_check_lock_stack); 762 763 // Pop lock-stack. 764 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 765 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 766 767 // Check if recursive. 768 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 769 jcc(Assembler::equal, unlocked); 770 771 // We elide the monitor check, let the CAS fail instead. 772 773 if (UseObjectMonitorTable) { 774 // Load mark. 775 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 776 } 777 778 // Try to unlock. Transition lock bits 0b00 => 0b01 779 movptr(reg_rax, mark); 780 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 781 orptr(mark, markWord::unlocked_value); 782 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 783 jcc(Assembler::notEqual, push_and_slow_path); 784 jmp(unlocked); 785 } 786 787 788 { // Handle inflated monitor. 789 bind(inflated_check_lock_stack); 790 #ifdef ASSERT 791 Label check_done; 792 subl(top, oopSize); 793 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 794 jcc(Assembler::below, check_done); 795 cmpptr(obj, Address(thread, top)); 796 jccb(Assembler::notEqual, inflated_check_lock_stack); 797 stop("Fast Unlock lock on stack"); 798 bind(check_done); 799 if (UseObjectMonitorTable) { 800 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 801 } 802 testptr(mark, markWord::monitor_value); 803 jccb(Assembler::notZero, inflated); 804 stop("Fast Unlock not monitor"); 805 #endif 806 807 bind(inflated); 808 809 if (!UseObjectMonitorTable) { 810 assert(mark == monitor, "should be the same here"); 811 } else { 812 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 813 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 814 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 815 cmpptr(monitor, alignof(ObjectMonitor*)); 816 jcc(Assembler::below, slow_path); 817 } 818 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 819 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 820 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 821 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 822 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 823 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 824 825 Label recursive; 826 827 // Check if recursive. 
828 cmpptr(recursions_address, 0); 829 jccb(Assembler::notZero, recursive); 830 831 // Set owner to null. 832 // Release to satisfy the JMM 833 movptr(owner_address, NULL_WORD); 834 // We need a full fence after clearing owner to avoid stranding. 835 // StoreLoad achieves this. 836 membar(StoreLoad); 837 838 // Check if the entry lists are empty (EntryList first - by convention). 839 movptr(reg_rax, EntryList_address); 840 orptr(reg_rax, cxq_address); 841 jccb(Assembler::zero, unlocked); // If so we are done. 842 843 // Check if there is a successor. 844 cmpptr(succ_address, NULL_WORD); 845 jccb(Assembler::notZero, unlocked); // If so we are done. 846 847 // Save the monitor pointer in the current thread, so we can try to 848 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 849 if (!UseObjectMonitorTable) { 850 andptr(monitor, ~(int32_t)markWord::monitor_value); 851 } 852 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 853 854 orl(t, 1); // Fast Unlock ZF = 0 855 jmpb(slow_path); 856 857 // Recursive unlock. 858 bind(recursive); 859 decrement(recursions_address); 860 } 861 862 bind(unlocked); 863 decrement(Address(thread, JavaThread::held_monitor_count_offset())); 864 xorl(t, t); // Fast Unlock ZF = 1 865 866 #ifdef ASSERT 867 // Check that unlocked label is reached with ZF set. 868 Label zf_correct; 869 Label zf_bad_zero; 870 jcc(Assembler::zero, zf_correct); 871 jmp(zf_bad_zero); 872 #endif 873 874 bind(slow_path); 875 if (stub != nullptr) { 876 bind(stub->slow_path_continuation()); 877 } 878 #ifdef ASSERT 879 // Check that stub->continuation() label is reached with ZF not set. 880 jcc(Assembler::notZero, zf_correct); 881 stop("Fast Unlock ZF != 0"); 882 bind(zf_bad_zero); 883 stop("Fast Unlock ZF != 1"); 884 bind(zf_correct); 885 #endif 886 // C2 uses the value of ZF to determine the continuation. 
887 } 888 889 //------------------------------------------------------------------------------------------- 890 // Generic instructions support for use in .ad files C2 code generation 891 892 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 893 if (dst != src) { 894 movdqu(dst, src); 895 } 896 if (opcode == Op_AbsVD) { 897 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 898 } else { 899 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 900 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 901 } 902 } 903 904 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 905 if (opcode == Op_AbsVD) { 906 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 907 } else { 908 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 909 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 910 } 911 } 912 913 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 914 if (dst != src) { 915 movdqu(dst, src); 916 } 917 if (opcode == Op_AbsVF) { 918 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 919 } else { 920 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 921 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 922 } 923 } 924 925 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 926 if (opcode == Op_AbsVF) { 927 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 928 } else { 929 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 930 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 931 } 932 } 933 934 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 935 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 936 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 937 938 if (opcode == Op_MinV) { 939 if (elem_bt == T_BYTE) { 940 pminsb(dst, src); 941 } else if (elem_bt == T_SHORT) { 942 pminsw(dst, src); 943 } else if (elem_bt == T_INT) { 944 pminsd(dst, src); 945 } else { 946 assert(elem_bt == T_LONG, "required"); 947 assert(tmp == xmm0, "required"); 948 assert_different_registers(dst, src, tmp); 949 movdqu(xmm0, dst); 950 pcmpgtq(xmm0, src); 951 blendvpd(dst, src); // xmm0 as mask 952 } 953 } else { // opcode == Op_MaxV 954 if (elem_bt == T_BYTE) { 955 pmaxsb(dst, src); 956 } else if (elem_bt == T_SHORT) { 957 pmaxsw(dst, src); 958 } else if (elem_bt == T_INT) { 959 pmaxsd(dst, src); 960 } else { 961 assert(elem_bt == T_LONG, "required"); 962 assert(tmp == xmm0, "required"); 963 assert_different_registers(dst, src, tmp); 964 movdqu(xmm0, src); 965 pcmpgtq(xmm0, dst); 966 blendvpd(dst, src); // xmm0 as mask 967 } 968 } 969 } 970 971 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 972 XMMRegister src1, Address src2, int vlen_enc) { 973 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 974 if (opcode == Op_UMinV) { 975 switch(elem_bt) { 976 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 977 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 978 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 979 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 980 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 981 } 982 } else { 983 assert(opcode == Op_UMaxV, "required"); 984 switch(elem_bt) { 985 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 986 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 987 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 988 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 989 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 990 } 991 } 992 } 993 994 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 995 // For optimality, leverage a full vector width of 512 bits 996 // for operations over smaller vector sizes on AVX512 targets. 997 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 998 if (opcode == Op_UMaxV) { 999 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 1000 } else { 1001 assert(opcode == Op_UMinV, "required"); 1002 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 1003 } 1004 } else { 1005 // T1 = -1 1006 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 1007 // T1 = -1 << 63 1008 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 1009 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 1010 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 1011 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 1012 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 1013 // Mask = T2 > T1 1014 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 1015 if (opcode == Op_UMaxV) { 1016 // Res = Mask ? Src2 : Src1 1017 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 1018 } else { 1019 // Res = Mask ? Src1 : Src2 1020 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 1021 } 1022 } 1023 } 1024 1025 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 1026 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1027 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 1028 if (opcode == Op_UMinV) { 1029 switch(elem_bt) { 1030 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 1031 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 1032 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 1033 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 1034 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1035 } 1036 } else { 1037 assert(opcode == Op_UMaxV, "required"); 1038 switch(elem_bt) { 1039 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 1040 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 1041 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 1042 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 1043 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1044 } 1045 } 1046 } 1047 1048 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1049 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1050 int vlen_enc) { 1051 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1052 1053 if (opcode == Op_MinV) { 1054 if (elem_bt == T_BYTE) { 1055 vpminsb(dst, src1, src2, vlen_enc); 1056 } else if (elem_bt == T_SHORT) { 1057 vpminsw(dst, src1, src2, vlen_enc); 1058 } else if (elem_bt == T_INT) { 1059 vpminsd(dst, src1, src2, vlen_enc); 1060 } else { 1061 assert(elem_bt == T_LONG, "required"); 1062 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1063 vpminsq(dst, src1, src2, vlen_enc); 1064 } else { 1065 assert_different_registers(dst, src1, src2); 1066 vpcmpgtq(dst, src1, src2, vlen_enc); 1067 vblendvpd(dst, src1, 
src2, dst, vlen_enc); 1068 } 1069 } 1070 } else { // opcode == Op_MaxV 1071 if (elem_bt == T_BYTE) { 1072 vpmaxsb(dst, src1, src2, vlen_enc); 1073 } else if (elem_bt == T_SHORT) { 1074 vpmaxsw(dst, src1, src2, vlen_enc); 1075 } else if (elem_bt == T_INT) { 1076 vpmaxsd(dst, src1, src2, vlen_enc); 1077 } else { 1078 assert(elem_bt == T_LONG, "required"); 1079 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1080 vpmaxsq(dst, src1, src2, vlen_enc); 1081 } else { 1082 assert_different_registers(dst, src1, src2); 1083 vpcmpgtq(dst, src1, src2, vlen_enc); 1084 vblendvpd(dst, src2, src1, dst, vlen_enc); 1085 } 1086 } 1087 } 1088 } 1089 1090 // Float/Double min max 1091 1092 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1093 XMMRegister dst, XMMRegister a, XMMRegister b, 1094 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1095 int vlen_enc) { 1096 assert(UseAVX > 0, "required"); 1097 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1098 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1099 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1100 assert_different_registers(a, tmp, atmp, btmp); 1101 assert_different_registers(b, tmp, atmp, btmp); 1102 1103 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1104 bool is_double_word = is_double_word_type(elem_bt); 1105 1106 /* Note on 'non-obvious' assembly sequence: 1107 * 1108 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1109 * and Java on how they handle floats: 1110 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1111 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1112 * 1113 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1114 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1115 * (only useful when signs differ, noop otherwise) 1116 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1117 1118 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1119 * btmp = (b < +0.0) ? a : b 1120 * atmp = (b < +0.0) ? b : a 1121 * Tmp = Max_Float(atmp , btmp) 1122 * Res = (atmp == NaN) ? 
atmp : Tmp 1123 */ 1124 1125 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1126 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1127 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1128 XMMRegister mask; 1129 1130 if (!is_double_word && is_min) { 1131 mask = a; 1132 vblend = &MacroAssembler::vblendvps; 1133 vmaxmin = &MacroAssembler::vminps; 1134 vcmp = &MacroAssembler::vcmpps; 1135 } else if (!is_double_word && !is_min) { 1136 mask = b; 1137 vblend = &MacroAssembler::vblendvps; 1138 vmaxmin = &MacroAssembler::vmaxps; 1139 vcmp = &MacroAssembler::vcmpps; 1140 } else if (is_double_word && is_min) { 1141 mask = a; 1142 vblend = &MacroAssembler::vblendvpd; 1143 vmaxmin = &MacroAssembler::vminpd; 1144 vcmp = &MacroAssembler::vcmppd; 1145 } else { 1146 assert(is_double_word && !is_min, "sanity"); 1147 mask = b; 1148 vblend = &MacroAssembler::vblendvpd; 1149 vmaxmin = &MacroAssembler::vmaxpd; 1150 vcmp = &MacroAssembler::vcmppd; 1151 } 1152 1153 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1154 XMMRegister maxmin, scratch; 1155 if (dst == btmp) { 1156 maxmin = btmp; 1157 scratch = tmp; 1158 } else { 1159 maxmin = tmp; 1160 scratch = btmp; 1161 } 1162 1163 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1164 if (precompute_mask && !is_double_word) { 1165 vpsrad(tmp, mask, 32, vlen_enc); 1166 mask = tmp; 1167 } else if (precompute_mask && is_double_word) { 1168 vpxor(tmp, tmp, tmp, vlen_enc); 1169 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1170 mask = tmp; 1171 } 1172 1173 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1174 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1175 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1176 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1177 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1178 } 1179 1180 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1181 XMMRegister dst, XMMRegister a, XMMRegister b, 1182 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1183 int vlen_enc) { 1184 assert(UseAVX > 2, "required"); 1185 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1186 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1187 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1188 assert_different_registers(dst, a, atmp, btmp); 1189 assert_different_registers(dst, b, atmp, btmp); 1190 1191 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1192 bool is_double_word = is_double_word_type(elem_bt); 1193 bool merge = true; 1194 1195 if (!is_double_word && is_min) { 1196 evpmovd2m(ktmp, a, vlen_enc); 1197 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1198 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1199 vminps(dst, atmp, btmp, vlen_enc); 1200 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1201 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1202 } else if (!is_double_word && !is_min) { 1203 evpmovd2m(ktmp, b, vlen_enc); 1204 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1205 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1206 vmaxps(dst, atmp, btmp, vlen_enc); 1207 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1208 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1209 } else if (is_double_word && is_min) { 1210 evpmovq2m(ktmp, a, vlen_enc); 1211 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1212 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1213 
vminpd(dst, atmp, btmp, vlen_enc); 1214 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1215 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1216 } else { 1217 assert(is_double_word && !is_min, "sanity"); 1218 evpmovq2m(ktmp, b, vlen_enc); 1219 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1220 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1221 vmaxpd(dst, atmp, btmp, vlen_enc); 1222 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1223 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1224 } 1225 } 1226 1227 // Float/Double signum 1228 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1229 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1230 1231 Label DONE_LABEL; 1232 1233 if (opcode == Op_SignumF) { 1234 assert(UseSSE > 0, "required"); 1235 ucomiss(dst, zero); 1236 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1237 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1238 movflt(dst, one); 1239 jcc(Assembler::above, DONE_LABEL); 1240 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1241 } else if (opcode == Op_SignumD) { 1242 assert(UseSSE > 1, "required"); 1243 ucomisd(dst, zero); 1244 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1245 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1246 movdbl(dst, one); 1247 jcc(Assembler::above, DONE_LABEL); 1248 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1249 } 1250 1251 bind(DONE_LABEL); 1252 } 1253 1254 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1255 if (sign) { 1256 pmovsxbw(dst, src); 1257 } else { 1258 pmovzxbw(dst, src); 1259 } 1260 } 1261 1262 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1263 if (sign) { 1264 vpmovsxbw(dst, src, vector_len); 1265 } else { 1266 vpmovzxbw(dst, src, vector_len); 1267 } 1268 } 1269 1270 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1271 if (sign) { 1272 vpmovsxbd(dst, src, vector_len); 1273 } else { 1274 vpmovzxbd(dst, src, vector_len); 1275 } 1276 } 1277 1278 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1279 if (sign) { 1280 vpmovsxwd(dst, src, vector_len); 1281 } else { 1282 vpmovzxwd(dst, src, vector_len); 1283 } 1284 } 1285 1286 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1287 int shift, int vector_len) { 1288 if (opcode == Op_RotateLeftV) { 1289 if (etype == T_INT) { 1290 evprold(dst, src, shift, vector_len); 1291 } else { 1292 assert(etype == T_LONG, "expected type T_LONG"); 1293 evprolq(dst, src, shift, vector_len); 1294 } 1295 } else { 1296 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1297 if (etype == T_INT) { 1298 evprord(dst, src, shift, vector_len); 1299 } else { 1300 assert(etype == T_LONG, "expected type T_LONG"); 1301 evprorq(dst, src, shift, vector_len); 1302 } 1303 } 1304 } 1305 1306 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1307 XMMRegister shift, int vector_len) { 1308 if (opcode == Op_RotateLeftV) { 1309 if (etype == T_INT) { 1310 evprolvd(dst, src, shift, vector_len); 1311 } else { 1312 assert(etype == 
T_LONG, "expected type T_LONG"); 1313 evprolvq(dst, src, shift, vector_len); 1314 } 1315 } else { 1316 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1317 if (etype == T_INT) { 1318 evprorvd(dst, src, shift, vector_len); 1319 } else { 1320 assert(etype == T_LONG, "expected type T_LONG"); 1321 evprorvq(dst, src, shift, vector_len); 1322 } 1323 } 1324 } 1325 1326 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1327 if (opcode == Op_RShiftVI) { 1328 psrad(dst, shift); 1329 } else if (opcode == Op_LShiftVI) { 1330 pslld(dst, shift); 1331 } else { 1332 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1333 psrld(dst, shift); 1334 } 1335 } 1336 1337 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1338 switch (opcode) { 1339 case Op_RShiftVI: psrad(dst, shift); break; 1340 case Op_LShiftVI: pslld(dst, shift); break; 1341 case Op_URShiftVI: psrld(dst, shift); break; 1342 1343 default: assert(false, "%s", NodeClassNames[opcode]); 1344 } 1345 } 1346 1347 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1348 if (opcode == Op_RShiftVI) { 1349 vpsrad(dst, nds, shift, vector_len); 1350 } else if (opcode == Op_LShiftVI) { 1351 vpslld(dst, nds, shift, vector_len); 1352 } else { 1353 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1354 vpsrld(dst, nds, shift, vector_len); 1355 } 1356 } 1357 1358 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1359 switch (opcode) { 1360 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1361 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1362 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1363 1364 default: assert(false, "%s", NodeClassNames[opcode]); 1365 } 1366 } 1367 1368 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1369 switch (opcode) { 1370 case Op_RShiftVB: // fall-through 1371 case Op_RShiftVS: psraw(dst, shift); break; 1372 1373 case Op_LShiftVB: // fall-through 1374 case Op_LShiftVS: psllw(dst, shift); break; 1375 1376 case Op_URShiftVS: // fall-through 1377 case Op_URShiftVB: psrlw(dst, shift); break; 1378 1379 default: assert(false, "%s", NodeClassNames[opcode]); 1380 } 1381 } 1382 1383 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1384 switch (opcode) { 1385 case Op_RShiftVB: // fall-through 1386 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1387 1388 case Op_LShiftVB: // fall-through 1389 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1390 1391 case Op_URShiftVS: // fall-through 1392 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1393 1394 default: assert(false, "%s", NodeClassNames[opcode]); 1395 } 1396 } 1397 1398 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1399 switch (opcode) { 1400 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1401 case Op_LShiftVL: psllq(dst, shift); break; 1402 case Op_URShiftVL: psrlq(dst, shift); break; 1403 1404 default: assert(false, "%s", NodeClassNames[opcode]); 1405 } 1406 } 1407 1408 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1409 if (opcode == Op_RShiftVL) { 1410 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1411 } else if (opcode == Op_LShiftVL) { 1412 
psllq(dst, shift); 1413 } else { 1414 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1415 psrlq(dst, shift); 1416 } 1417 } 1418 1419 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1420 switch (opcode) { 1421 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1422 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1423 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1424 1425 default: assert(false, "%s", NodeClassNames[opcode]); 1426 } 1427 } 1428 1429 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1430 if (opcode == Op_RShiftVL) { 1431 evpsraq(dst, nds, shift, vector_len); 1432 } else if (opcode == Op_LShiftVL) { 1433 vpsllq(dst, nds, shift, vector_len); 1434 } else { 1435 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1436 vpsrlq(dst, nds, shift, vector_len); 1437 } 1438 } 1439 1440 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1441 switch (opcode) { 1442 case Op_RShiftVB: // fall-through 1443 case Op_RShiftVS: // fall-through 1444 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1445 1446 case Op_LShiftVB: // fall-through 1447 case Op_LShiftVS: // fall-through 1448 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1449 1450 case Op_URShiftVB: // fall-through 1451 case Op_URShiftVS: // fall-through 1452 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1453 1454 default: assert(false, "%s", NodeClassNames[opcode]); 1455 } 1456 } 1457 1458 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1459 switch (opcode) { 1460 case Op_RShiftVB: // fall-through 1461 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1462 1463 case Op_LShiftVB: // fall-through 1464 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1465 1466 case Op_URShiftVB: // fall-through 1467 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1468 1469 default: assert(false, "%s", NodeClassNames[opcode]); 1470 } 1471 } 1472 1473 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1474 assert(UseAVX >= 2, "required"); 1475 switch (opcode) { 1476 case Op_RShiftVL: { 1477 if (UseAVX > 2) { 1478 assert(tmp == xnoreg, "not used"); 1479 if (!VM_Version::supports_avx512vl()) { 1480 vlen_enc = Assembler::AVX_512bit; 1481 } 1482 evpsravq(dst, src, shift, vlen_enc); 1483 } else { 1484 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1485 vpsrlvq(dst, src, shift, vlen_enc); 1486 vpsrlvq(tmp, tmp, shift, vlen_enc); 1487 vpxor(dst, dst, tmp, vlen_enc); 1488 vpsubq(dst, dst, tmp, vlen_enc); 1489 } 1490 break; 1491 } 1492 case Op_LShiftVL: { 1493 assert(tmp == xnoreg, "not used"); 1494 vpsllvq(dst, src, shift, vlen_enc); 1495 break; 1496 } 1497 case Op_URShiftVL: { 1498 assert(tmp == xnoreg, "not used"); 1499 vpsrlvq(dst, src, shift, vlen_enc); 1500 break; 1501 } 1502 default: assert(false, "%s", NodeClassNames[opcode]); 1503 } 1504 } 1505 1506 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1507 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1508 assert(opcode == Op_LShiftVB || 1509 opcode == Op_RShiftVB || 1510 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1511 bool sign = (opcode != Op_URShiftVB); 1512 assert(vector_len == 0, "required"); 1513 vextendbd(sign, dst, src, 1); 1514 vpmovzxbd(vtmp, shift, 1); 1515 varshiftd(opcode, dst, dst, vtmp, 1); 1516 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1517 vextracti128_high(vtmp, dst); 1518 vpackusdw(dst, dst, vtmp, 0); 1519 } 1520 1521 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1522 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1523 assert(opcode == Op_LShiftVB || 1524 opcode == Op_RShiftVB || 1525 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1526 bool sign = (opcode != Op_URShiftVB); 1527 int ext_vector_len = vector_len + 1; 1528 vextendbw(sign, dst, src, ext_vector_len); 1529 vpmovzxbw(vtmp, shift, ext_vector_len); 1530 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1531 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1532 if (vector_len == 0) { 1533 vextracti128_high(vtmp, dst); 1534 vpackuswb(dst, dst, vtmp, vector_len); 1535 } else { 1536 vextracti64x4_high(vtmp, dst); 1537 vpackuswb(dst, dst, vtmp, vector_len); 1538 vpermq(dst, dst, 0xD8, vector_len); 1539 } 1540 } 1541 1542 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1543 switch(typ) { 1544 case T_BYTE: 1545 pinsrb(dst, val, idx); 1546 break; 1547 case T_SHORT: 1548 pinsrw(dst, val, idx); 1549 break; 1550 case T_INT: 1551 pinsrd(dst, val, idx); 1552 break; 1553 case T_LONG: 1554 pinsrq(dst, val, idx); 1555 break; 1556 default: 1557 assert(false,"Should not reach here."); 1558 break; 1559 } 1560 } 1561 1562 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1563 switch(typ) { 1564 case T_BYTE: 1565 vpinsrb(dst, src, val, idx); 1566 break; 1567 case T_SHORT: 1568 vpinsrw(dst, src, val, idx); 1569 break; 1570 case T_INT: 1571 vpinsrd(dst, src, val, idx); 1572 break; 1573 case T_LONG: 1574 vpinsrq(dst, src, val, idx); 1575 break; 1576 default: 1577 assert(false,"Should not reach here."); 1578 break; 1579 } 1580 } 1581 1582 #ifdef _LP64 1583 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1584 XMMRegister dst, Register base, 1585 Register idx_base, 1586 Register offset, Register mask, 1587 Register mask_idx, Register rtmp, 1588 int vlen_enc) { 1589 vpxor(dst, dst, dst, vlen_enc); 1590 if (elem_bt == T_SHORT) { 1591 for (int i = 0; i < 4; i++) { 1592 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1593 Label skip_load; 1594 btq(mask, mask_idx); 1595 jccb(Assembler::carryClear, skip_load); 1596 movl(rtmp, Address(idx_base, i * 4)); 1597 if (offset != noreg) { 1598 addl(rtmp, offset); 1599 } 1600 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1601 bind(skip_load); 1602 incq(mask_idx); 1603 } 1604 } else { 1605 assert(elem_bt == T_BYTE, ""); 1606 for (int i = 0; i < 8; i++) { 1607 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1608 Label skip_load; 1609 btq(mask, mask_idx); 1610 jccb(Assembler::carryClear, skip_load); 1611 movl(rtmp, Address(idx_base, i * 4)); 1612 if (offset != noreg) { 1613 addl(rtmp, offset); 1614 } 1615 pinsrb(dst, Address(base, rtmp), i); 1616 bind(skip_load); 1617 incq(mask_idx); 1618 } 1619 } 1620 } 1621 #endif // _LP64 1622 1623 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1624 Register base, Register idx_base, 1625 Register offset, Register rtmp, 1626 int vlen_enc) { 1627 vpxor(dst, dst, dst, vlen_enc); 1628 if (elem_bt == T_SHORT) { 1629 for (int i = 0; i < 4; i++) { 1630 // dst[i] = src[offset + idx_base[i]] 1631 movl(rtmp, Address(idx_base, i * 4)); 1632 if (offset != noreg) { 1633 addl(rtmp, offset); 1634 } 1635 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1636 } 1637 } else { 1638 assert(elem_bt == T_BYTE, ""); 1639 for (int i = 0; i < 8; i++) { 1640 // dst[i] = src[offset + idx_base[i]] 1641 movl(rtmp, Address(idx_base, i * 4)); 1642 if (offset != noreg) { 1643 addl(rtmp, offset); 1644 } 1645 pinsrb(dst, Address(base, rtmp), i); 1646 } 1647 } 1648 } 1649 1650 /* 1651 * Gather using hybrid algorithm, first partially unroll scalar loop 1652 * to accumulate values from gather indices into a quad-word(64bit) slice. 1653 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1654 * permutation to place the slice into appropriate vector lane 1655 * locations in destination vector. Following pseudo code describes the 1656 * algorithm in detail: 1657 * 1658 * DST_VEC = ZERO_VEC 1659 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1660 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1661 * FOREACH_ITER: 1662 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1663 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1664 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1665 * PERM_INDEX = PERM_INDEX - TWO_VEC 1666 * 1667 * With each iteration, doubleword permute indices (0,1) corresponding 1668 * to gathered quadword gets right shifted by two lane positions. 1669 * 1670 */ 1671 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1672 Register base, Register idx_base, 1673 Register offset, Register mask, 1674 XMMRegister xtmp1, XMMRegister xtmp2, 1675 XMMRegister temp_dst, Register rtmp, 1676 Register mask_idx, Register length, 1677 int vector_len, int vlen_enc) { 1678 Label GATHER8_LOOP; 1679 assert(is_subword_type(elem_ty), ""); 1680 movl(length, vector_len); 1681 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1682 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1683 vallones(xtmp2, vlen_enc); 1684 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1685 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1686 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1687 1688 bind(GATHER8_LOOP); 1689 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1690 if (mask == noreg) { 1691 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1692 } else { 1693 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1694 } 1695 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1696 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1697 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1698 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1699 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1700 vpor(dst, dst, temp_dst, vlen_enc); 1701 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1702 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1703 jcc(Assembler::notEqual, GATHER8_LOOP); 1704 } 1705 1706 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1707 switch(typ) { 1708 case T_INT: 1709 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1710 break; 1711 case T_FLOAT: 1712 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1713 break; 1714 case T_LONG: 1715 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1716 break; 1717 case T_DOUBLE: 1718 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1719 break; 1720 default: 1721 assert(false,"Should not reach here."); 1722 break; 1723 } 1724 } 1725 1726 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1727 switch(typ) { 1728 case T_INT: 1729 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1730 break; 1731 case T_FLOAT: 1732 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1733 break; 1734 case T_LONG: 1735 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1736 break; 1737 case T_DOUBLE: 1738 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1739 break; 1740 default: 1741 assert(false,"Should not reach here."); 1742 break; 1743 } 1744 } 1745 1746 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1747 switch(typ) { 1748 case T_INT: 1749 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1750 break; 1751 case T_FLOAT: 1752 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1753 break; 1754 case T_LONG: 1755 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1756 break; 1757 case T_DOUBLE: 1758 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1759 break; 1760 default: 1761 assert(false,"Should not reach here."); 1762 break; 1763 } 1764 } 1765 1766 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1767 if (vlen_in_bytes <= 16) { 1768 pxor (dst, dst); 1769 psubb(dst, src); 1770 switch (elem_bt) { 1771 case T_BYTE: /* nothing to do */ break; 1772 case T_SHORT: pmovsxbw(dst, dst); break; 1773 case T_INT: pmovsxbd(dst, dst); break; 1774 case T_FLOAT: pmovsxbd(dst, dst); break; 1775 case T_LONG: pmovsxbq(dst, dst); break; 1776 case T_DOUBLE: pmovsxbq(dst, dst); break; 1777 1778 default: assert(false, "%s", type2name(elem_bt)); 1779 } 1780 } else { 1781 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1782 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1783 1784 vpxor (dst, dst, dst, vlen_enc); 1785 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1786 1787 switch (elem_bt) { 1788 case T_BYTE: /* nothing to do */ break; 1789 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1790 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1791 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1792 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1793 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1794 1795 default: assert(false, "%s", type2name(elem_bt)); 1796 } 1797 } 1798 } 1799 1800 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1801 if (novlbwdq) { 1802 vpmovsxbd(xtmp, src, vlen_enc); 1803 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1804 Assembler::eq, true, vlen_enc, noreg); 1805 } else { 1806 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1807 vpsubb(xtmp, xtmp, src, vlen_enc); 1808 evpmovb2m(dst, xtmp, vlen_enc); 1809 } 1810 } 1811 1812 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1813 switch (vlen_in_bytes) { 1814 case 4: movdl(dst, src); break; 1815 case 8: movq(dst, src); break; 1816 case 16: movdqu(dst, src); break; 1817 case 32: vmovdqu(dst, src); break; 1818 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1819 default: ShouldNotReachHere(); 1820 } 1821 } 1822 1823 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1824 assert(rscratch != noreg || always_reachable(src), "missing"); 1825 1826 if (reachable(src)) { 1827 load_vector(dst, as_Address(src), vlen_in_bytes); 1828 } else { 1829 lea(rscratch, src); 1830 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1831 } 1832 } 1833 1834 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1835 int vlen_enc = vector_length_encoding(vlen); 1836 if (VM_Version::supports_avx()) { 1837 if (bt == T_LONG) { 1838 if (VM_Version::supports_avx2()) { 1839 vpbroadcastq(dst, src, vlen_enc); 1840 } else { 1841 vmovddup(dst, src, vlen_enc); 1842 } 1843 } else if (bt == T_DOUBLE) { 1844 if (vlen_enc != Assembler::AVX_128bit) { 1845 vbroadcastsd(dst, src, vlen_enc, noreg); 1846 } else { 1847 vmovddup(dst, src, vlen_enc); 1848 } 1849 } else { 1850 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1851 vpbroadcastd(dst, src, vlen_enc); 1852 } else { 1853 vbroadcastss(dst, src, vlen_enc); 1854 } 1855 } 1856 } else if (VM_Version::supports_sse3()) { 1857 movddup(dst, src); 1858 } else { 1859 movq(dst, src); 1860 if (vlen == 16) { 1861 punpcklqdq(dst, dst); 1862 } 1863 } 1864 } 1865 1866 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1867 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1868 int offset = exact_log2(type2aelembytes(bt)) << 6; 1869 if (is_floating_point_type(bt)) { 1870 offset += 128; 1871 } 1872 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1873 load_vector(dst, addr, vlen_in_bytes); 1874 } 1875 1876 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
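// The helpers below share one folding scheme: reduce_operation_128/256 select the
// combining instruction for the given opcode and element type, and the
// reduce{8B,16S,8I,4L,...} routines repeatedly fold the upper half of the vector
// into the lower half (vextracti64x4_high / vextracti128_high / pshufd) until a
// single lane remains; the integral variants then combine the incoming scalar
// accumulator (src1) with lane 0.
// Illustrative sketch only, not generated code ('combine' stands for the
// per-opcode instruction picked by reduce_operation_*):
//
//   while (lanes > 1) {
//     upper  = extract_high_half(vec);
//     vec    = combine(vec, upper);
//     lanes /= 2;
//   }
//   dst = combine(src1, lane0(vec));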
1877 1878 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1879 int vector_len = Assembler::AVX_128bit; 1880 1881 switch (opcode) { 1882 case Op_AndReductionV: pand(dst, src); break; 1883 case Op_OrReductionV: por (dst, src); break; 1884 case Op_XorReductionV: pxor(dst, src); break; 1885 case Op_MinReductionV: 1886 switch (typ) { 1887 case T_BYTE: pminsb(dst, src); break; 1888 case T_SHORT: pminsw(dst, src); break; 1889 case T_INT: pminsd(dst, src); break; 1890 case T_LONG: assert(UseAVX > 2, "required"); 1891 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1892 default: assert(false, "wrong type"); 1893 } 1894 break; 1895 case Op_MaxReductionV: 1896 switch (typ) { 1897 case T_BYTE: pmaxsb(dst, src); break; 1898 case T_SHORT: pmaxsw(dst, src); break; 1899 case T_INT: pmaxsd(dst, src); break; 1900 case T_LONG: assert(UseAVX > 2, "required"); 1901 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1902 default: assert(false, "wrong type"); 1903 } 1904 break; 1905 case Op_AddReductionVF: addss(dst, src); break; 1906 case Op_AddReductionVD: addsd(dst, src); break; 1907 case Op_AddReductionVI: 1908 switch (typ) { 1909 case T_BYTE: paddb(dst, src); break; 1910 case T_SHORT: paddw(dst, src); break; 1911 case T_INT: paddd(dst, src); break; 1912 default: assert(false, "wrong type"); 1913 } 1914 break; 1915 case Op_AddReductionVL: paddq(dst, src); break; 1916 case Op_MulReductionVF: mulss(dst, src); break; 1917 case Op_MulReductionVD: mulsd(dst, src); break; 1918 case Op_MulReductionVI: 1919 switch (typ) { 1920 case T_SHORT: pmullw(dst, src); break; 1921 case T_INT: pmulld(dst, src); break; 1922 default: assert(false, "wrong type"); 1923 } 1924 break; 1925 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1926 evpmullq(dst, dst, src, vector_len); break; 1927 default: assert(false, "wrong opcode"); 1928 } 1929 } 1930 1931 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1932 switch (opcode) { 1933 case Op_AddReductionVF: addps(dst, src); break; 1934 case Op_AddReductionVD: addpd(dst, src); break; 1935 case Op_MulReductionVF: mulps(dst, src); break; 1936 case Op_MulReductionVD: mulpd(dst, src); break; 1937 default: assert(false, "%s", NodeClassNames[opcode]); 1938 } 1939 } 1940 1941 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1942 int vector_len = Assembler::AVX_256bit; 1943 1944 switch (opcode) { 1945 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1946 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1947 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1948 case Op_MinReductionV: 1949 switch (typ) { 1950 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1951 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1952 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1953 case T_LONG: assert(UseAVX > 2, "required"); 1954 vpminsq(dst, src1, src2, vector_len); break; 1955 default: assert(false, "wrong type"); 1956 } 1957 break; 1958 case Op_MaxReductionV: 1959 switch (typ) { 1960 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1961 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1962 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1963 case T_LONG: assert(UseAVX > 2, "required"); 1964 vpmaxsq(dst, src1, src2, vector_len); break; 1965 default: assert(false, "wrong type"); 1966 } 
1967 break; 1968 case Op_AddReductionVI: 1969 switch (typ) { 1970 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1971 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1972 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1973 default: assert(false, "wrong type"); 1974 } 1975 break; 1976 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1977 case Op_MulReductionVI: 1978 switch (typ) { 1979 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1980 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1981 default: assert(false, "wrong type"); 1982 } 1983 break; 1984 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1985 default: assert(false, "wrong opcode"); 1986 } 1987 } 1988 1989 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1990 int vector_len = Assembler::AVX_256bit; 1991 1992 switch (opcode) { 1993 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1994 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1995 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1996 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1997 default: assert(false, "%s", NodeClassNames[opcode]); 1998 } 1999 } 2000 2001 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2002 XMMRegister dst, XMMRegister src, 2003 XMMRegister vtmp1, XMMRegister vtmp2) { 2004 switch (opcode) { 2005 case Op_AddReductionVF: 2006 case Op_MulReductionVF: 2007 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2008 break; 2009 2010 case Op_AddReductionVD: 2011 case Op_MulReductionVD: 2012 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2013 break; 2014 2015 default: assert(false, "wrong opcode"); 2016 } 2017 } 2018 2019 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2020 XMMRegister dst, XMMRegister src, 2021 XMMRegister vtmp1, XMMRegister vtmp2) { 2022 switch (opcode) { 2023 case Op_AddReductionVF: 2024 case Op_MulReductionVF: 2025 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2026 break; 2027 2028 case Op_AddReductionVD: 2029 case Op_MulReductionVD: 2030 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2031 break; 2032 2033 default: assert(false, "%s", NodeClassNames[opcode]); 2034 } 2035 } 2036 2037 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2038 Register dst, Register src1, XMMRegister src2, 2039 XMMRegister vtmp1, XMMRegister vtmp2) { 2040 switch (vlen) { 2041 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2042 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2043 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2044 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2045 2046 default: assert(false, "wrong vector length"); 2047 } 2048 } 2049 2050 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2051 Register dst, Register src1, XMMRegister src2, 2052 XMMRegister vtmp1, XMMRegister vtmp2) { 2053 switch (vlen) { 2054 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2055 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2056 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2057 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2058 2059 default: assert(false, "wrong vector length"); 2060 } 2061 } 2062 2063 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2064 Register dst, Register src1, XMMRegister src2, 
2065 XMMRegister vtmp1, XMMRegister vtmp2) { 2066 switch (vlen) { 2067 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2068 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2069 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2070 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2071 2072 default: assert(false, "wrong vector length"); 2073 } 2074 } 2075 2076 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2077 Register dst, Register src1, XMMRegister src2, 2078 XMMRegister vtmp1, XMMRegister vtmp2) { 2079 switch (vlen) { 2080 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2081 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2082 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2083 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2084 2085 default: assert(false, "wrong vector length"); 2086 } 2087 } 2088 2089 #ifdef _LP64 2090 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2091 Register dst, Register src1, XMMRegister src2, 2092 XMMRegister vtmp1, XMMRegister vtmp2) { 2093 switch (vlen) { 2094 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2095 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2096 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2097 2098 default: assert(false, "wrong vector length"); 2099 } 2100 } 2101 #endif // _LP64 2102 2103 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2104 switch (vlen) { 2105 case 2: 2106 assert(vtmp2 == xnoreg, ""); 2107 reduce2F(opcode, dst, src, vtmp1); 2108 break; 2109 case 4: 2110 assert(vtmp2 == xnoreg, ""); 2111 reduce4F(opcode, dst, src, vtmp1); 2112 break; 2113 case 8: 2114 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2115 break; 2116 case 16: 2117 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2118 break; 2119 default: assert(false, "wrong vector length"); 2120 } 2121 } 2122 2123 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2124 switch (vlen) { 2125 case 2: 2126 assert(vtmp2 == xnoreg, ""); 2127 reduce2D(opcode, dst, src, vtmp1); 2128 break; 2129 case 4: 2130 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2131 break; 2132 case 8: 2133 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2134 break; 2135 default: assert(false, "wrong vector length"); 2136 } 2137 } 2138 2139 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 switch (vlen) { 2141 case 2: 2142 assert(vtmp1 == xnoreg, ""); 2143 assert(vtmp2 == xnoreg, ""); 2144 unorderedReduce2F(opcode, dst, src); 2145 break; 2146 case 4: 2147 assert(vtmp2 == xnoreg, ""); 2148 unorderedReduce4F(opcode, dst, src, vtmp1); 2149 break; 2150 case 8: 2151 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2152 break; 2153 case 16: 2154 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2155 break; 2156 default: assert(false, "wrong vector length"); 2157 } 2158 } 2159 2160 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2161 switch (vlen) { 2162 case 2: 2163 assert(vtmp1 == xnoreg, ""); 2164 assert(vtmp2 == xnoreg, ""); 2165 unorderedReduce2D(opcode, dst, src); 2166 break; 2167 case 4: 2168 assert(vtmp2 == xnoreg, ""); 2169 unorderedReduce4D(opcode, dst, src, vtmp1); 2170 break; 2171 case 8: 
2172 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2173 break; 2174 default: assert(false, "wrong vector length"); 2175 } 2176 } 2177 2178 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2179 if (opcode == Op_AddReductionVI) { 2180 if (vtmp1 != src2) { 2181 movdqu(vtmp1, src2); 2182 } 2183 phaddd(vtmp1, vtmp1); 2184 } else { 2185 pshufd(vtmp1, src2, 0x1); 2186 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2187 } 2188 movdl(vtmp2, src1); 2189 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2190 movdl(dst, vtmp1); 2191 } 2192 2193 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2194 if (opcode == Op_AddReductionVI) { 2195 if (vtmp1 != src2) { 2196 movdqu(vtmp1, src2); 2197 } 2198 phaddd(vtmp1, src2); 2199 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2200 } else { 2201 pshufd(vtmp2, src2, 0xE); 2202 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2203 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2204 } 2205 } 2206 2207 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2208 if (opcode == Op_AddReductionVI) { 2209 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2210 vextracti128_high(vtmp2, vtmp1); 2211 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2212 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2213 } else { 2214 vextracti128_high(vtmp1, src2); 2215 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2216 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2217 } 2218 } 2219 2220 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2221 vextracti64x4_high(vtmp2, src2); 2222 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2223 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2224 } 2225 2226 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2227 pshufd(vtmp2, src2, 0x1); 2228 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2229 movdqu(vtmp1, vtmp2); 2230 psrldq(vtmp1, 2); 2231 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2232 movdqu(vtmp2, vtmp1); 2233 psrldq(vtmp2, 1); 2234 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2235 movdl(vtmp2, src1); 2236 pmovsxbd(vtmp1, vtmp1); 2237 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2238 pextrb(dst, vtmp1, 0x0); 2239 movsbl(dst, dst); 2240 } 2241 2242 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2243 pshufd(vtmp1, src2, 0xE); 2244 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2245 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2246 } 2247 2248 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2249 vextracti128_high(vtmp2, src2); 2250 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2251 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2252 } 2253 2254 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2255 vextracti64x4_high(vtmp1, src2); 2256 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2257 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2258 } 2259 2260 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 pmovsxbw(vtmp2, src2); 2262 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2263 } 2264 2265 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2266 if (UseAVX > 1) { 2267 int vector_len = Assembler::AVX_256bit; 2268 vpmovsxbw(vtmp1, src2, vector_len); 2269 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2270 } else { 2271 pmovsxbw(vtmp2, src2); 2272 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2273 pshufd(vtmp2, src2, 0x1); 2274 pmovsxbw(vtmp2, src2); 2275 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2276 } 2277 } 2278 2279 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2280 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2281 int vector_len = Assembler::AVX_512bit; 2282 vpmovsxbw(vtmp1, src2, vector_len); 2283 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2284 } else { 2285 assert(UseAVX >= 2,"Should not reach here."); 2286 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2287 vextracti128_high(vtmp2, src2); 2288 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2289 } 2290 } 2291 2292 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2293 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2294 vextracti64x4_high(vtmp2, src2); 2295 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2296 } 2297 2298 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2299 if (opcode == Op_AddReductionVI) { 2300 if (vtmp1 != src2) { 2301 movdqu(vtmp1, src2); 2302 } 2303 phaddw(vtmp1, vtmp1); 2304 phaddw(vtmp1, vtmp1); 2305 } else { 2306 pshufd(vtmp2, src2, 0x1); 2307 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2308 movdqu(vtmp1, vtmp2); 2309 psrldq(vtmp1, 2); 2310 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2311 } 2312 movdl(vtmp2, src1); 2313 pmovsxwd(vtmp1, vtmp1); 2314 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2315 pextrw(dst, vtmp1, 0x0); 2316 movswl(dst, dst); 2317 } 2318 2319 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 if (opcode == Op_AddReductionVI) { 2321 if (vtmp1 != src2) { 2322 movdqu(vtmp1, src2); 2323 } 2324 phaddw(vtmp1, src2); 2325 } else { 2326 pshufd(vtmp1, src2, 0xE); 2327 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2328 } 2329 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2330 } 2331 2332 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2333 if (opcode == Op_AddReductionVI) { 2334 int vector_len = Assembler::AVX_256bit; 2335 vphaddw(vtmp2, src2, src2, vector_len); 2336 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2337 } else { 2338 vextracti128_high(vtmp2, src2); 2339 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2340 } 2341 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2342 } 2343 2344 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2345 int vector_len = Assembler::AVX_256bit; 2346 vextracti64x4_high(vtmp1, src2); 2347 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2348 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2349 } 2350 2351 #ifdef _LP64 2352 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2353 pshufd(vtmp2, src2, 0xE); 2354 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2355 movdq(vtmp1, src1); 2356 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2357 movdq(dst, vtmp1); 2358 } 2359 2360 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2361 vextracti128_high(vtmp1, src2); 2362 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2363 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2364 } 2365 2366 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2367 vextracti64x4_high(vtmp2, src2); 2368 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2369 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2370 } 2371 2372 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2373 mov64(temp, -1L); 2374 bzhiq(temp, temp, len); 2375 kmovql(dst, temp); 2376 } 2377 #endif // _LP64 2378 2379 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2380 reduce_operation_128(T_FLOAT, opcode, dst, src); 2381 pshufd(vtmp, src, 0x1); 2382 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2383 } 2384 2385 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2386 reduce2F(opcode, dst, src, vtmp); 2387 pshufd(vtmp, src, 0x2); 2388 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2389 pshufd(vtmp, src, 0x3); 2390 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2391 } 2392 2393 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2394 reduce4F(opcode, dst, src, vtmp2); 2395 vextractf128_high(vtmp2, src); 2396 reduce4F(opcode, dst, vtmp2, vtmp1); 2397 } 2398 2399 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2400 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2401 vextracti64x4_high(vtmp1, src); 2402 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2403 } 2404 2405 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2406 pshufd(dst, src, 0x1); 2407 reduce_operation_128(T_FLOAT, opcode, dst, src); 2408 } 2409 2410 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2411 pshufd(vtmp, src, 0xE); 2412 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2413 unorderedReduce2F(opcode, dst, vtmp); 2414 } 2415 2416 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2417 vextractf128_high(vtmp1, src); 2418 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2419 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2420 } 2421 2422 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2423 vextractf64x4_high(vtmp2, src); 2424 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2425 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2426 } 2427 2428 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2429 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2430 pshufd(vtmp, src, 0xE); 2431 
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2432 } 2433 2434 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2435 reduce2D(opcode, dst, src, vtmp2); 2436 vextractf128_high(vtmp2, src); 2437 reduce2D(opcode, dst, vtmp2, vtmp1); 2438 } 2439 2440 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2441 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2442 vextracti64x4_high(vtmp1, src); 2443 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2444 } 2445 2446 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2447 pshufd(dst, src, 0xE); 2448 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2449 } 2450 2451 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2452 vextractf128_high(vtmp, src); 2453 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2454 unorderedReduce2D(opcode, dst, vtmp); 2455 } 2456 2457 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2458 vextractf64x4_high(vtmp2, src); 2459 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2460 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2461 } 2462 2463 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2464 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2465 } 2466 2467 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2468 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2469 } 2470 2471 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2472 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2473 } 2474 2475 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2476 int vec_enc) { 2477 switch(elem_bt) { 2478 case T_INT: 2479 case T_FLOAT: 2480 vmaskmovps(dst, src, mask, vec_enc); 2481 break; 2482 case T_LONG: 2483 case T_DOUBLE: 2484 vmaskmovpd(dst, src, mask, vec_enc); 2485 break; 2486 default: 2487 fatal("Unsupported type %s", type2name(elem_bt)); 2488 break; 2489 } 2490 } 2491 2492 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2493 int vec_enc) { 2494 switch(elem_bt) { 2495 case T_INT: 2496 case T_FLOAT: 2497 vmaskmovps(dst, src, mask, vec_enc); 2498 break; 2499 case T_LONG: 2500 case T_DOUBLE: 2501 vmaskmovpd(dst, src, mask, vec_enc); 2502 break; 2503 default: 2504 fatal("Unsupported type %s", type2name(elem_bt)); 2505 break; 2506 } 2507 } 2508 2509 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2510 XMMRegister dst, XMMRegister src, 2511 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2512 XMMRegister xmm_0, XMMRegister xmm_1) { 2513 const int permconst[] = {1, 14}; 2514 XMMRegister wsrc = src; 2515 XMMRegister wdst = xmm_0; 2516 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2517 2518 int vlen_enc = Assembler::AVX_128bit; 2519 if (vlen == 16) { 2520 vlen_enc = Assembler::AVX_256bit; 2521 } 2522 2523 for (int i = log2(vlen) - 1; i >=0; i--) { 2524 if (i == 0 && !is_dst_valid) { 2525 wdst = dst; 2526 } 2527 if (i == 3) { 2528 vextracti64x4_high(wtmp, wsrc); 2529 } else if (i == 2) { 2530 vextracti128_high(wtmp, wsrc); 2531 } else { // i = [0,1] 2532 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2533 } 2534 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2535 wsrc = wdst; 2536 vlen_enc = Assembler::AVX_128bit; 2537 } 2538 if (is_dst_valid) { 2539 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2540 } 2541 } 2542 2543 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2544 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2545 XMMRegister xmm_0, XMMRegister xmm_1) { 2546 XMMRegister wsrc = src; 2547 XMMRegister wdst = xmm_0; 2548 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2549 int vlen_enc = Assembler::AVX_128bit; 2550 if (vlen == 8) { 2551 vlen_enc = Assembler::AVX_256bit; 2552 } 2553 for (int i = log2(vlen) - 1; i >=0; i--) { 2554 if (i == 0 && !is_dst_valid) { 2555 wdst = dst; 2556 } 2557 if (i == 1) { 2558 vextracti128_high(wtmp, wsrc); 2559 } else if (i == 2) { 2560 vextracti64x4_high(wtmp, wsrc); 2561 } else { 2562 assert(i == 0, "%d", i); 2563 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2564 } 2565 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2566 wsrc = wdst; 2567 vlen_enc = Assembler::AVX_128bit; 2568 } 2569 if (is_dst_valid) { 2570 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2571 } 2572 } 2573 2574 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2575 switch (bt) { 2576 case T_BYTE: pextrb(dst, src, idx); break; 2577 case T_SHORT: pextrw(dst, src, idx); break; 2578 case T_INT: pextrd(dst, src, idx); break; 2579 case T_LONG: pextrq(dst, src, idx); break; 2580 2581 default: 2582 assert(false,"Should not reach here."); 2583 break; 2584 } 2585 } 2586 2587 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2588 int esize = type2aelembytes(typ); 2589 int elem_per_lane = 16/esize; 2590 int lane = elemindex / elem_per_lane; 2591 int eindex = elemindex % elem_per_lane; 2592 2593 if (lane >= 2) { 2594 assert(UseAVX > 2, "required"); 2595 vextractf32x4(dst, src, lane & 3); 2596 return dst; 2597 } else if (lane > 0) { 2598 assert(UseAVX > 0, "required"); 2599 vextractf128(dst, src, lane); 2600 return dst; 2601 } else { 2602 return src; 2603 } 2604 } 2605 2606 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2607 if (typ == T_BYTE) { 2608 movsbl(dst, dst); 2609 } else if (typ == T_SHORT) { 2610 movswl(dst, dst); 2611 } 2612 } 2613 2614 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2615 int esize = type2aelembytes(typ); 2616 int elem_per_lane = 16/esize; 2617 int eindex = elemindex % elem_per_lane; 2618 assert(is_integral_type(typ),"required"); 2619 2620 if (eindex == 0) { 2621 if (typ == T_LONG) { 2622 movq(dst, src); 2623 } else { 2624 movdl(dst, src); 2625 movsxl(typ, dst); 2626 } 2627 } else { 2628 extract(typ, dst, src, eindex); 2629 movsxl(typ, dst); 2630 } 2631 } 2632 2633 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
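// Note: the element index is reduced modulo the per-128-bit-lane element count, so
// the caller is expected to have narrowed src to the relevant 128-bit lane first
// (e.g. via get_lane() above). For T_FLOAT the upper bits of dst are cleared with
// vector_32_bit_mask below so dst ends up holding a clean scalar value.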
2634 int esize = type2aelembytes(typ); 2635 int elem_per_lane = 16/esize; 2636 int eindex = elemindex % elem_per_lane; 2637 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2638 2639 if (eindex == 0) { 2640 movq(dst, src); 2641 } else { 2642 if (typ == T_FLOAT) { 2643 if (UseAVX == 0) { 2644 movdqu(dst, src); 2645 shufps(dst, dst, eindex); 2646 } else { 2647 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2648 } 2649 } else { 2650 if (UseAVX == 0) { 2651 movdqu(dst, src); 2652 psrldq(dst, eindex*esize); 2653 } else { 2654 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2655 } 2656 movq(dst, dst); 2657 } 2658 } 2659 // Zero upper bits 2660 if (typ == T_FLOAT) { 2661 if (UseAVX == 0) { 2662 assert(vtmp != xnoreg, "required."); 2663 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2664 pand(dst, vtmp); 2665 } else { 2666 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2667 } 2668 } 2669 } 2670 2671 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2672 switch(typ) { 2673 case T_BYTE: 2674 case T_BOOLEAN: 2675 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2676 break; 2677 case T_SHORT: 2678 case T_CHAR: 2679 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2680 break; 2681 case T_INT: 2682 case T_FLOAT: 2683 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2684 break; 2685 case T_LONG: 2686 case T_DOUBLE: 2687 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2688 break; 2689 default: 2690 assert(false,"Should not reach here."); 2691 break; 2692 } 2693 } 2694 2695 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2696 assert(rscratch != noreg || always_reachable(src2), "missing"); 2697 2698 switch(typ) { 2699 case T_BOOLEAN: 2700 case T_BYTE: 2701 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2702 break; 2703 case T_CHAR: 2704 case T_SHORT: 2705 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2706 break; 2707 case T_INT: 2708 case T_FLOAT: 2709 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2710 break; 2711 case T_LONG: 2712 case T_DOUBLE: 2713 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2714 break; 2715 default: 2716 assert(false,"Should not reach here."); 2717 break; 2718 } 2719 } 2720 2721 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2722 switch(typ) { 2723 case T_BYTE: 2724 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2725 break; 2726 case T_SHORT: 2727 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2728 break; 2729 case T_INT: 2730 case T_FLOAT: 2731 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2732 break; 2733 case T_LONG: 2734 case T_DOUBLE: 2735 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2736 break; 2737 default: 2738 assert(false,"Should not reach here."); 2739 break; 2740 } 2741 } 2742 2743 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2744 
assert(vlen_in_bytes <= 32, ""); 2745 int esize = type2aelembytes(bt); 2746 if (vlen_in_bytes == 32) { 2747 assert(vtmp == xnoreg, "required."); 2748 if (esize >= 4) { 2749 vtestps(src1, src2, AVX_256bit); 2750 } else { 2751 vptest(src1, src2, AVX_256bit); 2752 } 2753 return; 2754 } 2755 if (vlen_in_bytes < 16) { 2756 // Duplicate the lower part to fill the whole register, 2757 // Don't need to do so for src2 2758 assert(vtmp != xnoreg, "required"); 2759 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2760 pshufd(vtmp, src1, shuffle_imm); 2761 } else { 2762 assert(vtmp == xnoreg, "required"); 2763 vtmp = src1; 2764 } 2765 if (esize >= 4 && VM_Version::supports_avx()) { 2766 vtestps(vtmp, src2, AVX_128bit); 2767 } else { 2768 ptest(vtmp, src2); 2769 } 2770 } 2771 2772 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2773 #ifdef ASSERT 2774 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2775 bool is_bw_supported = VM_Version::supports_avx512bw(); 2776 if (is_bw && !is_bw_supported) { 2777 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2778 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2779 "XMM register should be 0-15"); 2780 } 2781 #endif // ASSERT 2782 switch (elem_bt) { 2783 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2784 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2785 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2786 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2787 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2788 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2789 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2790 } 2791 } 2792 2793 #ifdef _LP64 2794 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2795 assert(UseAVX >= 2, "required"); 2796 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2797 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2798 if ((UseAVX > 2) && 2799 (!is_bw || VM_Version::supports_avx512bw()) && 2800 (!is_vl || VM_Version::supports_avx512vl())) { 2801 switch (elem_bt) { 2802 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2803 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2804 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2805 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2806 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2807 } 2808 } else { 2809 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2810 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2811 switch (elem_bt) { 2812 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2813 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2814 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2815 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2816 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2817 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2818 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2819 } 2820 } 2821 } 2822 #endif 2823 2824 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2825 switch (to_elem_bt) { 2826 case T_SHORT: 2827 vpmovsxbw(dst, src, vlen_enc); 2828 break; 2829 case T_INT: 2830 
vpmovsxbd(dst, src, vlen_enc); 2831 break; 2832 case T_FLOAT: 2833 vpmovsxbd(dst, src, vlen_enc); 2834 vcvtdq2ps(dst, dst, vlen_enc); 2835 break; 2836 case T_LONG: 2837 vpmovsxbq(dst, src, vlen_enc); 2838 break; 2839 case T_DOUBLE: { 2840 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2841 vpmovsxbd(dst, src, mid_vlen_enc); 2842 vcvtdq2pd(dst, dst, vlen_enc); 2843 break; 2844 } 2845 default: 2846 fatal("Unsupported type %s", type2name(to_elem_bt)); 2847 break; 2848 } 2849 } 2850 2851 //------------------------------------------------------------------------------------------- 2852 2853 // IndexOf for constant substrings with size >= 8 chars 2854 // which don't need to be loaded through stack. 2855 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2856 Register cnt1, Register cnt2, 2857 int int_cnt2, Register result, 2858 XMMRegister vec, Register tmp, 2859 int ae) { 2860 ShortBranchVerifier sbv(this); 2861 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2862 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2863 2864 // This method uses the pcmpestri instruction with bound registers 2865 // inputs: 2866 // xmm - substring 2867 // rax - substring length (elements count) 2868 // mem - scanned string 2869 // rdx - string length (elements count) 2870 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2871 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2872 // outputs: 2873 // rcx - matched index in string 2874 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2875 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2876 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2877 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2878 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2879 2880 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2881 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2882 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2883 2884 // Note, inline_string_indexOf() generates checks: 2885 // if (substr.count > string.count) return -1; 2886 // if (substr.count == 0) return 0; 2887 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2888 2889 // Load substring. 2890 if (ae == StrIntrinsicNode::UL) { 2891 pmovzxbw(vec, Address(str2, 0)); 2892 } else { 2893 movdqu(vec, Address(str2, 0)); 2894 } 2895 movl(cnt2, int_cnt2); 2896 movptr(result, str1); // string addr 2897 2898 if (int_cnt2 > stride) { 2899 jmpb(SCAN_TO_SUBSTR); 2900 2901 // Reload substr for rescan, this code 2902 // is executed only for large substrings (> 8 chars) 2903 bind(RELOAD_SUBSTR); 2904 if (ae == StrIntrinsicNode::UL) { 2905 pmovzxbw(vec, Address(str2, 0)); 2906 } else { 2907 movdqu(vec, Address(str2, 0)); 2908 } 2909 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2910 2911 bind(RELOAD_STR); 2912 // We came here after the beginning of the substring was 2913 // matched but the rest of it was not so we need to search 2914 // again. Start from the next element after the previous match. 2915 2916 // cnt2 is number of substring reminding elements and 2917 // cnt1 is number of string reminding elements when cmp failed. 
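// At this point cnt1/cnt2 hold only the element counts that were still left when the
// tail compare failed; that failing position sits (int_cnt2 - cnt2) elements past the
// candidate start, so cnt1 - cnt2 + int_cnt2 is the string length measured from the
// candidate again, as computed below.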
2918 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2919 subl(cnt1, cnt2); 2920 addl(cnt1, int_cnt2); 2921 movl(cnt2, int_cnt2); // Now restore cnt2 2922 2923 decrementl(cnt1); // Shift to next element 2924 cmpl(cnt1, cnt2); 2925 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2926 2927 addptr(result, (1<<scale1)); 2928 2929 } // (int_cnt2 > 8) 2930 2931 // Scan string for start of substr in 16-byte vectors 2932 bind(SCAN_TO_SUBSTR); 2933 pcmpestri(vec, Address(result, 0), mode); 2934 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2935 subl(cnt1, stride); 2936 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2937 cmpl(cnt1, cnt2); 2938 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2939 addptr(result, 16); 2940 jmpb(SCAN_TO_SUBSTR); 2941 2942 // Found a potential substr 2943 bind(FOUND_CANDIDATE); 2944 // Matched whole vector if first element matched (tmp(rcx) == 0). 2945 if (int_cnt2 == stride) { 2946 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2947 } else { // int_cnt2 > 8 2948 jccb(Assembler::overflow, FOUND_SUBSTR); 2949 } 2950 // After pcmpestri tmp(rcx) contains matched element index 2951 // Compute start addr of substr 2952 lea(result, Address(result, tmp, scale1)); 2953 2954 // Make sure string is still long enough 2955 subl(cnt1, tmp); 2956 cmpl(cnt1, cnt2); 2957 if (int_cnt2 == stride) { 2958 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2959 } else { // int_cnt2 > 8 2960 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2961 } 2962 // Left less than substring. 2963 2964 bind(RET_NOT_FOUND); 2965 movl(result, -1); 2966 jmp(EXIT); 2967 2968 if (int_cnt2 > stride) { 2969 // This code is optimized for the case when whole substring 2970 // is matched if its head is matched. 2971 bind(MATCH_SUBSTR_HEAD); 2972 pcmpestri(vec, Address(result, 0), mode); 2973 // Reload only the string if it does not match 2974 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2975 2976 Label CONT_SCAN_SUBSTR; 2977 // Compare the rest of substring (> 8 chars). 2978 bind(FOUND_SUBSTR); 2979 // First 8 chars are already matched.
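// cnt2 is negated and biased by 'stride' below so the tail scan walks the substring
// in 16-byte chunks using negative offsets from its end; SCAN_SUBSTR repeats while
// cnt2 stays negative, and the final partial chunk is backed up (CONT_SCAN_SUBSTR)
// so no read goes past the end of the substring.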
2980 negptr(cnt2); 2981 addptr(cnt2, stride); 2982 2983 bind(SCAN_SUBSTR); 2984 subl(cnt1, stride); 2985 cmpl(cnt2, -stride); // Do not read beyond substring 2986 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2987 // Back-up strings to avoid reading beyond substring: 2988 // cnt1 = cnt1 - cnt2 + 8 2989 addl(cnt1, cnt2); // cnt2 is negative 2990 addl(cnt1, stride); 2991 movl(cnt2, stride); negptr(cnt2); 2992 bind(CONT_SCAN_SUBSTR); 2993 if (int_cnt2 < (int)G) { 2994 int tail_off1 = int_cnt2<<scale1; 2995 int tail_off2 = int_cnt2<<scale2; 2996 if (ae == StrIntrinsicNode::UL) { 2997 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2998 } else { 2999 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 3000 } 3001 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 3002 } else { 3003 // calculate index in register to avoid integer overflow (int_cnt2*2) 3004 movl(tmp, int_cnt2); 3005 addptr(tmp, cnt2); 3006 if (ae == StrIntrinsicNode::UL) { 3007 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3008 } else { 3009 movdqu(vec, Address(str2, tmp, scale2, 0)); 3010 } 3011 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3012 } 3013 // Need to reload strings pointers if not matched whole vector 3014 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3015 addptr(cnt2, stride); 3016 jcc(Assembler::negative, SCAN_SUBSTR); 3017 // Fall through if found full substring 3018 3019 } // (int_cnt2 > 8) 3020 3021 bind(RET_FOUND); 3022 // Found result if we matched full small substring. 3023 // Compute substr offset 3024 subptr(result, str1); 3025 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3026 shrl(result, 1); // index 3027 } 3028 bind(EXIT); 3029 3030 } // string_indexofC8 3031 3032 // Small strings are loaded through stack if they cross page boundary. 3033 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3034 Register cnt1, Register cnt2, 3035 int int_cnt2, Register result, 3036 XMMRegister vec, Register tmp, 3037 int ae) { 3038 ShortBranchVerifier sbv(this); 3039 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3040 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3041 3042 // 3043 // int_cnt2 is length of small (< 8 chars) constant substring 3044 // or (-1) for non constant substring in which case its length 3045 // is in cnt2 register. 3046 // 3047 // Note, inline_string_indexOf() generates checks: 3048 // if (substr.count > string.count) return -1; 3049 // if (substr.count == 0) return 0; 3050 // 3051 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3052 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3053 // This method uses the pcmpestri instruction with bound registers 3054 // inputs: 3055 // xmm - substring 3056 // rax - substring length (elements count) 3057 // mem - scanned string 3058 // rdx - string length (elements count) 3059 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3060 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3061 // outputs: 3062 // rcx - matched index in string 3063 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3064 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3065 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3066 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3067 3068 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3069 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3070 FOUND_CANDIDATE; 3071 3072 { //======================================================== 3073 // We don't know where these strings are located 3074 // and we can't read beyond them. Load them through stack. 3075 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3076 3077 movptr(tmp, rsp); // save old SP 3078 3079 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3080 if (int_cnt2 == (1>>scale2)) { // One byte 3081 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3082 load_unsigned_byte(result, Address(str2, 0)); 3083 movdl(vec, result); // move 32 bits 3084 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3085 // Not enough header space in 32-bit VM: 12+3 = 15. 3086 movl(result, Address(str2, -1)); 3087 shrl(result, 8); 3088 movdl(vec, result); // move 32 bits 3089 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3090 load_unsigned_short(result, Address(str2, 0)); 3091 movdl(vec, result); // move 32 bits 3092 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3093 movdl(vec, Address(str2, 0)); // move 32 bits 3094 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3095 movq(vec, Address(str2, 0)); // move 64 bits 3096 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3097 // Array header size is 12 bytes in 32-bit VM 3098 // + 6 bytes for 3 chars == 18 bytes, 3099 // enough space to load vec and shift. 3100 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3101 if (ae == StrIntrinsicNode::UL) { 3102 int tail_off = int_cnt2-8; 3103 pmovzxbw(vec, Address(str2, tail_off)); 3104 psrldq(vec, -2*tail_off); 3105 } 3106 else { 3107 int tail_off = int_cnt2*(1<<scale2); 3108 movdqu(vec, Address(str2, tail_off-16)); 3109 psrldq(vec, 16-tail_off); 3110 } 3111 } 3112 } else { // not constant substring 3113 cmpl(cnt2, stride); 3114 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3115 3116 // We can read beyond string if srt+16 does not cross page boundary 3117 // since heaps are aligned and mapped by pages. 3118 assert(os::vm_page_size() < (int)G, "default page should be small"); 3119 movl(result, str2); // We need only low 32 bits 3120 andl(result, ((int)os::vm_page_size()-1)); 3121 cmpl(result, ((int)os::vm_page_size()-16)); 3122 jccb(Assembler::belowEqual, CHECK_STR); 3123 3124 // Move small strings to stack to allow load 16 bytes into vec. 3125 subptr(rsp, 16); 3126 int stk_offset = wordSize-(1<<scale2); 3127 push(cnt2); 3128 3129 bind(COPY_SUBSTR); 3130 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3131 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3132 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3133 } else if (ae == StrIntrinsicNode::UU) { 3134 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3135 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3136 } 3137 decrement(cnt2); 3138 jccb(Assembler::notZero, COPY_SUBSTR); 3139 3140 pop(cnt2); 3141 movptr(str2, rsp); // New substring address 3142 } // non constant 3143 3144 bind(CHECK_STR); 3145 cmpl(cnt1, stride); 3146 jccb(Assembler::aboveEqual, BIG_STRINGS); 3147 3148 // Check cross page boundary. 
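  // A 16-byte load starting at address p stays inside p's page iff
  //   (p & (page_size - 1)) <= page_size - 16
  // and the code below performs exactly that test on the low 32 bits of str1
  // (the page size is a power of two, so only the low bits matter).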
3149 movl(result, str1); // We need only low 32 bits 3150 andl(result, ((int)os::vm_page_size()-1)); 3151 cmpl(result, ((int)os::vm_page_size()-16)); 3152 jccb(Assembler::belowEqual, BIG_STRINGS); 3153 3154 subptr(rsp, 16); 3155 int stk_offset = -(1<<scale1); 3156 if (int_cnt2 < 0) { // not constant 3157 push(cnt2); 3158 stk_offset += wordSize; 3159 } 3160 movl(cnt2, cnt1); 3161 3162 bind(COPY_STR); 3163 if (ae == StrIntrinsicNode::LL) { 3164 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3165 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3166 } else { 3167 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3168 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3169 } 3170 decrement(cnt2); 3171 jccb(Assembler::notZero, COPY_STR); 3172 3173 if (int_cnt2 < 0) { // not constant 3174 pop(cnt2); 3175 } 3176 movptr(str1, rsp); // New string address 3177 3178 bind(BIG_STRINGS); 3179 // Load substring. 3180 if (int_cnt2 < 0) { // -1 3181 if (ae == StrIntrinsicNode::UL) { 3182 pmovzxbw(vec, Address(str2, 0)); 3183 } else { 3184 movdqu(vec, Address(str2, 0)); 3185 } 3186 push(cnt2); // substr count 3187 push(str2); // substr addr 3188 push(str1); // string addr 3189 } else { 3190 // Small (< 8 chars) constant substrings are loaded already. 3191 movl(cnt2, int_cnt2); 3192 } 3193 push(tmp); // original SP 3194 3195 } // Finished loading 3196 3197 //======================================================== 3198 // Start search 3199 // 3200 3201 movptr(result, str1); // string addr 3202 3203 if (int_cnt2 < 0) { // Only for non constant substring 3204 jmpb(SCAN_TO_SUBSTR); 3205 3206 // SP saved at sp+0 3207 // String saved at sp+1*wordSize 3208 // Substr saved at sp+2*wordSize 3209 // Substr count saved at sp+3*wordSize 3210 3211 // Reload substr for rescan, this code 3212 // is executed only for large substrings (> 8 chars) 3213 bind(RELOAD_SUBSTR); 3214 movptr(str2, Address(rsp, 2*wordSize)); 3215 movl(cnt2, Address(rsp, 3*wordSize)); 3216 if (ae == StrIntrinsicNode::UL) { 3217 pmovzxbw(vec, Address(str2, 0)); 3218 } else { 3219 movdqu(vec, Address(str2, 0)); 3220 } 3221 // We came here after the beginning of the substring was 3222 // matched but the rest of it was not so we need to search 3223 // again. Start from the next element after the previous match. 3224 subptr(str1, result); // Restore counter 3225 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3226 shrl(str1, 1); 3227 } 3228 addl(cnt1, str1); 3229 decrementl(cnt1); // Shift to next element 3230 cmpl(cnt1, cnt2); 3231 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3232 3233 addptr(result, (1<<scale1)); 3234 } // non constant 3235 3236 // Scan string for start of substr in 16-byte vectors 3237 bind(SCAN_TO_SUBSTR); 3238 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3239 pcmpestri(vec, Address(result, 0), mode); 3240 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3241 subl(cnt1, stride); 3242 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3243 cmpl(cnt1, cnt2); 3244 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3245 addptr(result, 16); 3246 3247 bind(ADJUST_STR); 3248 cmpl(cnt1, stride); // Do not read beyond string 3249 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3250 // Back-up string to avoid reading beyond string. 
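  // Fewer than 'stride' elements of the string remain. Move 'result' back so
  // that the final 16-byte load ends exactly at the end of the string
  // (result += cnt1*element_size - 16) and pretend a full vector is left
  // (cnt1 = stride); the elements that get re-scanned were already rejected,
  // so re-checking them is harmless.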
3251 lea(result, Address(result, cnt1, scale1, -16)); 3252 movl(cnt1, stride); 3253 jmpb(SCAN_TO_SUBSTR); 3254 3255 // Found a potential substr 3256 bind(FOUND_CANDIDATE); 3257 // After pcmpestri tmp(rcx) contains matched element index 3258 3259 // Make sure string is still long enough 3260 subl(cnt1, tmp); 3261 cmpl(cnt1, cnt2); 3262 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3263 // Left less then substring. 3264 3265 bind(RET_NOT_FOUND); 3266 movl(result, -1); 3267 jmp(CLEANUP); 3268 3269 bind(FOUND_SUBSTR); 3270 // Compute start addr of substr 3271 lea(result, Address(result, tmp, scale1)); 3272 if (int_cnt2 > 0) { // Constant substring 3273 // Repeat search for small substring (< 8 chars) 3274 // from new point without reloading substring. 3275 // Have to check that we don't read beyond string. 3276 cmpl(tmp, stride-int_cnt2); 3277 jccb(Assembler::greater, ADJUST_STR); 3278 // Fall through if matched whole substring. 3279 } else { // non constant 3280 assert(int_cnt2 == -1, "should be != 0"); 3281 3282 addl(tmp, cnt2); 3283 // Found result if we matched whole substring. 3284 cmpl(tmp, stride); 3285 jcc(Assembler::lessEqual, RET_FOUND); 3286 3287 // Repeat search for small substring (<= 8 chars) 3288 // from new point 'str1' without reloading substring. 3289 cmpl(cnt2, stride); 3290 // Have to check that we don't read beyond string. 3291 jccb(Assembler::lessEqual, ADJUST_STR); 3292 3293 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3294 // Compare the rest of substring (> 8 chars). 3295 movptr(str1, result); 3296 3297 cmpl(tmp, cnt2); 3298 // First 8 chars are already matched. 3299 jccb(Assembler::equal, CHECK_NEXT); 3300 3301 bind(SCAN_SUBSTR); 3302 pcmpestri(vec, Address(str1, 0), mode); 3303 // Need to reload strings pointers if not matched whole vector 3304 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3305 3306 bind(CHECK_NEXT); 3307 subl(cnt2, stride); 3308 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3309 addptr(str1, 16); 3310 if (ae == StrIntrinsicNode::UL) { 3311 addptr(str2, 8); 3312 } else { 3313 addptr(str2, 16); 3314 } 3315 subl(cnt1, stride); 3316 cmpl(cnt2, stride); // Do not read beyond substring 3317 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3318 // Back-up strings to avoid reading beyond substring. 
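  // Fewer than 'stride' elements of the substring remain. Back both pointers
  // up so that the last full-width load ends exactly at the end of the
  // substring, then adjust cnt1/cnt2 for the elements that will be compared
  // a second time.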
3319 3320 if (ae == StrIntrinsicNode::UL) { 3321 lea(str2, Address(str2, cnt2, scale2, -8)); 3322 lea(str1, Address(str1, cnt2, scale1, -16)); 3323 } else { 3324 lea(str2, Address(str2, cnt2, scale2, -16)); 3325 lea(str1, Address(str1, cnt2, scale1, -16)); 3326 } 3327 subl(cnt1, cnt2); 3328 movl(cnt2, stride); 3329 addl(cnt1, stride); 3330 bind(CONT_SCAN_SUBSTR); 3331 if (ae == StrIntrinsicNode::UL) { 3332 pmovzxbw(vec, Address(str2, 0)); 3333 } else { 3334 movdqu(vec, Address(str2, 0)); 3335 } 3336 jmp(SCAN_SUBSTR); 3337 3338 bind(RET_FOUND_LONG); 3339 movptr(str1, Address(rsp, wordSize)); 3340 } // non constant 3341 3342 bind(RET_FOUND); 3343 // Compute substr offset 3344 subptr(result, str1); 3345 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3346 shrl(result, 1); // index 3347 } 3348 bind(CLEANUP); 3349 pop(rsp); // restore SP 3350 3351 } // string_indexof 3352 3353 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3354 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3355 ShortBranchVerifier sbv(this); 3356 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3357 3358 int stride = 8; 3359 3360 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3361 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3362 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3363 FOUND_SEQ_CHAR, DONE_LABEL; 3364 3365 movptr(result, str1); 3366 if (UseAVX >= 2) { 3367 cmpl(cnt1, stride); 3368 jcc(Assembler::less, SCAN_TO_CHAR); 3369 cmpl(cnt1, 2*stride); 3370 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3371 movdl(vec1, ch); 3372 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3373 vpxor(vec2, vec2); 3374 movl(tmp, cnt1); 3375 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3376 andl(cnt1,0x0000000F); //tail count (in chars) 3377 3378 bind(SCAN_TO_16_CHAR_LOOP); 3379 vmovdqu(vec3, Address(result, 0)); 3380 vpcmpeqw(vec3, vec3, vec1, 1); 3381 vptest(vec2, vec3); 3382 jcc(Assembler::carryClear, FOUND_CHAR); 3383 addptr(result, 32); 3384 subl(tmp, 2*stride); 3385 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3386 jmp(SCAN_TO_8_CHAR); 3387 bind(SCAN_TO_8_CHAR_INIT); 3388 movdl(vec1, ch); 3389 pshuflw(vec1, vec1, 0x00); 3390 pshufd(vec1, vec1, 0); 3391 pxor(vec2, vec2); 3392 } 3393 bind(SCAN_TO_8_CHAR); 3394 cmpl(cnt1, stride); 3395 jcc(Assembler::less, SCAN_TO_CHAR); 3396 if (UseAVX < 2) { 3397 movdl(vec1, ch); 3398 pshuflw(vec1, vec1, 0x00); 3399 pshufd(vec1, vec1, 0); 3400 pxor(vec2, vec2); 3401 } 3402 movl(tmp, cnt1); 3403 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3404 andl(cnt1,0x00000007); //tail count (in chars) 3405 3406 bind(SCAN_TO_8_CHAR_LOOP); 3407 movdqu(vec3, Address(result, 0)); 3408 pcmpeqw(vec3, vec1); 3409 ptest(vec2, vec3); 3410 jcc(Assembler::carryClear, FOUND_CHAR); 3411 addptr(result, 16); 3412 subl(tmp, stride); 3413 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3414 bind(SCAN_TO_CHAR); 3415 testl(cnt1, cnt1); 3416 jcc(Assembler::zero, RET_NOT_FOUND); 3417 bind(SCAN_TO_CHAR_LOOP); 3418 load_unsigned_short(tmp, Address(result, 0)); 3419 cmpl(ch, tmp); 3420 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3421 addptr(result, 2); 3422 subl(cnt1, 1); 3423 jccb(Assembler::zero, RET_NOT_FOUND); 3424 jmp(SCAN_TO_CHAR_LOOP); 3425 3426 bind(RET_NOT_FOUND); 3427 movl(result, -1); 3428 jmpb(DONE_LABEL); 3429 3430 bind(FOUND_CHAR); 3431 if (UseAVX >= 2) { 3432 vpmovmskb(tmp, vec3); 3433 } else { 3434 pmovmskb(tmp, vec3); 3435 } 3436 bsfl(ch, tmp); 3437 addptr(result, ch); 3438 3439 bind(FOUND_SEQ_CHAR); 3440 
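  // 'result' currently holds the address of the matching char; turn it into
  // a char index relative to str1: (result - str1) / 2, since UTF-16 chars
  // are two bytes wide.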
subptr(result, str1); 3441 shrl(result, 1); 3442 3443 bind(DONE_LABEL); 3444 } // string_indexof_char 3445 3446 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3447 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3448 ShortBranchVerifier sbv(this); 3449 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3450 3451 int stride = 16; 3452 3453 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3454 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3455 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3456 FOUND_SEQ_CHAR, DONE_LABEL; 3457 3458 movptr(result, str1); 3459 if (UseAVX >= 2) { 3460 cmpl(cnt1, stride); 3461 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3462 cmpl(cnt1, stride*2); 3463 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3464 movdl(vec1, ch); 3465 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3466 vpxor(vec2, vec2); 3467 movl(tmp, cnt1); 3468 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3469 andl(cnt1,0x0000001F); //tail count (in chars) 3470 3471 bind(SCAN_TO_32_CHAR_LOOP); 3472 vmovdqu(vec3, Address(result, 0)); 3473 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3474 vptest(vec2, vec3); 3475 jcc(Assembler::carryClear, FOUND_CHAR); 3476 addptr(result, 32); 3477 subl(tmp, stride*2); 3478 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3479 jmp(SCAN_TO_16_CHAR); 3480 3481 bind(SCAN_TO_16_CHAR_INIT); 3482 movdl(vec1, ch); 3483 pxor(vec2, vec2); 3484 pshufb(vec1, vec2); 3485 } 3486 3487 bind(SCAN_TO_16_CHAR); 3488 cmpl(cnt1, stride); 3489 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3490 if (UseAVX < 2) { 3491 movdl(vec1, ch); 3492 pxor(vec2, vec2); 3493 pshufb(vec1, vec2); 3494 } 3495 movl(tmp, cnt1); 3496 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3497 andl(cnt1,0x0000000F); //tail count (in bytes) 3498 3499 bind(SCAN_TO_16_CHAR_LOOP); 3500 movdqu(vec3, Address(result, 0)); 3501 pcmpeqb(vec3, vec1); 3502 ptest(vec2, vec3); 3503 jcc(Assembler::carryClear, FOUND_CHAR); 3504 addptr(result, 16); 3505 subl(tmp, stride); 3506 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
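  // Scalar tail: at most 15 bytes remain on every path that reaches
  // SCAN_TO_CHAR_INIT, so the leftover bytes are compared one at a time.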
3507 3508 bind(SCAN_TO_CHAR_INIT); 3509 testl(cnt1, cnt1); 3510 jcc(Assembler::zero, RET_NOT_FOUND); 3511 bind(SCAN_TO_CHAR_LOOP); 3512 load_unsigned_byte(tmp, Address(result, 0)); 3513 cmpl(ch, tmp); 3514 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3515 addptr(result, 1); 3516 subl(cnt1, 1); 3517 jccb(Assembler::zero, RET_NOT_FOUND); 3518 jmp(SCAN_TO_CHAR_LOOP); 3519 3520 bind(RET_NOT_FOUND); 3521 movl(result, -1); 3522 jmpb(DONE_LABEL); 3523 3524 bind(FOUND_CHAR); 3525 if (UseAVX >= 2) { 3526 vpmovmskb(tmp, vec3); 3527 } else { 3528 pmovmskb(tmp, vec3); 3529 } 3530 bsfl(ch, tmp); 3531 addptr(result, ch); 3532 3533 bind(FOUND_SEQ_CHAR); 3534 subptr(result, str1); 3535 3536 bind(DONE_LABEL); 3537 } // stringL_indexof_char 3538 3539 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3540 switch (eltype) { 3541 case T_BOOLEAN: return sizeof(jboolean); 3542 case T_BYTE: return sizeof(jbyte); 3543 case T_SHORT: return sizeof(jshort); 3544 case T_CHAR: return sizeof(jchar); 3545 case T_INT: return sizeof(jint); 3546 default: 3547 ShouldNotReachHere(); 3548 return -1; 3549 } 3550 } 3551 3552 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3553 switch (eltype) { 3554 // T_BOOLEAN used as surrogate for unsigned byte 3555 case T_BOOLEAN: movzbl(dst, src); break; 3556 case T_BYTE: movsbl(dst, src); break; 3557 case T_SHORT: movswl(dst, src); break; 3558 case T_CHAR: movzwl(dst, src); break; 3559 case T_INT: movl(dst, src); break; 3560 default: 3561 ShouldNotReachHere(); 3562 } 3563 } 3564 3565 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3566 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3567 } 3568 3569 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3570 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3571 } 3572 3573 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3574 const int vlen = Assembler::AVX_256bit; 3575 switch (eltype) { 3576 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3577 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3578 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3579 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3580 case T_INT: 3581 // do nothing 3582 break; 3583 default: 3584 ShouldNotReachHere(); 3585 } 3586 } 3587 3588 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3589 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3590 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3591 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3592 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3593 BasicType eltype) { 3594 ShortBranchVerifier sbv(this); 3595 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3596 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3597 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3598 3599 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3600 SHORT_UNROLLED_LOOP_EXIT, 3601 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3602 UNROLLED_VECTOR_LOOP_BEGIN, 3603 END; 3604 switch (eltype) { 3605 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3606 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3607 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3608 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3609 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3610 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3611 } 3612 3613 // For "renaming" for readibility of the code 3614 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3615 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3616 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3617 3618 const int elsize = arrays_hashcode_elsize(eltype); 3619 3620 /* 3621 if (cnt1 >= 2) { 3622 if (cnt1 >= 32) { 3623 UNROLLED VECTOR LOOP 3624 } 3625 UNROLLED SCALAR LOOP 3626 } 3627 SINGLE SCALAR 3628 */ 3629 3630 cmpl(cnt1, 32); 3631 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3632 3633 // cnt1 >= 32 && generate_vectorized_loop 3634 xorl(index, index); 3635 3636 // vresult = IntVector.zero(I256); 3637 for (int idx = 0; idx < 4; idx++) { 3638 vpxor(vresult[idx], vresult[idx]); 3639 } 3640 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3641 Register bound = tmp2; 3642 Register next = tmp3; 3643 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3644 movl(next, Address(tmp2, 0)); 3645 movdl(vnext, next); 3646 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3647 3648 // index = 0; 3649 // bound = cnt1 & ~(32 - 1); 3650 movl(bound, cnt1); 3651 andl(bound, ~(32 - 1)); 3652 // for (; index < bound; index += 32) { 3653 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3654 // result *= next; 3655 imull(result, next); 3656 // loop fission to upfront the cost of fetching from memory, OOO execution 3657 // can then hopefully do a better job of prefetching 3658 for (int idx = 0; idx < 4; idx++) { 3659 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3660 } 3661 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3662 for (int idx = 0; idx < 4; idx++) { 3663 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3664 arrays_hashcode_elvcast(vtmp[idx], eltype); 3665 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3666 } 3667 // index += 32; 3668 addl(index, 32); 3669 // index < bound; 3670 cmpl(index, bound); 3671 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3672 // } 3673 3674 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3675 subl(cnt1, bound); 3676 // release bound 3677 3678 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3679 for (int idx = 0; idx < 4; idx++) { 3680 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3681 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3682 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3683 } 3684 // result += vresult.reduceLanes(ADD); 3685 for (int idx = 0; idx < 4; idx++) { 3686 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3687 } 3688 3689 // } else if (cnt1 < 32) { 3690 3691 bind(SHORT_UNROLLED_BEGIN); 3692 // int i = 1; 3693 movl(index, 1); 3694 cmpl(index, cnt1); 3695 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3696 3697 // for (; i < cnt1 ; i += 2) { 3698 bind(SHORT_UNROLLED_LOOP_BEGIN); 3699 movl(tmp3, 961); 3700 imull(result, tmp3); 3701 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3702 movl(tmp3, tmp2); 3703 shll(tmp3, 5); 3704 subl(tmp3, tmp2); 3705 addl(result, tmp3); 3706 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3707 addl(result, tmp3); 3708 addl(index, 2); 3709 cmpl(index, cnt1); 3710 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3711 3712 // } 3713 // if (i >= cnt1) { 3714 bind(SHORT_UNROLLED_LOOP_EXIT); 3715 jccb(Assembler::greater, END); 3716 movl(tmp2, result); 3717 shll(result, 5); 3718 subl(result, tmp2); 3719 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3720 addl(result, tmp3); 3721 // } 3722 bind(END); 3723 3724 BLOCK_COMMENT("} // arrays_hashcode"); 3725 3726 } // arrays_hashcode 3727 3728 // helper function for string_compare 3729 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3730 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3731 Address::ScaleFactor scale2, Register index, int ae) { 3732 if (ae == StrIntrinsicNode::LL) { 3733 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3734 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3735 } else if (ae == StrIntrinsicNode::UU) { 3736 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3737 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3738 } else { 3739 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3740 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3741 } 3742 } 3743 3744 // Compare strings, used for char[] and byte[]. 3745 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3746 Register cnt1, Register cnt2, Register result, 3747 XMMRegister vec1, int ae, KRegister mask) { 3748 ShortBranchVerifier sbv(this); 3749 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3750 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3751 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3752 int stride2x2 = 0x40; 3753 Address::ScaleFactor scale = Address::no_scale; 3754 Address::ScaleFactor scale1 = Address::no_scale; 3755 Address::ScaleFactor scale2 = Address::no_scale; 3756 3757 if (ae != StrIntrinsicNode::LL) { 3758 stride2x2 = 0x20; 3759 } 3760 3761 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3762 shrl(cnt2, 1); 3763 } 3764 // Compute the minimum of the string lengths and the 3765 // difference of the string lengths (stack). 3766 // Do the conditional move stuff 3767 movl(result, cnt1); 3768 subl(cnt1, cnt2); 3769 push(cnt1); 3770 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3771 3772 // Is the minimum length zero? 
3773 testl(cnt2, cnt2); 3774 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3775 if (ae == StrIntrinsicNode::LL) { 3776 // Load first bytes 3777 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3778 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3779 } else if (ae == StrIntrinsicNode::UU) { 3780 // Load first characters 3781 load_unsigned_short(result, Address(str1, 0)); 3782 load_unsigned_short(cnt1, Address(str2, 0)); 3783 } else { 3784 load_unsigned_byte(result, Address(str1, 0)); 3785 load_unsigned_short(cnt1, Address(str2, 0)); 3786 } 3787 subl(result, cnt1); 3788 jcc(Assembler::notZero, POP_LABEL); 3789 3790 if (ae == StrIntrinsicNode::UU) { 3791 // Divide length by 2 to get number of chars 3792 shrl(cnt2, 1); 3793 } 3794 cmpl(cnt2, 1); 3795 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3796 3797 // Check if the strings start at the same location and setup scale and stride 3798 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3799 cmpptr(str1, str2); 3800 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3801 if (ae == StrIntrinsicNode::LL) { 3802 scale = Address::times_1; 3803 stride = 16; 3804 } else { 3805 scale = Address::times_2; 3806 stride = 8; 3807 } 3808 } else { 3809 scale1 = Address::times_1; 3810 scale2 = Address::times_2; 3811 // scale not used 3812 stride = 8; 3813 } 3814 3815 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3816 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3817 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3818 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3819 Label COMPARE_TAIL_LONG; 3820 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3821 3822 int pcmpmask = 0x19; 3823 if (ae == StrIntrinsicNode::LL) { 3824 pcmpmask &= ~0x01; 3825 } 3826 3827 // Setup to compare 16-chars (32-bytes) vectors, 3828 // start from first character again because it has aligned address. 3829 if (ae == StrIntrinsicNode::LL) { 3830 stride2 = 32; 3831 } else { 3832 stride2 = 16; 3833 } 3834 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3835 adr_stride = stride << scale; 3836 } else { 3837 adr_stride1 = 8; //stride << scale1; 3838 adr_stride2 = 16; //stride << scale2; 3839 } 3840 3841 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3842 // rax and rdx are used by pcmpestri as elements counters 3843 movl(result, cnt2); 3844 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3845 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3846 3847 // fast path : compare first 2 8-char vectors. 
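  // pcmpmask selects the 'equal each' aggregation with negated result: after
  // pcmpestri, CF == 1 ('below') means at least one element pair differs and
  // rcx holds the index of the first mismatch; bit 0 of the mask picks byte
  // (LL) vs. word elements.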
3848 bind(COMPARE_16_CHARS); 3849 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3850 movdqu(vec1, Address(str1, 0)); 3851 } else { 3852 pmovzxbw(vec1, Address(str1, 0)); 3853 } 3854 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3855 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3856 3857 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3858 movdqu(vec1, Address(str1, adr_stride)); 3859 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3860 } else { 3861 pmovzxbw(vec1, Address(str1, adr_stride1)); 3862 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3863 } 3864 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3865 addl(cnt1, stride); 3866 3867 // Compare the characters at index in cnt1 3868 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3869 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3870 subl(result, cnt2); 3871 jmp(POP_LABEL); 3872 3873 // Setup the registers to start vector comparison loop 3874 bind(COMPARE_WIDE_VECTORS); 3875 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3876 lea(str1, Address(str1, result, scale)); 3877 lea(str2, Address(str2, result, scale)); 3878 } else { 3879 lea(str1, Address(str1, result, scale1)); 3880 lea(str2, Address(str2, result, scale2)); 3881 } 3882 subl(result, stride2); 3883 subl(cnt2, stride2); 3884 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3885 negptr(result); 3886 3887 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3888 bind(COMPARE_WIDE_VECTORS_LOOP); 3889 3890 #ifdef _LP64 3891 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3892 cmpl(cnt2, stride2x2); 3893 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3894 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3895 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3896 3897 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3898 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3899 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3900 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3901 } else { 3902 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3903 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3904 } 3905 kortestql(mask, mask); 3906 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3907 addptr(result, stride2x2); // update since we already compared at this addr 3908 subl(cnt2, stride2x2); // and sub the size too 3909 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3910 3911 vpxor(vec1, vec1); 3912 jmpb(COMPARE_WIDE_TAIL); 3913 }//if (VM_Version::supports_avx512vlbw()) 3914 #endif // _LP64 3915 3916 3917 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3918 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3919 vmovdqu(vec1, Address(str1, result, scale)); 3920 vpxor(vec1, Address(str2, result, scale)); 3921 } else { 3922 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3923 vpxor(vec1, Address(str2, result, scale2)); 3924 } 3925 vptest(vec1, vec1); 3926 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3927 addptr(result, stride2); 3928 subl(cnt2, stride2); 3929 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3930 // clean upper bits of YMM registers 
3931 vpxor(vec1, vec1); 3932 3933 // compare wide vectors tail 3934 bind(COMPARE_WIDE_TAIL); 3935 testptr(result, result); 3936 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3937 3938 movl(result, stride2); 3939 movl(cnt2, result); 3940 negptr(result); 3941 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3942 3943 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3944 bind(VECTOR_NOT_EQUAL); 3945 // clean upper bits of YMM registers 3946 vpxor(vec1, vec1); 3947 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3948 lea(str1, Address(str1, result, scale)); 3949 lea(str2, Address(str2, result, scale)); 3950 } else { 3951 lea(str1, Address(str1, result, scale1)); 3952 lea(str2, Address(str2, result, scale2)); 3953 } 3954 jmp(COMPARE_16_CHARS); 3955 3956 // Compare tail chars, length between 1 to 15 chars 3957 bind(COMPARE_TAIL_LONG); 3958 movl(cnt2, result); 3959 cmpl(cnt2, stride); 3960 jcc(Assembler::less, COMPARE_SMALL_STR); 3961 3962 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3963 movdqu(vec1, Address(str1, 0)); 3964 } else { 3965 pmovzxbw(vec1, Address(str1, 0)); 3966 } 3967 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3968 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3969 subptr(cnt2, stride); 3970 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3971 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3972 lea(str1, Address(str1, result, scale)); 3973 lea(str2, Address(str2, result, scale)); 3974 } else { 3975 lea(str1, Address(str1, result, scale1)); 3976 lea(str2, Address(str2, result, scale2)); 3977 } 3978 negptr(cnt2); 3979 jmpb(WHILE_HEAD_LABEL); 3980 3981 bind(COMPARE_SMALL_STR); 3982 } else if (UseSSE42Intrinsics) { 3983 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3984 int pcmpmask = 0x19; 3985 // Setup to compare 8-char (16-byte) vectors, 3986 // start from first character again because it has aligned address. 
3987 movl(result, cnt2); 3988 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3989 if (ae == StrIntrinsicNode::LL) { 3990 pcmpmask &= ~0x01; 3991 } 3992 jcc(Assembler::zero, COMPARE_TAIL); 3993 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3994 lea(str1, Address(str1, result, scale)); 3995 lea(str2, Address(str2, result, scale)); 3996 } else { 3997 lea(str1, Address(str1, result, scale1)); 3998 lea(str2, Address(str2, result, scale2)); 3999 } 4000 negptr(result); 4001 4002 // pcmpestri 4003 // inputs: 4004 // vec1- substring 4005 // rax - negative string length (elements count) 4006 // mem - scanned string 4007 // rdx - string length (elements count) 4008 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4009 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4010 // outputs: 4011 // rcx - first mismatched element index 4012 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4013 4014 bind(COMPARE_WIDE_VECTORS); 4015 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4016 movdqu(vec1, Address(str1, result, scale)); 4017 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4018 } else { 4019 pmovzxbw(vec1, Address(str1, result, scale1)); 4020 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4021 } 4022 // After pcmpestri cnt1(rcx) contains mismatched element index 4023 4024 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4025 addptr(result, stride); 4026 subptr(cnt2, stride); 4027 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4028 4029 // compare wide vectors tail 4030 testptr(result, result); 4031 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4032 4033 movl(cnt2, stride); 4034 movl(result, stride); 4035 negptr(result); 4036 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4037 movdqu(vec1, Address(str1, result, scale)); 4038 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4039 } else { 4040 pmovzxbw(vec1, Address(str1, result, scale1)); 4041 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4042 } 4043 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4044 4045 // Mismatched characters in the vectors 4046 bind(VECTOR_NOT_EQUAL); 4047 addptr(cnt1, result); 4048 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4049 subl(result, cnt2); 4050 jmpb(POP_LABEL); 4051 4052 bind(COMPARE_TAIL); // limit is zero 4053 movl(cnt2, result); 4054 // Fallthru to tail compare 4055 } 4056 // Shift str2 and str1 to the end of the arrays, negate min 4057 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4058 lea(str1, Address(str1, cnt2, scale)); 4059 lea(str2, Address(str2, cnt2, scale)); 4060 } else { 4061 lea(str1, Address(str1, cnt2, scale1)); 4062 lea(str2, Address(str2, cnt2, scale2)); 4063 } 4064 decrementl(cnt2); // first character was compared already 4065 negptr(cnt2); 4066 4067 // Compare the rest of the elements 4068 bind(WHILE_HEAD_LABEL); 4069 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4070 subl(result, cnt1); 4071 jccb(Assembler::notZero, POP_LABEL); 4072 increment(cnt2); 4073 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4074 4075 // Strings are equal up to min length. Return the length difference. 
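  // The value popped here is the cnt1 - cnt2 difference pushed near the start
  // of string_compare, i.e. the compareTo-style result when one string is a
  // prefix of the other; for UU it is still a byte difference and is halved
  // below.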
4076 bind(LENGTH_DIFF_LABEL); 4077 pop(result); 4078 if (ae == StrIntrinsicNode::UU) { 4079 // Divide diff by 2 to get number of chars 4080 sarl(result, 1); 4081 } 4082 jmpb(DONE_LABEL); 4083 4084 #ifdef _LP64 4085 if (VM_Version::supports_avx512vlbw()) { 4086 4087 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4088 4089 kmovql(cnt1, mask); 4090 notq(cnt1); 4091 bsfq(cnt2, cnt1); 4092 if (ae != StrIntrinsicNode::LL) { 4093 // Divide diff by 2 to get number of chars 4094 sarl(cnt2, 1); 4095 } 4096 addq(result, cnt2); 4097 if (ae == StrIntrinsicNode::LL) { 4098 load_unsigned_byte(cnt1, Address(str2, result)); 4099 load_unsigned_byte(result, Address(str1, result)); 4100 } else if (ae == StrIntrinsicNode::UU) { 4101 load_unsigned_short(cnt1, Address(str2, result, scale)); 4102 load_unsigned_short(result, Address(str1, result, scale)); 4103 } else { 4104 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4105 load_unsigned_byte(result, Address(str1, result, scale1)); 4106 } 4107 subl(result, cnt1); 4108 jmpb(POP_LABEL); 4109 }//if (VM_Version::supports_avx512vlbw()) 4110 #endif // _LP64 4111 4112 // Discard the stored length difference 4113 bind(POP_LABEL); 4114 pop(cnt1); 4115 4116 // That's it 4117 bind(DONE_LABEL); 4118 if(ae == StrIntrinsicNode::UL) { 4119 negl(result); 4120 } 4121 4122 } 4123 4124 // Search for Non-ASCII character (Negative byte value) in a byte array, 4125 // return the index of the first such character, otherwise the length 4126 // of the array segment searched. 4127 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4128 // @IntrinsicCandidate 4129 // public static int countPositives(byte[] ba, int off, int len) { 4130 // for (int i = off; i < off + len; i++) { 4131 // if (ba[i] < 0) { 4132 // return i - off; 4133 // } 4134 // } 4135 // return len; 4136 // } 4137 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4138 Register result, Register tmp1, 4139 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4140 // rsi: byte array 4141 // rcx: len 4142 // rax: result 4143 ShortBranchVerifier sbv(this); 4144 assert_different_registers(ary1, len, result, tmp1); 4145 assert_different_registers(vec1, vec2); 4146 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4147 4148 movl(result, len); // copy 4149 // len == 0 4150 testl(len, len); 4151 jcc(Assembler::zero, DONE); 4152 4153 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4154 VM_Version::supports_avx512vlbw() && 4155 VM_Version::supports_bmi2()) { 4156 4157 Label test_64_loop, test_tail, BREAK_LOOP; 4158 movl(tmp1, len); 4159 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4160 4161 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4162 andl(len, 0xffffffc0); // vector count (in chars) 4163 jccb(Assembler::zero, test_tail); 4164 4165 lea(ary1, Address(ary1, len, Address::times_1)); 4166 negptr(len); 4167 4168 bind(test_64_loop); 4169 // Check whether our 64 elements of size byte contain negatives 4170 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4171 kortestql(mask1, mask1); 4172 jcc(Assembler::notZero, BREAK_LOOP); 4173 4174 addptr(len, 64); 4175 jccb(Assembler::notZero, test_64_loop); 4176 4177 bind(test_tail); 4178 // bail out when there is nothing to be done 4179 testl(tmp1, -1); 4180 jcc(Assembler::zero, DONE); 4181 4182 4183 // check the tail for absense of negatives 4184 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4185 #ifdef _LP64 4186 { 4187 
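  // Build a k-register mask with tmp1 consecutive low 1-bits:
  //   mask2 = ~(~0UL << tmp1) == (1UL << tmp1) - 1
  // tmp1 is in [1, 63] here, so the shift amount is always valid.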
Register tmp3_aliased = len; 4188 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4189 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4190 notq(tmp3_aliased); 4191 kmovql(mask2, tmp3_aliased); 4192 } 4193 #else 4194 Label k_init; 4195 jmp(k_init); 4196 4197 // We could not read 64-bits from a general purpose register thus we move 4198 // data required to compose 64 1's to the instruction stream 4199 // We emit 64 byte wide series of elements from 0..63 which later on would 4200 // be used as a compare targets with tail count contained in tmp1 register. 4201 // Result would be a k register having tmp1 consecutive number or 1 4202 // counting from least significant bit. 4203 address tmp = pc(); 4204 emit_int64(0x0706050403020100); 4205 emit_int64(0x0F0E0D0C0B0A0908); 4206 emit_int64(0x1716151413121110); 4207 emit_int64(0x1F1E1D1C1B1A1918); 4208 emit_int64(0x2726252423222120); 4209 emit_int64(0x2F2E2D2C2B2A2928); 4210 emit_int64(0x3736353433323130); 4211 emit_int64(0x3F3E3D3C3B3A3938); 4212 4213 bind(k_init); 4214 lea(len, InternalAddress(tmp)); 4215 // create mask to test for negative byte inside a vector 4216 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4217 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4218 4219 #endif 4220 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4221 ktestq(mask1, mask2); 4222 jcc(Assembler::zero, DONE); 4223 4224 // do a full check for negative registers in the tail 4225 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4226 // ary1 already pointing to the right place 4227 jmpb(TAIL_START); 4228 4229 bind(BREAK_LOOP); 4230 // At least one byte in the last 64 byte block was negative. 4231 // Set up to look at the last 64 bytes as if they were a tail 4232 lea(ary1, Address(ary1, len, Address::times_1)); 4233 addptr(result, len); 4234 // Ignore the very last byte: if all others are positive, 4235 // it must be negative, so we can skip right to the 2+1 byte 4236 // end comparison at this point 4237 orl(result, 63); 4238 movl(len, 63); 4239 // Fallthru to tail compare 4240 } else { 4241 4242 if (UseAVX >= 2 && UseSSE >= 2) { 4243 // With AVX2, use 32-byte vector compare 4244 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4245 4246 // Compare 32-byte vectors 4247 testl(len, 0xffffffe0); // vector count (in bytes) 4248 jccb(Assembler::zero, TAIL_START); 4249 4250 andl(len, 0xffffffe0); 4251 lea(ary1, Address(ary1, len, Address::times_1)); 4252 negptr(len); 4253 4254 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4255 movdl(vec2, tmp1); 4256 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4257 4258 bind(COMPARE_WIDE_VECTORS); 4259 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4260 vptest(vec1, vec2); 4261 jccb(Assembler::notZero, BREAK_LOOP); 4262 addptr(len, 32); 4263 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4264 4265 testl(result, 0x0000001f); // any bytes remaining? 4266 jcc(Assembler::zero, DONE); 4267 4268 // Quick test using the already prepared vector mask 4269 movl(len, result); 4270 andl(len, 0x0000001f); 4271 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4272 vptest(vec1, vec2); 4273 jcc(Assembler::zero, DONE); 4274 // There are zeros, jump to the tail to determine exactly where 4275 jmpb(TAIL_START); 4276 4277 bind(BREAK_LOOP); 4278 // At least one byte in the last 32-byte vector is negative. 
4279 // Set up to look at the last 32 bytes as if they were a tail 4280 lea(ary1, Address(ary1, len, Address::times_1)); 4281 addptr(result, len); 4282 // Ignore the very last byte: if all others are positive, 4283 // it must be negative, so we can skip right to the 2+1 byte 4284 // end comparison at this point 4285 orl(result, 31); 4286 movl(len, 31); 4287 // Fallthru to tail compare 4288 } else if (UseSSE42Intrinsics) { 4289 // With SSE4.2, use double quad vector compare 4290 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4291 4292 // Compare 16-byte vectors 4293 testl(len, 0xfffffff0); // vector count (in bytes) 4294 jcc(Assembler::zero, TAIL_START); 4295 4296 andl(len, 0xfffffff0); 4297 lea(ary1, Address(ary1, len, Address::times_1)); 4298 negptr(len); 4299 4300 movl(tmp1, 0x80808080); 4301 movdl(vec2, tmp1); 4302 pshufd(vec2, vec2, 0); 4303 4304 bind(COMPARE_WIDE_VECTORS); 4305 movdqu(vec1, Address(ary1, len, Address::times_1)); 4306 ptest(vec1, vec2); 4307 jccb(Assembler::notZero, BREAK_LOOP); 4308 addptr(len, 16); 4309 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4310 4311 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4312 jcc(Assembler::zero, DONE); 4313 4314 // Quick test using the already prepared vector mask 4315 movl(len, result); 4316 andl(len, 0x0000000f); // tail count (in bytes) 4317 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4318 ptest(vec1, vec2); 4319 jcc(Assembler::zero, DONE); 4320 jmpb(TAIL_START); 4321 4322 bind(BREAK_LOOP); 4323 // At least one byte in the last 16-byte vector is negative. 4324 // Set up and look at the last 16 bytes as if they were a tail 4325 lea(ary1, Address(ary1, len, Address::times_1)); 4326 addptr(result, len); 4327 // Ignore the very last byte: if all others are positive, 4328 // it must be negative, so we can skip right to the 2+1 byte 4329 // end comparison at this point 4330 orl(result, 15); 4331 movl(len, 15); 4332 // Fallthru to tail compare 4333 } 4334 } 4335 4336 bind(TAIL_START); 4337 // Compare 4-byte vectors 4338 andl(len, 0xfffffffc); // vector count (in bytes) 4339 jccb(Assembler::zero, COMPARE_CHAR); 4340 4341 lea(ary1, Address(ary1, len, Address::times_1)); 4342 negptr(len); 4343 4344 bind(COMPARE_VECTORS); 4345 movl(tmp1, Address(ary1, len, Address::times_1)); 4346 andl(tmp1, 0x80808080); 4347 jccb(Assembler::notZero, TAIL_ADJUST); 4348 addptr(len, 4); 4349 jccb(Assembler::notZero, COMPARE_VECTORS); 4350 4351 // Compare trailing char (final 2-3 bytes), if any 4352 bind(COMPARE_CHAR); 4353 4354 testl(result, 0x2); // tail char 4355 jccb(Assembler::zero, COMPARE_BYTE); 4356 load_unsigned_short(tmp1, Address(ary1, 0)); 4357 andl(tmp1, 0x00008080); 4358 jccb(Assembler::notZero, CHAR_ADJUST); 4359 lea(ary1, Address(ary1, 2)); 4360 4361 bind(COMPARE_BYTE); 4362 testl(result, 0x1); // tail byte 4363 jccb(Assembler::zero, DONE); 4364 load_unsigned_byte(tmp1, Address(ary1, 0)); 4365 testl(tmp1, 0x00000080); 4366 jccb(Assembler::zero, DONE); 4367 subptr(result, 1); 4368 jmpb(DONE); 4369 4370 bind(TAIL_ADJUST); 4371 // there are negative bits in the last 4 byte block. 4372 // Adjust result and check the next three bytes 4373 addptr(result, len); 4374 orl(result, 3); 4375 lea(ary1, Address(ary1, len, Address::times_1)); 4376 jmpb(COMPARE_CHAR); 4377 4378 bind(CHAR_ADJUST); 4379 // We are looking at a char + optional byte tail, and found that one 4380 // of the bytes in the char is negative. Adjust the result, check the 4381 // first byte and readjust if needed. 
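  // tmp1 still holds the 16-bit load of the char: on little-endian x86 the
  // 0x80 bit tested below belongs to the byte at the lower address, i.e. the
  // first of the two tail bytes.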
4382 andl(result, 0xfffffffc); 4383 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4384 jccb(Assembler::notZero, DONE); 4385 addptr(result, 1); 4386 4387 // That's it 4388 bind(DONE); 4389 if (UseAVX >= 2 && UseSSE >= 2) { 4390 // clean upper bits of YMM registers 4391 vpxor(vec1, vec1); 4392 vpxor(vec2, vec2); 4393 } 4394 } 4395 4396 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4397 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4398 Register limit, Register result, Register chr, 4399 XMMRegister vec1, XMMRegister vec2, bool is_char, 4400 KRegister mask, bool expand_ary2) { 4401 // for expand_ary2, limit is the (smaller) size of the second array. 4402 ShortBranchVerifier sbv(this); 4403 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4404 4405 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4406 "Expansion only implemented for AVX2"); 4407 4408 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4409 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4410 4411 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4412 int scaleIncr = expand_ary2 ? 8 : 16; 4413 4414 if (is_array_equ) { 4415 // Check the input args 4416 cmpoop(ary1, ary2); 4417 jcc(Assembler::equal, TRUE_LABEL); 4418 4419 // Need additional checks for arrays_equals. 4420 testptr(ary1, ary1); 4421 jcc(Assembler::zero, FALSE_LABEL); 4422 testptr(ary2, ary2); 4423 jcc(Assembler::zero, FALSE_LABEL); 4424 4425 // Check the lengths 4426 movl(limit, Address(ary1, length_offset)); 4427 cmpl(limit, Address(ary2, length_offset)); 4428 jcc(Assembler::notEqual, FALSE_LABEL); 4429 } 4430 4431 // count == 0 4432 testl(limit, limit); 4433 jcc(Assembler::zero, TRUE_LABEL); 4434 4435 if (is_array_equ) { 4436 // Load array address 4437 lea(ary1, Address(ary1, base_offset)); 4438 lea(ary2, Address(ary2, base_offset)); 4439 } 4440 4441 if (is_array_equ && is_char) { 4442 // arrays_equals when used for char[]. 
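  // char[] lengths count 2-byte elements while the compare loops below work
  // in bytes, so the limit is doubled first.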
4443 shll(limit, 1); // byte count != 0 4444 } 4445 movl(result, limit); // copy 4446 4447 if (UseAVX >= 2) { 4448 // With AVX2, use 32-byte vector compare 4449 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4450 4451 // Compare 32-byte vectors 4452 if (expand_ary2) { 4453 andl(result, 0x0000000f); // tail count (in bytes) 4454 andl(limit, 0xfffffff0); // vector count (in bytes) 4455 jcc(Assembler::zero, COMPARE_TAIL); 4456 } else { 4457 andl(result, 0x0000001f); // tail count (in bytes) 4458 andl(limit, 0xffffffe0); // vector count (in bytes) 4459 jcc(Assembler::zero, COMPARE_TAIL_16); 4460 } 4461 4462 lea(ary1, Address(ary1, limit, scaleFactor)); 4463 lea(ary2, Address(ary2, limit, Address::times_1)); 4464 negptr(limit); 4465 4466 #ifdef _LP64 4467 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4468 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4469 4470 cmpl(limit, -64); 4471 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4472 4473 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4474 4475 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4476 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4477 kortestql(mask, mask); 4478 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4479 addptr(limit, 64); // update since we already compared at this addr 4480 cmpl(limit, -64); 4481 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4482 4483 // At this point we may still need to compare -limit+result bytes. 4484 // We could execute the next two instruction and just continue via non-wide path: 4485 // cmpl(limit, 0); 4486 // jcc(Assembler::equal, COMPARE_TAIL); // true 4487 // But since we stopped at the points ary{1,2}+limit which are 4488 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4489 // (|limit| <= 32 and result < 32), 4490 // we may just compare the last 64 bytes. 
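  // For example, with 100-byte arrays the AVX3 loop above covers bytes
  // [0, 64) and this final 64-byte compare covers [36, 100); the overlap only
  // re-checks bytes that already compared equal.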
4491 // 4492 addptr(result, -64); // it is safe, bc we just came from this area 4493 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4494 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4495 kortestql(mask, mask); 4496 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4497 4498 jmp(TRUE_LABEL); 4499 4500 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4501 4502 }//if (VM_Version::supports_avx512vlbw()) 4503 #endif //_LP64 4504 bind(COMPARE_WIDE_VECTORS); 4505 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4506 if (expand_ary2) { 4507 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4508 } else { 4509 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4510 } 4511 vpxor(vec1, vec2); 4512 4513 vptest(vec1, vec1); 4514 jcc(Assembler::notZero, FALSE_LABEL); 4515 addptr(limit, scaleIncr * 2); 4516 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4517 4518 testl(result, result); 4519 jcc(Assembler::zero, TRUE_LABEL); 4520 4521 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4522 if (expand_ary2) { 4523 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4524 } else { 4525 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4526 } 4527 vpxor(vec1, vec2); 4528 4529 vptest(vec1, vec1); 4530 jcc(Assembler::notZero, FALSE_LABEL); 4531 jmp(TRUE_LABEL); 4532 4533 bind(COMPARE_TAIL_16); // limit is zero 4534 movl(limit, result); 4535 4536 // Compare 16-byte chunks 4537 andl(result, 0x0000000f); // tail count (in bytes) 4538 andl(limit, 0xfffffff0); // vector count (in bytes) 4539 jcc(Assembler::zero, COMPARE_TAIL); 4540 4541 lea(ary1, Address(ary1, limit, scaleFactor)); 4542 lea(ary2, Address(ary2, limit, Address::times_1)); 4543 negptr(limit); 4544 4545 bind(COMPARE_WIDE_VECTORS_16); 4546 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4547 if (expand_ary2) { 4548 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4549 } else { 4550 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4551 } 4552 pxor(vec1, vec2); 4553 4554 ptest(vec1, vec1); 4555 jcc(Assembler::notZero, FALSE_LABEL); 4556 addptr(limit, scaleIncr); 4557 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4558 4559 bind(COMPARE_TAIL); // limit is zero 4560 movl(limit, result); 4561 // Fallthru to tail compare 4562 } else if (UseSSE42Intrinsics) { 4563 // With SSE4.2, use double quad vector compare 4564 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4565 4566 // Compare 16-byte vectors 4567 andl(result, 0x0000000f); // tail count (in bytes) 4568 andl(limit, 0xfffffff0); // vector count (in bytes) 4569 jcc(Assembler::zero, COMPARE_TAIL); 4570 4571 lea(ary1, Address(ary1, limit, Address::times_1)); 4572 lea(ary2, Address(ary2, limit, Address::times_1)); 4573 negptr(limit); 4574 4575 bind(COMPARE_WIDE_VECTORS); 4576 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4577 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4578 pxor(vec1, vec2); 4579 4580 ptest(vec1, vec1); 4581 jcc(Assembler::notZero, FALSE_LABEL); 4582 addptr(limit, 16); 4583 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4584 4585 testl(result, result); 4586 jcc(Assembler::zero, TRUE_LABEL); 4587 4588 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4589 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4590 pxor(vec1, vec2); 4591 4592 ptest(vec1, vec1); 4593 jccb(Assembler::notZero, FALSE_LABEL); 4594 jmpb(TRUE_LABEL); 4595 4596 bind(COMPARE_TAIL); // limit is zero 4597 
movl(limit, result); 4598 // Fallthru to tail compare 4599 } 4600 4601 // Compare 4-byte vectors 4602 if (expand_ary2) { 4603 testl(result, result); 4604 jccb(Assembler::zero, TRUE_LABEL); 4605 } else { 4606 andl(limit, 0xfffffffc); // vector count (in bytes) 4607 jccb(Assembler::zero, COMPARE_CHAR); 4608 } 4609 4610 lea(ary1, Address(ary1, limit, scaleFactor)); 4611 lea(ary2, Address(ary2, limit, Address::times_1)); 4612 negptr(limit); 4613 4614 bind(COMPARE_VECTORS); 4615 if (expand_ary2) { 4616 // There are no "vector" operations for bytes to shorts 4617 movzbl(chr, Address(ary2, limit, Address::times_1)); 4618 cmpw(Address(ary1, limit, Address::times_2), chr); 4619 jccb(Assembler::notEqual, FALSE_LABEL); 4620 addptr(limit, 1); 4621 jcc(Assembler::notZero, COMPARE_VECTORS); 4622 jmp(TRUE_LABEL); 4623 } else { 4624 movl(chr, Address(ary1, limit, Address::times_1)); 4625 cmpl(chr, Address(ary2, limit, Address::times_1)); 4626 jccb(Assembler::notEqual, FALSE_LABEL); 4627 addptr(limit, 4); 4628 jcc(Assembler::notZero, COMPARE_VECTORS); 4629 } 4630 4631 // Compare trailing char (final 2 bytes), if any 4632 bind(COMPARE_CHAR); 4633 testl(result, 0x2); // tail char 4634 jccb(Assembler::zero, COMPARE_BYTE); 4635 load_unsigned_short(chr, Address(ary1, 0)); 4636 load_unsigned_short(limit, Address(ary2, 0)); 4637 cmpl(chr, limit); 4638 jccb(Assembler::notEqual, FALSE_LABEL); 4639 4640 if (is_array_equ && is_char) { 4641 bind(COMPARE_BYTE); 4642 } else { 4643 lea(ary1, Address(ary1, 2)); 4644 lea(ary2, Address(ary2, 2)); 4645 4646 bind(COMPARE_BYTE); 4647 testl(result, 0x1); // tail byte 4648 jccb(Assembler::zero, TRUE_LABEL); 4649 load_unsigned_byte(chr, Address(ary1, 0)); 4650 load_unsigned_byte(limit, Address(ary2, 0)); 4651 cmpl(chr, limit); 4652 jccb(Assembler::notEqual, FALSE_LABEL); 4653 } 4654 bind(TRUE_LABEL); 4655 movl(result, 1); // return true 4656 jmpb(DONE); 4657 4658 bind(FALSE_LABEL); 4659 xorl(result, result); // return false 4660 4661 // That's it 4662 bind(DONE); 4663 if (UseAVX >= 2) { 4664 // clean upper bits of YMM registers 4665 vpxor(vec1, vec1); 4666 vpxor(vec2, vec2); 4667 } 4668 } 4669 4670 #ifdef _LP64 4671 4672 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4673 #define __ masm. 
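  // Slow path for convertF2I: it is entered when the truncating conversion
  // produced the 'integer indefinite' value (0x80000000 / 0x8000000000000000),
  // which the hardware returns for NaN and out-of-range inputs. The source
  // value is passed on the stack, and the fixup routine is assumed to leave
  // the corrected result in that same slot, which is then popped into dst.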
4674 Register dst = stub.data<0>(); 4675 XMMRegister src = stub.data<1>(); 4676 address target = stub.data<2>(); 4677 __ bind(stub.entry()); 4678 __ subptr(rsp, 8); 4679 __ movdbl(Address(rsp), src); 4680 __ call(RuntimeAddress(target)); 4681 __ pop(dst); 4682 __ jmp(stub.continuation()); 4683 #undef __ 4684 } 4685 4686 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4687 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4688 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4689 4690 address slowpath_target; 4691 if (dst_bt == T_INT) { 4692 if (src_bt == T_FLOAT) { 4693 cvttss2sil(dst, src); 4694 cmpl(dst, 0x80000000); 4695 slowpath_target = StubRoutines::x86::f2i_fixup(); 4696 } else { 4697 cvttsd2sil(dst, src); 4698 cmpl(dst, 0x80000000); 4699 slowpath_target = StubRoutines::x86::d2i_fixup(); 4700 } 4701 } else { 4702 if (src_bt == T_FLOAT) { 4703 cvttss2siq(dst, src); 4704 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4705 slowpath_target = StubRoutines::x86::f2l_fixup(); 4706 } else { 4707 cvttsd2siq(dst, src); 4708 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4709 slowpath_target = StubRoutines::x86::d2l_fixup(); 4710 } 4711 } 4712 4713 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4714 jcc(Assembler::equal, stub->entry()); 4715 bind(stub->continuation()); 4716 } 4717 4718 #endif // _LP64 4719 4720 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4721 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4722 switch(ideal_opc) { 4723 case Op_LShiftVS: 4724 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4725 case Op_LShiftVI: 4726 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4727 case Op_LShiftVL: 4728 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4729 case Op_RShiftVS: 4730 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4731 case Op_RShiftVI: 4732 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4733 case Op_RShiftVL: 4734 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4735 case Op_URShiftVS: 4736 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4737 case Op_URShiftVI: 4738 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4739 case Op_URShiftVL: 4740 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4741 case Op_RotateRightV: 4742 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4743 case Op_RotateLeftV: 4744 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4745 default: 4746 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4747 break; 4748 } 4749 } 4750 4751 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4752 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4753 if (is_unsigned) { 4754 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4755 } else { 4756 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4757 } 4758 } 4759 4760 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4761 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4762 switch (elem_bt) { 4763 case T_BYTE: 4764 if (ideal_opc 
== Op_SaturatingAddV) { 4765 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4766 } else { 4767 assert(ideal_opc == Op_SaturatingSubV, ""); 4768 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4769 } 4770 break; 4771 case T_SHORT: 4772 if (ideal_opc == Op_SaturatingAddV) { 4773 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4774 } else { 4775 assert(ideal_opc == Op_SaturatingSubV, ""); 4776 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4777 } 4778 break; 4779 default: 4780 fatal("Unsupported type %s", type2name(elem_bt)); 4781 break; 4782 } 4783 } 4784 4785 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4786 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4787 switch (elem_bt) { 4788 case T_BYTE: 4789 if (ideal_opc == Op_SaturatingAddV) { 4790 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4791 } else { 4792 assert(ideal_opc == Op_SaturatingSubV, ""); 4793 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4794 } 4795 break; 4796 case T_SHORT: 4797 if (ideal_opc == Op_SaturatingAddV) { 4798 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4799 } else { 4800 assert(ideal_opc == Op_SaturatingSubV, ""); 4801 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4802 } 4803 break; 4804 default: 4805 fatal("Unsupported type %s", type2name(elem_bt)); 4806 break; 4807 } 4808 } 4809 4810 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4811 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4812 if (is_unsigned) { 4813 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4814 } else { 4815 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4816 } 4817 } 4818 4819 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4820 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4821 switch (elem_bt) { 4822 case T_BYTE: 4823 if (ideal_opc == Op_SaturatingAddV) { 4824 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4825 } else { 4826 assert(ideal_opc == Op_SaturatingSubV, ""); 4827 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4828 } 4829 break; 4830 case T_SHORT: 4831 if (ideal_opc == Op_SaturatingAddV) { 4832 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4833 } else { 4834 assert(ideal_opc == Op_SaturatingSubV, ""); 4835 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4836 } 4837 break; 4838 default: 4839 fatal("Unsupported type %s", type2name(elem_bt)); 4840 break; 4841 } 4842 } 4843 4844 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4845 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4846 switch (elem_bt) { 4847 case T_BYTE: 4848 if (ideal_opc == Op_SaturatingAddV) { 4849 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4850 } else { 4851 assert(ideal_opc == Op_SaturatingSubV, ""); 4852 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4853 } 4854 break; 4855 case T_SHORT: 4856 if (ideal_opc == Op_SaturatingAddV) { 4857 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4858 } else { 4859 assert(ideal_opc == Op_SaturatingSubV, ""); 4860 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4861 } 4862 break; 4863 default: 4864 fatal("Unsupported type %s", type2name(elem_bt)); 4865 break; 4866 } 4867 } 4868 4869 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4870 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4871 bool is_varshift) { 4872 switch (ideal_opc) { 4873 case Op_AddVB: 4874 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4875 case Op_AddVS: 4876 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4877 case Op_AddVI: 4878 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_AddVL: 4880 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4881 case Op_AddVF: 4882 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_AddVD: 4884 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4885 case Op_SubVB: 4886 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4887 case Op_SubVS: 4888 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4889 case Op_SubVI: 4890 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4891 case Op_SubVL: 4892 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4893 case Op_SubVF: 4894 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4895 case Op_SubVD: 4896 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4897 case Op_MulVS: 4898 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4899 case Op_MulVI: 4900 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4901 case Op_MulVL: 4902 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4903 case Op_MulVF: 4904 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4905 case Op_MulVD: 4906 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4907 case Op_DivVF: 4908 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4909 case Op_DivVD: 4910 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4911 case Op_SqrtVF: 4912 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4913 case Op_SqrtVD: 4914 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4915 case Op_AbsVB: 4916 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4917 case Op_AbsVS: 4918 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4919 case Op_AbsVI: 4920 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4921 case Op_AbsVL: 4922 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4923 case Op_FmaVF: 4924 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4925 case Op_FmaVD: 4926 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4927 case Op_VectorRearrange: 4928 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4929 case Op_LShiftVS: 4930 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4931 case Op_LShiftVI: 4932 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4933 case Op_LShiftVL: 4934 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4935 case Op_RShiftVS: 4936 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4937 case Op_RShiftVI: 4938 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4939 case Op_RShiftVL: 4940 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4941 case Op_URShiftVS: 4942 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4943 case Op_URShiftVI: 4944 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4945 case Op_URShiftVL: 4946 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4947 case Op_RotateLeftV: 4948 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4949 case Op_RotateRightV: 4950 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4951 case Op_MaxV: 4952 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_MinV: 4954 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_UMinV: 4956 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4957 case Op_UMaxV: 4958 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_XorV: 4960 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_OrV: 4962 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_AndV: 4964 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4965 default: 4966 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4967 break; 4968 } 4969 } 4970 4971 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4972 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4973 switch (ideal_opc) { 4974 case Op_AddVB: 4975 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4976 case Op_AddVS: 4977 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4978 case Op_AddVI: 4979 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4980 case Op_AddVL: 4981 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4982 case Op_AddVF: 4983 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4984 case Op_AddVD: 4985 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4986 case Op_SubVB: 4987 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4988 case Op_SubVS: 4989 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4990 case Op_SubVI: 4991 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4992 case Op_SubVL: 4993 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4994 case Op_SubVF: 4995 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4996 case Op_SubVD: 4997 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4998 case Op_MulVS: 4999 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 5000 case Op_MulVI: 5001 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 5002 case Op_MulVL: 5003 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 5004 case Op_MulVF: 5005 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 5006 case Op_MulVD: 5007 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 5008 case Op_DivVF: 5009 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 5010 case Op_DivVD: 5011 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 5012 case Op_FmaVF: 5013 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 5014 case Op_FmaVD: 5015 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 5016 case Op_MaxV: 5017 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5018 case Op_MinV: 5019 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5020 case Op_UMaxV: 5021 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5022 case Op_UMinV: 5023 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5024 case Op_XorV: 5025 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5026 case Op_OrV: 5027 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5028 case Op_AndV: 5029 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5030 default: 5031 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5032 break; 5033 } 5034 } 5035 5036 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5037 KRegister src1, KRegister src2) { 5038 BasicType etype = T_ILLEGAL; 5039 switch(mask_len) { 5040 case 2: 5041 case 4: 5042 case 8: etype = T_BYTE; 
break; 5043 case 16: etype = T_SHORT; break; 5044 case 32: etype = T_INT; break; 5045 case 64: etype = T_LONG; break; 5046 default: fatal("Unsupported type"); break; 5047 } 5048 assert(etype != T_ILLEGAL, ""); 5049 switch(ideal_opc) { 5050 case Op_AndVMask: 5051 kand(etype, dst, src1, src2); break; 5052 case Op_OrVMask: 5053 kor(etype, dst, src1, src2); break; 5054 case Op_XorVMask: 5055 kxor(etype, dst, src1, src2); break; 5056 default: 5057 fatal("Unsupported masked operation"); break; 5058 } 5059 } 5060 5061 /* 5062 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5063 * If src is NaN, the result is 0. 5064 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5065 * the result is equal to the value of Integer.MIN_VALUE. 5066 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5067 * the result is equal to the value of Integer.MAX_VALUE. 5068 */ 5069 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5070 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5071 Register rscratch, AddressLiteral float_sign_flip, 5072 int vec_enc) { 5073 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5074 Label done; 5075 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5076 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5077 vptest(xtmp2, xtmp2, vec_enc); 5078 jccb(Assembler::equal, done); 5079 5080 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5081 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5082 5083 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5084 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5085 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5086 5087 // Recompute the mask for remaining special value. 5088 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5089 // Extract SRC values corresponding to TRUE mask lanes. 5090 vpand(xtmp4, xtmp2, src, vec_enc); 5091 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5092 // values are set. 
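// (xtmp2 is all-ones exactly on the remaining special lanes, so XOR-ing it with
// xtmp4 == (xtmp2 & src) yields ~src on those lanes and 0 elsewhere; the sign bit
// of the result is therefore set only where the special lane came from a
// non-negative source.)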
5093 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5094 5095 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5096 bind(done); 5097 } 5098 5099 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5100 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5101 Register rscratch, AddressLiteral float_sign_flip, 5102 int vec_enc) { 5103 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5104 Label done; 5105 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5106 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5107 kortestwl(ktmp1, ktmp1); 5108 jccb(Assembler::equal, done); 5109 5110 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5111 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5112 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5113 5114 kxorwl(ktmp1, ktmp1, ktmp2); 5115 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5116 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5117 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5118 bind(done); 5119 } 5120 5121 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5122 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5123 Register rscratch, AddressLiteral double_sign_flip, 5124 int vec_enc) { 5125 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5126 5127 Label done; 5128 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5129 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5130 kortestwl(ktmp1, ktmp1); 5131 jccb(Assembler::equal, done); 5132 5133 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5134 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5135 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5136 5137 kxorwl(ktmp1, ktmp1, ktmp2); 5138 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5139 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5140 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5141 bind(done); 5142 } 5143 5144 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5145 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5146 Register rscratch, AddressLiteral float_sign_flip, 5147 int vec_enc) { 5148 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5149 Label done; 5150 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5151 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5152 kortestwl(ktmp1, ktmp1); 5153 jccb(Assembler::equal, done); 5154 5155 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5156 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5157 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5158 5159 kxorwl(ktmp1, ktmp1, ktmp2); 5160 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5161 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5162 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5163 bind(done); 5164 } 5165 5166 /* 5167 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5168 * If src is NaN, the result is 0. 5169 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5170 * the result is equal to the value of Long.MIN_VALUE. 5171 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5172 * the result is equal to the value of Long.MAX_VALUE. 
5173 */
5174 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5175 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5176 Register rscratch, AddressLiteral double_sign_flip,
5177 int vec_enc) {
5178 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5179
5180 Label done;
5181 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5182 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5183 kortestwl(ktmp1, ktmp1);
5184 jccb(Assembler::equal, done);
5185
5186 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5187 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5188 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5189
5190 kxorwl(ktmp1, ktmp1, ktmp2);
5191 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5192 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5193 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5194 bind(done);
5195 }
5196
5197 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5198 XMMRegister xtmp, int index, int vec_enc) {
5199 assert(vec_enc < Assembler::AVX_512bit, "");
5200 if (vec_enc == Assembler::AVX_256bit) {
5201 vextractf128_high(xtmp, src);
5202 vshufps(dst, src, xtmp, index, vec_enc);
5203 } else {
5204 vshufps(dst, src, zero, index, vec_enc);
5205 }
5206 }
5207
5208 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5209 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5210 AddressLiteral float_sign_flip, int src_vec_enc) {
5211 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5212
5213 Label done;
5214 // Compare the destination lanes with float_sign_flip
5215 // value to get mask for all special values.
5216 movdqu(xtmp1, float_sign_flip, rscratch);
5217 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5218 ptest(xtmp2, xtmp2);
5219 jccb(Assembler::equal, done);
5220
5221 // Flip float_sign_flip to get max integer value.
5222 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5223 pxor(xtmp1, xtmp4);
5224
5225 // Set destination lanes corresponding to unordered source lanes as zero.
5226 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5227 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5228
5229 // Shuffle mask vector and pack the lower double word from each quadword lane.
5230 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5231 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5232
5233 // Recompute the mask for the remaining special values.
5234 pxor(xtmp2, xtmp3);
5235 // Extract mask corresponding to non-negative source lanes.
5236 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5237
5238 // Shuffle mask vector and pack the lower double word from each quadword lane.
5239 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5240 pand(xtmp3, xtmp2);
5241
5242 // Replace destination lanes holding the special value (0x80000000) with max int
5243 // if the corresponding source lane holds a +ve value.
5244 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5245 bind(done); 5246 } 5247 5248 5249 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5250 XMMRegister xtmp, Register rscratch, int vec_enc) { 5251 switch(to_elem_bt) { 5252 case T_SHORT: 5253 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5254 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5255 vpackusdw(dst, dst, zero, vec_enc); 5256 if (vec_enc == Assembler::AVX_256bit) { 5257 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5258 } 5259 break; 5260 case T_BYTE: 5261 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5262 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5263 vpackusdw(dst, dst, zero, vec_enc); 5264 if (vec_enc == Assembler::AVX_256bit) { 5265 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5266 } 5267 vpackuswb(dst, dst, zero, vec_enc); 5268 break; 5269 default: assert(false, "%s", type2name(to_elem_bt)); 5270 } 5271 } 5272 5273 /* 5274 * Algorithm for vector D2L and F2I conversions:- 5275 * a) Perform vector D2L/F2I cast. 5276 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5277 * It signifies that source value could be any of the special floating point 5278 * values(NaN,-Inf,Inf,Max,-Min). 5279 * c) Set destination to zero if source is NaN value. 5280 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5281 */ 5282 5283 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5284 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5285 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5286 int to_elem_sz = type2aelembytes(to_elem_bt); 5287 assert(to_elem_sz <= 4, ""); 5288 vcvttps2dq(dst, src, vec_enc); 5289 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5290 if (to_elem_sz < 4) { 5291 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5292 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5293 } 5294 } 5295 5296 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5297 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5298 Register rscratch, int vec_enc) { 5299 int to_elem_sz = type2aelembytes(to_elem_bt); 5300 assert(to_elem_sz <= 4, ""); 5301 vcvttps2dq(dst, src, vec_enc); 5302 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5303 switch(to_elem_bt) { 5304 case T_INT: 5305 break; 5306 case T_SHORT: 5307 evpmovdw(dst, dst, vec_enc); 5308 break; 5309 case T_BYTE: 5310 evpmovdb(dst, dst, vec_enc); 5311 break; 5312 default: assert(false, "%s", type2name(to_elem_bt)); 5313 } 5314 } 5315 5316 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5317 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5318 Register rscratch, int vec_enc) { 5319 evcvttps2qq(dst, src, vec_enc); 5320 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5321 } 5322 5323 // Handling for downcasting from double to integer or sub-word types on AVX2. 5324 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5325 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5326 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5327 int to_elem_sz = type2aelembytes(to_elem_bt); 5328 assert(to_elem_sz < 8, ""); 5329 vcvttpd2dq(dst, src, vec_enc); 5330 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5331 float_sign_flip, vec_enc); 5332 if (to_elem_sz < 4) { 5333 // xtmp4 holds all zero lanes. 5334 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5335 } 5336 } 5337 5338 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5339 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5340 KRegister ktmp2, AddressLiteral sign_flip, 5341 Register rscratch, int vec_enc) { 5342 if (VM_Version::supports_avx512dq()) { 5343 evcvttpd2qq(dst, src, vec_enc); 5344 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5345 switch(to_elem_bt) { 5346 case T_LONG: 5347 break; 5348 case T_INT: 5349 evpmovsqd(dst, dst, vec_enc); 5350 break; 5351 case T_SHORT: 5352 evpmovsqd(dst, dst, vec_enc); 5353 evpmovdw(dst, dst, vec_enc); 5354 break; 5355 case T_BYTE: 5356 evpmovsqd(dst, dst, vec_enc); 5357 evpmovdb(dst, dst, vec_enc); 5358 break; 5359 default: assert(false, "%s", type2name(to_elem_bt)); 5360 } 5361 } else { 5362 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5363 vcvttpd2dq(dst, src, vec_enc); 5364 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5365 switch(to_elem_bt) { 5366 case T_INT: 5367 break; 5368 case T_SHORT: 5369 evpmovdw(dst, dst, vec_enc); 5370 break; 5371 case T_BYTE: 5372 evpmovdb(dst, dst, vec_enc); 5373 break; 5374 default: assert(false, "%s", type2name(to_elem_bt)); 5375 } 5376 } 5377 } 5378 5379 #ifdef _LP64 5380 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5381 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5382 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5383 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5384 // and re-instantiate original MXCSR.RC mode after that. 5385 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5386 5387 mov64(tmp, julong_cast(0.5L)); 5388 evpbroadcastq(xtmp1, tmp, vec_enc); 5389 vaddpd(xtmp1, src , xtmp1, vec_enc); 5390 evcvtpd2qq(dst, xtmp1, vec_enc); 5391 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5392 double_sign_flip, vec_enc);; 5393 5394 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5395 } 5396 5397 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5398 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5399 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5400 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5401 // and re-instantiate original MXCSR.RC mode after that. 
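// Quick worked example of the identity used here: with RC forced to round towards
// negative infinity the conversion behaves like floor(), so val = 2.6 becomes
// floor(2.6 + 0.5) = 3 and val = -2.5 becomes floor(-2.0) = -2, matching Java's
// round-half-up semantics.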
5402 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5403 5404 movl(tmp, jint_cast(0.5)); 5405 movq(xtmp1, tmp); 5406 vbroadcastss(xtmp1, xtmp1, vec_enc); 5407 vaddps(xtmp1, src , xtmp1, vec_enc); 5408 vcvtps2dq(dst, xtmp1, vec_enc); 5409 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5410 float_sign_flip, vec_enc); 5411 5412 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5413 } 5414 5415 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5416 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5417 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5418 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5419 // and re-instantiate original MXCSR.RC mode after that. 5420 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5421 5422 movl(tmp, jint_cast(0.5)); 5423 movq(xtmp1, tmp); 5424 vbroadcastss(xtmp1, xtmp1, vec_enc); 5425 vaddps(xtmp1, src , xtmp1, vec_enc); 5426 vcvtps2dq(dst, xtmp1, vec_enc); 5427 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5428 5429 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5430 } 5431 #endif // _LP64 5432 5433 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5434 BasicType from_elem_bt, BasicType to_elem_bt) { 5435 switch (from_elem_bt) { 5436 case T_BYTE: 5437 switch (to_elem_bt) { 5438 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5439 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5440 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5441 default: ShouldNotReachHere(); 5442 } 5443 break; 5444 case T_SHORT: 5445 switch (to_elem_bt) { 5446 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5447 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5448 default: ShouldNotReachHere(); 5449 } 5450 break; 5451 case T_INT: 5452 assert(to_elem_bt == T_LONG, ""); 5453 vpmovzxdq(dst, src, vlen_enc); 5454 break; 5455 default: 5456 ShouldNotReachHere(); 5457 } 5458 } 5459 5460 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5461 BasicType from_elem_bt, BasicType to_elem_bt) { 5462 switch (from_elem_bt) { 5463 case T_BYTE: 5464 switch (to_elem_bt) { 5465 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5466 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5467 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5468 default: ShouldNotReachHere(); 5469 } 5470 break; 5471 case T_SHORT: 5472 switch (to_elem_bt) { 5473 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5474 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5475 default: ShouldNotReachHere(); 5476 } 5477 break; 5478 case T_INT: 5479 assert(to_elem_bt == T_LONG, ""); 5480 vpmovsxdq(dst, src, vlen_enc); 5481 break; 5482 default: 5483 ShouldNotReachHere(); 5484 } 5485 } 5486 5487 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5488 BasicType dst_bt, BasicType src_bt, int vlen) { 5489 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5490 assert(vlen_enc != AVX_512bit, ""); 5491 5492 int dst_bt_size = type2aelembytes(dst_bt); 5493 int src_bt_size = type2aelembytes(src_bt); 5494 if (dst_bt_size > src_bt_size) { 5495 switch (dst_bt_size / src_bt_size) { 5496 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5497 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5498 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5499 default: ShouldNotReachHere(); 5500 } 5501 } else { 5502 assert(dst_bt_size < src_bt_size, ""); 5503 switch (src_bt_size / dst_bt_size) { 5504 case 2: { 5505 if (vlen_enc == AVX_128bit) { 5506 vpacksswb(dst, src, src, vlen_enc); 5507 } else { 5508 vpacksswb(dst, src, src, vlen_enc); 5509 vpermq(dst, dst, 0x08, vlen_enc); 5510 } 5511 break; 5512 } 5513 case 4: { 5514 if (vlen_enc == AVX_128bit) { 5515 vpackssdw(dst, src, src, vlen_enc); 5516 vpacksswb(dst, dst, dst, vlen_enc); 5517 } else { 5518 vpackssdw(dst, src, src, vlen_enc); 5519 vpermq(dst, dst, 0x08, vlen_enc); 5520 vpacksswb(dst, dst, dst, AVX_128bit); 5521 } 5522 break; 5523 } 5524 case 8: { 5525 if (vlen_enc == AVX_128bit) { 5526 vpshufd(dst, src, 0x08, vlen_enc); 5527 vpackssdw(dst, dst, dst, vlen_enc); 5528 vpacksswb(dst, dst, dst, vlen_enc); 5529 } else { 5530 vpshufd(dst, src, 0x08, vlen_enc); 5531 vpermq(dst, dst, 0x08, vlen_enc); 5532 vpackssdw(dst, dst, dst, AVX_128bit); 5533 vpacksswb(dst, dst, dst, AVX_128bit); 5534 } 5535 break; 5536 } 5537 default: ShouldNotReachHere(); 5538 } 5539 } 5540 } 5541 5542 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5543 bool merge, BasicType bt, int vlen_enc) { 5544 if (bt == T_INT) { 5545 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5546 } else { 5547 assert(bt == T_LONG, ""); 5548 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5549 } 5550 } 5551 5552 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5553 bool merge, BasicType bt, int vlen_enc) { 5554 if (bt == T_INT) { 5555 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5556 } else { 5557 assert(bt == T_LONG, ""); 5558 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5559 } 5560 } 5561 5562 #ifdef _LP64 5563 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5564 Register rtmp2, XMMRegister xtmp, int mask_len, 5565 int vec_enc) { 5566 int index = 0; 5567 int vindex = 0; 5568 mov64(rtmp1, 0x0101010101010101L); 5569 pdepq(rtmp1, src, rtmp1); 5570 if (mask_len > 8) { 5571 movq(rtmp2, src); 5572 vpxor(xtmp, xtmp, xtmp, vec_enc); 5573 movq(xtmp, rtmp1); 5574 } 5575 movq(dst, rtmp1); 5576 5577 mask_len -= 8; 5578 while (mask_len > 0) { 5579 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5580 index++; 5581 if ((index % 2) == 0) { 5582 pxor(xtmp, xtmp); 5583 } 5584 mov64(rtmp1, 0x0101010101010101L); 5585 shrq(rtmp2, 8); 5586 pdepq(rtmp1, rtmp2, rtmp1); 5587 pinsrq(xtmp, rtmp1, index % 2); 5588 vindex = index / 2; 5589 if (vindex) { 5590 // Write entire 16 byte vector when both 64 bit 5591 // lanes are update to save redundant instructions. 
5592 if (index % 2) { 5593 vinsertf128(dst, dst, xtmp, vindex); 5594 } 5595 } else { 5596 vmovdqu(dst, xtmp); 5597 } 5598 mask_len -= 8; 5599 } 5600 } 5601 5602 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5603 switch(opc) { 5604 case Op_VectorMaskTrueCount: 5605 popcntq(dst, tmp); 5606 break; 5607 case Op_VectorMaskLastTrue: 5608 if (VM_Version::supports_lzcnt()) { 5609 lzcntq(tmp, tmp); 5610 movl(dst, 63); 5611 subl(dst, tmp); 5612 } else { 5613 movl(dst, -1); 5614 bsrq(tmp, tmp); 5615 cmov32(Assembler::notZero, dst, tmp); 5616 } 5617 break; 5618 case Op_VectorMaskFirstTrue: 5619 if (VM_Version::supports_bmi1()) { 5620 if (masklen < 32) { 5621 orl(tmp, 1 << masklen); 5622 tzcntl(dst, tmp); 5623 } else if (masklen == 32) { 5624 tzcntl(dst, tmp); 5625 } else { 5626 assert(masklen == 64, ""); 5627 tzcntq(dst, tmp); 5628 } 5629 } else { 5630 if (masklen < 32) { 5631 orl(tmp, 1 << masklen); 5632 bsfl(dst, tmp); 5633 } else { 5634 assert(masklen == 32 || masklen == 64, ""); 5635 movl(dst, masklen); 5636 if (masklen == 32) { 5637 bsfl(tmp, tmp); 5638 } else { 5639 bsfq(tmp, tmp); 5640 } 5641 cmov32(Assembler::notZero, dst, tmp); 5642 } 5643 } 5644 break; 5645 case Op_VectorMaskToLong: 5646 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5647 break; 5648 default: assert(false, "Unhandled mask operation"); 5649 } 5650 } 5651 5652 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5653 int masklen, int masksize, int vec_enc) { 5654 assert(VM_Version::supports_popcnt(), ""); 5655 5656 if(VM_Version::supports_avx512bw()) { 5657 kmovql(tmp, mask); 5658 } else { 5659 assert(masklen <= 16, ""); 5660 kmovwl(tmp, mask); 5661 } 5662 5663 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5664 // operations needs to be clipped. 5665 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5666 andq(tmp, (1 << masklen) - 1); 5667 } 5668 5669 vector_mask_operation_helper(opc, dst, tmp, masklen); 5670 } 5671 5672 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5673 Register tmp, int masklen, BasicType bt, int vec_enc) { 5674 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5675 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5676 assert(VM_Version::supports_popcnt(), ""); 5677 5678 bool need_clip = false; 5679 switch(bt) { 5680 case T_BOOLEAN: 5681 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5682 vpxor(xtmp, xtmp, xtmp, vec_enc); 5683 vpsubb(xtmp, xtmp, mask, vec_enc); 5684 vpmovmskb(tmp, xtmp, vec_enc); 5685 need_clip = masklen < 16; 5686 break; 5687 case T_BYTE: 5688 vpmovmskb(tmp, mask, vec_enc); 5689 need_clip = masklen < 16; 5690 break; 5691 case T_SHORT: 5692 vpacksswb(xtmp, mask, mask, vec_enc); 5693 if (masklen >= 16) { 5694 vpermpd(xtmp, xtmp, 8, vec_enc); 5695 } 5696 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5697 need_clip = masklen < 16; 5698 break; 5699 case T_INT: 5700 case T_FLOAT: 5701 vmovmskps(tmp, mask, vec_enc); 5702 need_clip = masklen < 4; 5703 break; 5704 case T_LONG: 5705 case T_DOUBLE: 5706 vmovmskpd(tmp, mask, vec_enc); 5707 need_clip = masklen < 2; 5708 break; 5709 default: assert(false, "Unhandled type, %s", type2name(bt)); 5710 } 5711 5712 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5713 // operations needs to be clipped. 
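// For example, vmovmskps on a 128-bit register always produces 4 bits even when
// only masklen == 2 lanes are meaningful, so the surplus bits are ANDed away below.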
5714 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5715 // need_clip implies masklen < 32 5716 andq(tmp, (1 << masklen) - 1); 5717 } 5718 5719 vector_mask_operation_helper(opc, dst, tmp, masklen); 5720 } 5721 5722 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5723 Register rtmp2, int mask_len) { 5724 kmov(rtmp1, src); 5725 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5726 mov64(rtmp2, -1L); 5727 pextq(rtmp2, rtmp2, rtmp1); 5728 kmov(dst, rtmp2); 5729 } 5730 5731 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5732 XMMRegister mask, Register rtmp, Register rscratch, 5733 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5734 int vec_enc) { 5735 assert(type2aelembytes(bt) >= 4, ""); 5736 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5737 address compress_perm_table = nullptr; 5738 address expand_perm_table = nullptr; 5739 if (type2aelembytes(bt) == 8) { 5740 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5741 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5742 vmovmskpd(rtmp, mask, vec_enc); 5743 } else { 5744 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5745 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5746 vmovmskps(rtmp, mask, vec_enc); 5747 } 5748 shlq(rtmp, 5); // for 32 byte permute row. 5749 if (opcode == Op_CompressV) { 5750 lea(rscratch, ExternalAddress(compress_perm_table)); 5751 } else { 5752 lea(rscratch, ExternalAddress(expand_perm_table)); 5753 } 5754 addptr(rtmp, rscratch); 5755 vmovdqu(permv, Address(rtmp)); 5756 vpermps(dst, permv, src, Assembler::AVX_256bit); 5757 vpxor(xtmp, xtmp, xtmp, vec_enc); 5758 // Blend the result with zero vector using permute mask, each column entry 5759 // in a permute table row contains either a valid permute index or a -1 (default) 5760 // value, this can potentially be used as a blending mask after 5761 // compressing/expanding the source vector lanes. 
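// (vblendvps keys on the sign bit of each permv lane: the -1 entries select the
// zero vector in xtmp, while valid permute indices keep the permuted source lane.)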
5762 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5763 } 5764 5765 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5766 bool merge, BasicType bt, int vec_enc) { 5767 if (opcode == Op_CompressV) { 5768 switch(bt) { 5769 case T_BYTE: 5770 evpcompressb(dst, mask, src, merge, vec_enc); 5771 break; 5772 case T_CHAR: 5773 case T_SHORT: 5774 evpcompressw(dst, mask, src, merge, vec_enc); 5775 break; 5776 case T_INT: 5777 evpcompressd(dst, mask, src, merge, vec_enc); 5778 break; 5779 case T_FLOAT: 5780 evcompressps(dst, mask, src, merge, vec_enc); 5781 break; 5782 case T_LONG: 5783 evpcompressq(dst, mask, src, merge, vec_enc); 5784 break; 5785 case T_DOUBLE: 5786 evcompresspd(dst, mask, src, merge, vec_enc); 5787 break; 5788 default: 5789 fatal("Unsupported type %s", type2name(bt)); 5790 break; 5791 } 5792 } else { 5793 assert(opcode == Op_ExpandV, ""); 5794 switch(bt) { 5795 case T_BYTE: 5796 evpexpandb(dst, mask, src, merge, vec_enc); 5797 break; 5798 case T_CHAR: 5799 case T_SHORT: 5800 evpexpandw(dst, mask, src, merge, vec_enc); 5801 break; 5802 case T_INT: 5803 evpexpandd(dst, mask, src, merge, vec_enc); 5804 break; 5805 case T_FLOAT: 5806 evexpandps(dst, mask, src, merge, vec_enc); 5807 break; 5808 case T_LONG: 5809 evpexpandq(dst, mask, src, merge, vec_enc); 5810 break; 5811 case T_DOUBLE: 5812 evexpandpd(dst, mask, src, merge, vec_enc); 5813 break; 5814 default: 5815 fatal("Unsupported type %s", type2name(bt)); 5816 break; 5817 } 5818 } 5819 } 5820 #endif 5821 5822 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5823 KRegister ktmp1, int vec_enc) { 5824 if (opcode == Op_SignumVD) { 5825 vsubpd(dst, zero, one, vec_enc); 5826 // if src < 0 ? -1 : 1 5827 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5828 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5829 // if src == NaN, -0.0 or 0.0 return src. 5830 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5831 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5832 } else { 5833 assert(opcode == Op_SignumVF, ""); 5834 vsubps(dst, zero, one, vec_enc); 5835 // if src < 0 ? -1 : 1 5836 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5837 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5838 // if src == NaN, -0.0 or 0.0 return src. 5839 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5840 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5841 } 5842 } 5843 5844 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5845 XMMRegister xtmp1, int vec_enc) { 5846 if (opcode == Op_SignumVD) { 5847 vsubpd(dst, zero, one, vec_enc); 5848 // if src < 0 ? -1 : 1 5849 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5850 // if src == NaN, -0.0 or 0.0 return src. 5851 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5852 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5853 } else { 5854 assert(opcode == Op_SignumVF, ""); 5855 vsubps(dst, zero, one, vec_enc); 5856 // if src < 0 ? -1 : 1 5857 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5858 // if src == NaN, -0.0 or 0.0 return src. 
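// EQ_UQ is the unordered, non-signalling equal predicate, so it is true both for
// +/-0.0 (which compare equal to zero) and for NaN (which compares unordered).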
5859 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5860 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5861 }
5862 }
5863
5864 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5865 if (VM_Version::supports_avx512bw()) {
5866 if (mask_len > 32) {
5867 kmovql(dst, src);
5868 } else {
5869 kmovdl(dst, src);
5870 if (mask_len != 32) {
5871 kshiftrdl(dst, dst, 32 - mask_len);
5872 }
5873 }
5874 } else {
5875 assert(mask_len <= 16, "");
5876 kmovwl(dst, src);
5877 if (mask_len != 16) {
5878 kshiftrwl(dst, dst, 16 - mask_len);
5879 }
5880 }
5881 }
5882
5883 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5884 int lane_size = type2aelembytes(bt);
5885 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5886 if ((is_LP64 || lane_size < 8) &&
5887 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5888 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5889 movptr(rtmp, imm32);
5890 switch(lane_size) {
5891 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5892 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5893 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5894 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5895 default : fatal("Unsupported lane size %d", lane_size);
5896 break;
5897 }
5898 } else {
5899 movptr(rtmp, imm32);
5900 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5901 switch(lane_size) {
5902 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5903 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5904 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5905 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5906 default : fatal("Unsupported lane size %d", lane_size);
5907 break;
5908 }
5909 }
5910 }
5911
5912 //
5913 // Following is lookup table based popcount computation algorithm:-
5914 // Index Bit set count
5915 // [ 0000 -> 0,
5916 // 0001 -> 1,
5917 // 0010 -> 1,
5918 // 0011 -> 2,
5919 // 0100 -> 1,
5920 // 0101 -> 2,
5921 // 0110 -> 2,
5922 // 0111 -> 3,
5923 // 1000 -> 1,
5924 // 1001 -> 2,
5925 // 1010 -> 2,
5926 // 1011 -> 3,
5927 // 1100 -> 2,
5928 // 1101 -> 3,
// 1110 -> 3,
5929 // 1111 -> 4 ]
5930 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5931 // shuffle indices for lookup table access.
5932 // b. Right shift each byte of vector lane by 4 positions.
5933 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5934 // shuffle indices for lookup table access.
5935 // d. Add the bitset count of upper and lower 4 bits of each byte.
5936 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5937 // count of all the bytes of a quadword.
5938 // f. Perform step e. for upper 128bit vector lane.
5939 // g. Pack the bitset count of quadwords back to double word.
5940 // h. Unpacking and packing operations are not needed for 64bit vector lane.
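// For reference, a scalar sketch of the same nibble-LUT popcount (steps a. to d.
// above). The block is documentation only (never compiled) and the names in it are
// local to this illustration, not part of the assembler.
#if 0
static inline int popcount_byte_lut(unsigned char b) {
  // Bit set count for every 4-bit index, i.e. the table above.
  static const unsigned char lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4 };
  return lut[b & 0x0F] + lut[b >> 4]; // low nibble count + high nibble count
}
#endif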
5941 5942 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5943 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5944 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5945 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5946 vpsrlw(dst, src, 4, vec_enc); 5947 vpand(dst, dst, xtmp1, vec_enc); 5948 vpand(xtmp1, src, xtmp1, vec_enc); 5949 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5950 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5951 vpshufb(dst, xtmp2, dst, vec_enc); 5952 vpaddb(dst, dst, xtmp1, vec_enc); 5953 } 5954 5955 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5956 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5957 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5958 // Following code is as per steps e,f,g and h of above algorithm. 5959 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5960 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5961 vpsadbw(dst, dst, xtmp2, vec_enc); 5962 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5963 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5964 vpackuswb(dst, xtmp1, dst, vec_enc); 5965 } 5966 5967 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5968 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5969 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5970 // Add the popcount of upper and lower bytes of word. 5971 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5972 vpsrlw(dst, xtmp1, 8, vec_enc); 5973 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5974 vpaddw(dst, dst, xtmp1, vec_enc); 5975 } 5976 5977 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5978 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5979 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5980 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5981 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5982 } 5983 5984 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5985 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5986 switch(bt) { 5987 case T_LONG: 5988 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5989 break; 5990 case T_INT: 5991 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5992 break; 5993 case T_CHAR: 5994 case T_SHORT: 5995 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5996 break; 5997 case T_BYTE: 5998 case T_BOOLEAN: 5999 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 6000 break; 6001 default: 6002 fatal("Unsupported type %s", type2name(bt)); 6003 break; 6004 } 6005 } 6006 6007 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6008 KRegister mask, bool merge, int vec_enc) { 6009 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6010 switch(bt) { 6011 case T_LONG: 6012 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6013 evpopcntq(dst, mask, src, merge, vec_enc); 6014 break; 6015 case T_INT: 6016 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6017 evpopcntd(dst, mask, src, merge, vec_enc); 6018 break; 6019 case T_CHAR: 6020 case T_SHORT: 6021 assert(VM_Version::supports_avx512_bitalg(), ""); 6022 evpopcntw(dst, mask, src, merge, vec_enc); 6023 break; 6024 case T_BYTE: 6025 case T_BOOLEAN: 6026 assert(VM_Version::supports_avx512_bitalg(), ""); 6027 evpopcntb(dst, mask, 
src, merge, vec_enc); 6028 break; 6029 default: 6030 fatal("Unsupported type %s", type2name(bt)); 6031 break; 6032 } 6033 } 6034 6035 #ifndef _LP64 6036 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 6037 assert(VM_Version::supports_avx512bw(), ""); 6038 kmovdl(tmp, src); 6039 kunpckdql(dst, tmp, tmp); 6040 } 6041 #endif 6042 6043 // Bit reversal algorithm first reverses the bits of each byte followed by 6044 // a byte level reversal for multi-byte primitive types (short/int/long). 6045 // Algorithm performs a lookup table access to get reverse bit sequence 6046 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6047 // is obtained by swapping the reverse bit sequences of upper and lower 6048 // nibble of a byte. 6049 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6050 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6051 if (VM_Version::supports_avx512vlbw()) { 6052 6053 // Get the reverse bit sequence of lower nibble of each byte. 6054 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6055 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6056 evpandq(dst, xtmp2, src, vec_enc); 6057 vpshufb(dst, xtmp1, dst, vec_enc); 6058 vpsllq(dst, dst, 4, vec_enc); 6059 6060 // Get the reverse bit sequence of upper nibble of each byte. 6061 vpandn(xtmp2, xtmp2, src, vec_enc); 6062 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6063 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6064 6065 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6066 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6067 evporq(xtmp2, dst, xtmp2, vec_enc); 6068 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6069 6070 } else if(vec_enc == Assembler::AVX_512bit) { 6071 // Shift based bit reversal. 6072 assert(bt == T_LONG || bt == T_INT, ""); 6073 6074 // Swap lower and upper nibble of each byte. 6075 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6076 6077 // Swap two least and most significant bits of each nibble. 6078 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6079 6080 // Swap adjacent pair of bits. 6081 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6082 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6083 6084 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6085 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6086 } else { 6087 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6088 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6089 6090 // Get the reverse bit sequence of lower nibble of each byte. 6091 vpand(dst, xtmp2, src, vec_enc); 6092 vpshufb(dst, xtmp1, dst, vec_enc); 6093 vpsllq(dst, dst, 4, vec_enc); 6094 6095 // Get the reverse bit sequence of upper nibble of each byte. 6096 vpandn(xtmp2, xtmp2, src, vec_enc); 6097 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6098 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6099 6100 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6101 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
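// Worked example for one byte: for src = 0b10110001 the reversed low nibble
// (0001 -> 1000) now occupies the high half after the shift left by 4, the reversed
// high nibble (1011 -> 1101) occupies the low half, and the OR below yields
// 0b10001101, the fully bit-reversed byte.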
6102 vpor(xtmp2, dst, xtmp2, vec_enc); 6103 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6104 } 6105 } 6106 6107 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6108 XMMRegister xtmp, Register rscratch) { 6109 assert(VM_Version::supports_gfni(), ""); 6110 assert(rscratch != noreg || always_reachable(mask), "missing"); 6111 6112 // Galois field instruction based bit reversal based on following algorithm. 6113 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6114 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6115 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6116 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6117 } 6118 6119 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6120 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6121 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6122 evpandq(dst, xtmp1, src, vec_enc); 6123 vpsllq(dst, dst, nbits, vec_enc); 6124 vpandn(xtmp1, xtmp1, src, vec_enc); 6125 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6126 evporq(dst, dst, xtmp1, vec_enc); 6127 } 6128 6129 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6130 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6131 // Shift based bit reversal. 6132 assert(VM_Version::supports_evex(), ""); 6133 switch(bt) { 6134 case T_LONG: 6135 // Swap upper and lower double word of each quad word. 6136 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6137 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6138 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6139 break; 6140 case T_INT: 6141 // Swap upper and lower word of each double word. 6142 evprord(xtmp1, k0, src, 16, true, vec_enc); 6143 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6144 break; 6145 case T_CHAR: 6146 case T_SHORT: 6147 // Swap upper and lower byte of each word. 6148 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6149 break; 6150 case T_BYTE: 6151 evmovdquq(dst, k0, src, true, vec_enc); 6152 break; 6153 default: 6154 fatal("Unsupported type %s", type2name(bt)); 6155 break; 6156 } 6157 } 6158 6159 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6160 if (bt == T_BYTE) { 6161 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6162 evmovdquq(dst, k0, src, true, vec_enc); 6163 } else { 6164 vmovdqu(dst, src); 6165 } 6166 return; 6167 } 6168 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6169 // pre-computed shuffle indices. 
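// For T_INT, for instance, the mask reorders the bytes of every doubleword as
// 3,2,1,0; the T_SHORT/T_CHAR and T_LONG masks perform the analogous per-element swap.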
6170 switch(bt) { 6171 case T_LONG: 6172 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6173 break; 6174 case T_INT: 6175 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6176 break; 6177 case T_CHAR: 6178 case T_SHORT: 6179 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6180 break; 6181 default: 6182 fatal("Unsupported type %s", type2name(bt)); 6183 break; 6184 } 6185 vpshufb(dst, src, dst, vec_enc); 6186 } 6187 6188 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6189 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6190 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6191 assert(is_integral_type(bt), ""); 6192 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6193 assert(VM_Version::supports_avx512cd(), ""); 6194 switch(bt) { 6195 case T_LONG: 6196 evplzcntq(dst, ktmp, src, merge, vec_enc); 6197 break; 6198 case T_INT: 6199 evplzcntd(dst, ktmp, src, merge, vec_enc); 6200 break; 6201 case T_SHORT: 6202 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6203 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6204 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6205 vpunpckhwd(dst, xtmp1, src, vec_enc); 6206 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6207 vpackusdw(dst, xtmp2, dst, vec_enc); 6208 break; 6209 case T_BYTE: 6210 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6211 // accessing the lookup table. 6212 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6213 // accessing the lookup table. 6214 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6215 assert(VM_Version::supports_avx512bw(), ""); 6216 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6217 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6218 vpand(xtmp2, dst, src, vec_enc); 6219 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6220 vpsrlw(xtmp3, src, 4, vec_enc); 6221 vpand(xtmp3, dst, xtmp3, vec_enc); 6222 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6223 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6224 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6225 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6226 break; 6227 default: 6228 fatal("Unsupported type %s", type2name(bt)); 6229 break; 6230 } 6231 } 6232 6233 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6234 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6235 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6236 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6237 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6238 // accessing the lookup table. 6239 vpand(dst, xtmp2, src, vec_enc); 6240 vpshufb(dst, xtmp1, dst, vec_enc); 6241 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6242 // accessing the lookup table. 6243 vpsrlw(xtmp3, src, 4, vec_enc); 6244 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6245 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6246 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
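// (A zero high nibble contributes a count of 4 from the lookup table, so in that
// case the byte's leading zero count is T2 + T1; otherwise T2 alone is the answer.)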
6247 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6248 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6249 vpaddb(dst, dst, xtmp2, vec_enc); 6250 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6251 } 6252 6253 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6254 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6255 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6256 // Add zero counts of lower byte and upper byte of a word if 6257 // upper byte holds a zero value. 6258 vpsrlw(xtmp3, src, 8, vec_enc); 6259 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6260 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6261 vpsllw(xtmp2, dst, 8, vec_enc); 6262 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6263 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6264 vpsrlw(dst, dst, 8, vec_enc); 6265 } 6266 6267 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6268 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6269 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6270 // hence biased exponent can be used to compute leading zero count as per 6271 // following formula:- 6272 // LZCNT = 32 - (biased_exp - 127) 6273 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6274 6275 // Broadcast 0xFF 6276 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6277 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6278 6279 // Extract biased exponent. 6280 vcvtdq2ps(dst, src, vec_enc); 6281 vpsrld(dst, dst, 23, vec_enc); 6282 vpand(dst, dst, xtmp1, vec_enc); 6283 6284 // Broadcast 127. 6285 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6286 // Exponent = biased_exp - 127 6287 vpsubd(dst, dst, xtmp1, vec_enc); 6288 6289 // Exponent = Exponent + 1 6290 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6291 vpaddd(dst, dst, xtmp3, vec_enc); 6292 6293 // Replace -ve exponent with zero, exponent is -ve when src 6294 // lane contains a zero value. 6295 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6296 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6297 6298 // Rematerialize broadcast 32. 6299 vpslld(xtmp1, xtmp3, 5, vec_enc); 6300 // Exponent is 32 if corresponding source lane contains max_int value. 6301 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6302 // LZCNT = 32 - exponent 6303 vpsubd(dst, xtmp1, dst, vec_enc); 6304 6305 // Replace LZCNT with a value 1 if corresponding source lane 6306 // contains max_int value. 6307 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6308 6309 // Replace biased_exp with 0 if source lane value is less than zero. 6310 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6311 vblendvps(dst, dst, xtmp2, src, vec_enc); 6312 } 6313 6314 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6315 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6316 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6317 // Add zero counts of lower word and upper word of a double word if 6318 // upper word holds a zero value. 6319 vpsrld(xtmp3, src, 16, vec_enc); 6320 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6321 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6322 vpslld(xtmp2, dst, 16, vec_enc); 6323 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6324 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6325 vpsrld(dst, dst, 16, vec_enc); 6326 // Add zero counts of lower doubleword and upper doubleword of a 6327 // quadword if upper doubleword holds a zero value. 
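// The combine step below follows the scalar identity (illustrative only):
//   clz64(x) = (hi32(x) == 0) ? 32 + clz32(lo32(x)) : clz32(hi32(x))
// computed lane-wise: the per-doubleword counts are summed only for quadwords whose
// upper doubleword is zero, and the final right shift moves the combined count from
// the upper half into the full quadword lane.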
6328 vpsrlq(xtmp3, src, 32, vec_enc); 6329 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6330 vpsllq(xtmp2, dst, 32, vec_enc); 6331 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6332 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6333 vpsrlq(dst, dst, 32, vec_enc); 6334 } 6335 6336 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6337 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6338 Register rtmp, int vec_enc) { 6339 assert(is_integral_type(bt), "unexpected type"); 6340 assert(vec_enc < Assembler::AVX_512bit, ""); 6341 switch(bt) { 6342 case T_LONG: 6343 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6344 break; 6345 case T_INT: 6346 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6347 break; 6348 case T_SHORT: 6349 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6350 break; 6351 case T_BYTE: 6352 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6353 break; 6354 default: 6355 fatal("Unsupported type %s", type2name(bt)); 6356 break; 6357 } 6358 } 6359 6360 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6361 switch(bt) { 6362 case T_BYTE: 6363 vpsubb(dst, src1, src2, vec_enc); 6364 break; 6365 case T_SHORT: 6366 vpsubw(dst, src1, src2, vec_enc); 6367 break; 6368 case T_INT: 6369 vpsubd(dst, src1, src2, vec_enc); 6370 break; 6371 case T_LONG: 6372 vpsubq(dst, src1, src2, vec_enc); 6373 break; 6374 default: 6375 fatal("Unsupported type %s", type2name(bt)); 6376 break; 6377 } 6378 } 6379 6380 // Trailing zero count computation is based on leading zero count operation as per 6381 // following equation. All AVX3 targets support AVX512CD feature which offers 6382 // direct vector instruction to compute leading zero count. 
6383 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 6384 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6385 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6386 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6387 assert(is_integral_type(bt), ""); 6388 // xtmp = -1 6389 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6390 // xtmp = xtmp + src 6391 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6392 // xtmp = xtmp & ~src 6393 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6394 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6395 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6396 vpsub(bt, dst, xtmp4, dst, vec_enc); 6397 } 6398 6399 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation 6400 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 6401 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6402 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6403 assert(is_integral_type(bt), ""); 6404 // xtmp = 0 6405 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 6406 // xtmp = 0 - src 6407 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6408 // xtmp = xtmp | src 6409 vpor(xtmp3, xtmp3, src, vec_enc); 6410 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6411 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6412 vpsub(bt, dst, xtmp1, dst, vec_enc); 6413 } 6414 6415 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6416 Label done; 6417 Label neg_divisor_fastpath; 6418 cmpl(divisor, 0); 6419 jccb(Assembler::less, neg_divisor_fastpath); 6420 xorl(rdx, rdx); 6421 divl(divisor); 6422 jmpb(done); 6423 bind(neg_divisor_fastpath); 6424 // Fastpath for divisor < 0: 6425 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6426 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6427 movl(rdx, rax); 6428 subl(rdx, divisor); 6429 if (VM_Version::supports_bmi1()) { 6430 andnl(rax, rdx, rax); 6431 } else { 6432 notl(rdx); 6433 andl(rax, rdx); 6434 } 6435 shrl(rax, 31); 6436 bind(done); 6437 } 6438 6439 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6440 Label done; 6441 Label neg_divisor_fastpath; 6442 cmpl(divisor, 0); 6443 jccb(Assembler::less, neg_divisor_fastpath); 6444 xorl(rdx, rdx); 6445 divl(divisor); 6446 jmpb(done); 6447 bind(neg_divisor_fastpath); 6448 // Fastpath when divisor < 0: 6449 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6450 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6451 movl(rdx, rax); 6452 subl(rax, divisor); 6453 if (VM_Version::supports_bmi1()) { 6454 andnl(rax, rax, rdx); 6455 } else { 6456 notl(rax); 6457 andl(rax, rdx); 6458 } 6459 sarl(rax, 31); 6460 andl(rax, divisor); 6461 subl(rdx, rax); 6462 bind(done); 6463 } 6464 6465 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6466 Label done; 6467 Label neg_divisor_fastpath; 6468 6469 cmpl(divisor, 0); 6470 jccb(Assembler::less, neg_divisor_fastpath); 6471 xorl(rdx, rdx); 6472 divl(divisor); 6473 jmpb(done); 6474 bind(neg_divisor_fastpath); 6475 // Fastpath for divisor < 0: 6476 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6477 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6478 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6479 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6480 movl(rdx, rax); 6481 subl(rax, divisor); 6482 if (VM_Version::supports_bmi1()) { 6483 andnl(rax, rax, rdx); 6484 } else { 6485 notl(rax); 6486 andl(rax, rdx); 6487 } 6488 movl(tmp, rax); 6489 shrl(rax, 31); // quotient 6490 sarl(tmp, 31); 6491 andl(tmp, divisor); 6492 subl(rdx, tmp); // remainder 6493 bind(done); 6494 } 6495 6496 #ifdef _LP64 6497 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6498 XMMRegister xtmp2, Register rtmp) { 6499 if(VM_Version::supports_gfni()) { 6500 // Galois field instruction based bit reversal based on following algorithm. 6501 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6502 mov64(rtmp, 0x8040201008040201L); 6503 movq(xtmp1, src); 6504 movq(xtmp2, rtmp); 6505 gf2p8affineqb(xtmp1, xtmp2, 0); 6506 movq(dst, xtmp1); 6507 } else { 6508 // Swap even and odd numbered bits. 6509 movl(rtmp, src); 6510 andl(rtmp, 0x55555555); 6511 shll(rtmp, 1); 6512 movl(dst, src); 6513 andl(dst, 0xAAAAAAAA); 6514 shrl(dst, 1); 6515 orl(dst, rtmp); 6516 6517 // Swap LSB and MSB 2 bits of each nibble. 6518 movl(rtmp, dst); 6519 andl(rtmp, 0x33333333); 6520 shll(rtmp, 2); 6521 andl(dst, 0xCCCCCCCC); 6522 shrl(dst, 2); 6523 orl(dst, rtmp); 6524 6525 // Swap LSB and MSB 4 bits of each byte. 6526 movl(rtmp, dst); 6527 andl(rtmp, 0x0F0F0F0F); 6528 shll(rtmp, 4); 6529 andl(dst, 0xF0F0F0F0); 6530 shrl(dst, 4); 6531 orl(dst, rtmp); 6532 } 6533 bswapl(dst); 6534 } 6535 6536 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6537 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6538 if(VM_Version::supports_gfni()) { 6539 // Galois field instruction based bit reversal based on following algorithm. 6540 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6541 mov64(rtmp1, 0x8040201008040201L); 6542 movq(xtmp1, src); 6543 movq(xtmp2, rtmp1); 6544 gf2p8affineqb(xtmp1, xtmp2, 0); 6545 movq(dst, xtmp1); 6546 } else { 6547 // Swap even and odd numbered bits. 6548 movq(rtmp1, src); 6549 mov64(rtmp2, 0x5555555555555555L); 6550 andq(rtmp1, rtmp2); 6551 shlq(rtmp1, 1); 6552 movq(dst, src); 6553 notq(rtmp2); 6554 andq(dst, rtmp2); 6555 shrq(dst, 1); 6556 orq(dst, rtmp1); 6557 6558 // Swap LSB and MSB 2 bits of each nibble. 6559 movq(rtmp1, dst); 6560 mov64(rtmp2, 0x3333333333333333L); 6561 andq(rtmp1, rtmp2); 6562 shlq(rtmp1, 2); 6563 notq(rtmp2); 6564 andq(dst, rtmp2); 6565 shrq(dst, 2); 6566 orq(dst, rtmp1); 6567 6568 // Swap LSB and MSB 4 bits of each byte. 
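// Scalar equivalent of this swap step (illustrative only):
//   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
// Together with the 1-bit and 2-bit swaps above and the final bswapq, this reverses all
// 64 bits: first reverse the bits within each byte, then reverse the byte order.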
6569 movq(rtmp1, dst); 6570 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6571 andq(rtmp1, rtmp2); 6572 shlq(rtmp1, 4); 6573 notq(rtmp2); 6574 andq(dst, rtmp2); 6575 shrq(dst, 4); 6576 orq(dst, rtmp1); 6577 } 6578 bswapq(dst); 6579 } 6580 6581 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6582 Label done; 6583 Label neg_divisor_fastpath; 6584 cmpq(divisor, 0); 6585 jccb(Assembler::less, neg_divisor_fastpath); 6586 xorl(rdx, rdx); 6587 divq(divisor); 6588 jmpb(done); 6589 bind(neg_divisor_fastpath); 6590 // Fastpath for divisor < 0: 6591 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6592 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6593 movq(rdx, rax); 6594 subq(rdx, divisor); 6595 if (VM_Version::supports_bmi1()) { 6596 andnq(rax, rdx, rax); 6597 } else { 6598 notq(rdx); 6599 andq(rax, rdx); 6600 } 6601 shrq(rax, 63); 6602 bind(done); 6603 } 6604 6605 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6606 Label done; 6607 Label neg_divisor_fastpath; 6608 cmpq(divisor, 0); 6609 jccb(Assembler::less, neg_divisor_fastpath); 6610 xorq(rdx, rdx); 6611 divq(divisor); 6612 jmp(done); 6613 bind(neg_divisor_fastpath); 6614 // Fastpath when divisor < 0: 6615 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6616 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6617 movq(rdx, rax); 6618 subq(rax, divisor); 6619 if (VM_Version::supports_bmi1()) { 6620 andnq(rax, rax, rdx); 6621 } else { 6622 notq(rax); 6623 andq(rax, rdx); 6624 } 6625 sarq(rax, 63); 6626 andq(rax, divisor); 6627 subq(rdx, rax); 6628 bind(done); 6629 } 6630 6631 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6632 Label done; 6633 Label neg_divisor_fastpath; 6634 cmpq(divisor, 0); 6635 jccb(Assembler::less, neg_divisor_fastpath); 6636 xorq(rdx, rdx); 6637 divq(divisor); 6638 jmp(done); 6639 bind(neg_divisor_fastpath); 6640 // Fastpath for divisor < 0: 6641 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6642 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6643 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6644 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6645 movq(rdx, rax); 6646 subq(rax, divisor); 6647 if (VM_Version::supports_bmi1()) { 6648 andnq(rax, rax, rdx); 6649 } else { 6650 notq(rax); 6651 andq(rax, rdx); 6652 } 6653 movq(tmp, rax); 6654 shrq(rax, 63); // quotient 6655 sarq(tmp, 63); 6656 andq(tmp, divisor); 6657 subq(rdx, tmp); // remainder 6658 bind(done); 6659 } 6660 #endif 6661 6662 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6663 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6664 int vlen_enc) { 6665 assert(VM_Version::supports_avx512bw(), ""); 6666 // Byte shuffles are inlane operations and indices are determined using 6667 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6668 // normalized to index range 0-15. This makes sure that all the multiples 6669 // of an index value are placed at same relative position in 128 bit 6670 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6671 // will be 16th element in their respective 128 bit lanes. 
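// Illustrative example (not compiled): shuffle index 37 = 0x25 selects byte 5 of the
// third 128-bit lane. The comparisons below classify 37 into the 32..47 range, the
// matching evshufi64x2 (imm 0xAA) broadcasts that lane, and vpshufb's use of only the
// low 4 index bits (37 & 0x0F == 5) then picks byte 5 from the broadcast lane.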
6672 movl(rtmp, 16); 6673 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6674 6675 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16. 6676 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6677 // original shuffle indices and move the shuffled lanes corresponding to true 6678 // mask to destination vector. 6679 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6680 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6681 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6682 6683 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6684 // and broadcasting second 128 bit lane. 6685 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6686 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6687 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6688 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6689 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6690 6691 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6692 // and broadcasting third 128 bit lane. 6693 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6694 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6695 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6696 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6697 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6698 6699 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6700 // and broadcasting fourth 128 bit lane. 6701 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6702 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6703 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6704 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6705 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6706 } 6707 6708 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6709 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6710 if (vlen_enc == AVX_128bit) { 6711 vpermilps(dst, src, shuffle, vlen_enc); 6712 } else if (bt == T_INT) { 6713 vpermd(dst, shuffle, src, vlen_enc); 6714 } else { 6715 assert(bt == T_FLOAT, ""); 6716 vpermps(dst, shuffle, src, vlen_enc); 6717 } 6718 } 6719 6720 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6721 switch(elem_bt) { 6722 case T_BYTE: 6723 if (ideal_opc == Op_SaturatingAddV) { 6724 vpaddsb(dst, src1, src2, vlen_enc); 6725 } else { 6726 assert(ideal_opc == Op_SaturatingSubV, ""); 6727 vpsubsb(dst, src1, src2, vlen_enc); 6728 } 6729 break; 6730 case T_SHORT: 6731 if (ideal_opc == Op_SaturatingAddV) { 6732 vpaddsw(dst, src1, src2, vlen_enc); 6733 } else { 6734 assert(ideal_opc == Op_SaturatingSubV, ""); 6735 vpsubsw(dst, src1, src2, vlen_enc); 6736 } 6737 break; 6738 default: 6739 fatal("Unsupported type %s", type2name(elem_bt)); 6740 break; 6741 } 6742 } 6743 6744 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6745 switch(elem_bt) { 6746 case T_BYTE: 6747 if (ideal_opc == Op_SaturatingAddV) { 6748 vpaddusb(dst, src1, src2, vlen_enc); 6749 } else { 6750 assert(ideal_opc == Op_SaturatingSubV, ""); 6751 vpsubusb(dst, src1, src2, vlen_enc); 6752 } 6753 break; 6754 case T_SHORT: 6755 if (ideal_opc == Op_SaturatingAddV) { 6756 vpaddusw(dst, src1, src2, vlen_enc); 6757 } else { 6758 assert(ideal_opc
== Op_SaturatingSubV, ""); 6759 vpsubusw(dst, src1, src2, vlen_enc); 6760 } 6761 break; 6762 default: 6763 fatal("Unsupported type %s", type2name(elem_bt)); 6764 break; 6765 } 6766 } 6767 6768 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6769 XMMRegister src2, KRegister ktmp, int vlen_enc) { 6770 // For unsigned subtraction, overflow happens when the second input is greater than the first input. 6771 // overflow_mask = Inp1 <u Inp2 6772 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc); 6773 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative) 6774 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false); 6775 } 6776 6777 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6778 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6779 // Emulate unsigned comparison using signed comparison. 6780 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE 6781 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true); 6782 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc); 6783 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc); 6784 6785 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc); 6786 6787 // Res = INP1 - INP2 (non-commutative and non-associative) 6788 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6789 // Res = Mask ? Zero : Res 6790 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); 6791 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc); 6792 } 6793 6794 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6795 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) { 6796 // The unsigned value range contains only non-negative numbers, so only upper-bound saturation can occur. 6797 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2) 6798 // Res = Signed Add INP1, INP2 6799 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6800 // T1 = SRC1 | SRC2 6801 vpor(xtmp1, src1, src2, vlen_enc); 6802 // Max_Unsigned = -1 6803 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6804 // Unsigned compare: Mask = Res <u T1 6805 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc); 6806 // res = Mask ? Max_Unsigned : Res 6807 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc); 6808 } 6809 6810 // 6811 // Hacker's Delight, section 2-13, lists the following overflow detection check for saturating 6812 // unsigned addition: 6813 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1 6814 // 6815 // We empirically determined its semantic equivalence to the following reduced expression 6816 // overflow_mask = (a + b) <u (a | b) 6817 // 6818 // and also verified it through the Alive2 solver. 6819 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6820 // 6821 6822 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6823 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6824 // Res = Signed Add INP1, INP2 6825 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6826 // Compute T1 = INP1 | INP2 6827 vpor(xtmp3, src1, src2, vlen_enc); 6828 // xtmp2 = minimum signed value (MIN_VALUE).
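// The signed-compare emulation below relies on the identity (illustrative only):
//   a <u b  <==>  (a + MIN_VALUE) <s (b + MIN_VALUE)
// e.g. for ints, 0x80000000 <u 0x00000001 is false; after biasing both by MIN_VALUE the
// operands become 0x00000000 and 0x80000001, and the signed compare is false as well.
// The vpgenmin_value call also leaves all-ones (the unsigned max) in xtmp1, which the
// final blend uses as the saturated value.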
6829 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6830 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6831 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6832 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6833 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6834 // Compute overflow detection mask = Res<1> <s T1 6835 if (elem_bt == T_INT) { 6836 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6837 } else { 6838 assert(elem_bt == T_LONG, ""); 6839 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6840 } 6841 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6842 } 6843 6844 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6845 int vlen_enc, bool xtmp2_hold_M1) { 6846 if (VM_Version::supports_avx512dq()) { 6847 evpmovq2m(ktmp, src, vlen_enc); 6848 } else { 6849 assert(VM_Version::supports_evex(), ""); 6850 if (!xtmp2_hold_M1) { 6851 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6852 } 6853 evpsraq(xtmp1, src, 63, vlen_enc); 6854 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6855 } 6856 } 6857 6858 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6859 int vlen_enc, bool xtmp2_hold_M1) { 6860 if (VM_Version::supports_avx512dq()) { 6861 evpmovd2m(ktmp, src, vlen_enc); 6862 } else { 6863 assert(VM_Version::supports_evex(), ""); 6864 if (!xtmp2_hold_M1) { 6865 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6866 } 6867 vpsrad(xtmp1, src, 31, vlen_enc); 6868 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6869 } 6870 } 6871 6872 6873 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6874 if (elem_bt == T_LONG) { 6875 if (VM_Version::supports_evex()) { 6876 evpsraq(dst, src, 63, vlen_enc); 6877 } else { 6878 vpsrad(dst, src, 31, vlen_enc); 6879 vpshufd(dst, dst, 0xF5, vlen_enc); 6880 } 6881 } else { 6882 assert(elem_bt == T_INT, ""); 6883 vpsrad(dst, src, 31, vlen_enc); 6884 } 6885 } 6886 6887 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6888 if (compute_allones) { 6889 if (vlen_enc == Assembler::AVX_512bit) { 6890 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6891 } else { 6892 vpcmpeqq(allones, allones, allones, vlen_enc); 6893 } 6894 } 6895 if (elem_bt == T_LONG) { 6896 vpsrlq(dst, allones, 1, vlen_enc); 6897 } else { 6898 assert(elem_bt == T_INT, ""); 6899 vpsrld(dst, allones, 1, vlen_enc); 6900 } 6901 } 6902 6903 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6904 if (compute_allones) { 6905 if (vlen_enc == Assembler::AVX_512bit) { 6906 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6907 } else { 6908 vpcmpeqq(allones, allones, allones, vlen_enc); 6909 } 6910 } 6911 if (elem_bt == T_LONG) { 6912 vpsllq(dst, allones, 63, vlen_enc); 6913 } else { 6914 assert(elem_bt == T_INT, ""); 6915 vpslld(dst, allones, 31, vlen_enc); 6916 } 6917 } 6918 6919 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6920 Assembler::ComparisonPredicate cond, int vlen_enc) { 6921 switch(elem_bt) { 6922 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6923 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6924 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6925 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6926 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 6927 } 6928 } 6929 6930 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6931 switch(elem_bt) { 6932 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break; 6933 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break; 6934 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break; 6935 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break; 6936 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6937 } 6938 } 6939 6940 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1, 6941 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) { 6942 if (elem_bt == T_LONG) { 6943 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6944 } else { 6945 assert(elem_bt == T_INT, ""); 6946 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6947 } 6948 } 6949 6950 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6951 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6952 KRegister ktmp1, KRegister ktmp2, int vlen_enc) { 6953 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6954 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness. 6955 // Overflow detection based on Hacker's delight section 2-13. 6956 if (ideal_opc == Op_SaturatingAddV) { 6957 // res = src1 + src2 6958 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6959 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6960 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6961 vpxor(xtmp1, dst, src1, vlen_enc); 6962 vpxor(xtmp2, dst, src2, vlen_enc); 6963 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6964 } else { 6965 assert(ideal_opc == Op_SaturatingSubV, ""); 6966 // res = src1 - src2 6967 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6968 // Overflow occurs when both inputs have opposite polarity and 6969 // result polarity does not comply with first input polarity. 6970 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6971 vpxor(xtmp1, src1, src2, vlen_enc); 6972 vpxor(xtmp2, dst, src1, vlen_enc); 6973 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6974 } 6975 6976 // Compute overflow detection mask. 6977 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc); 6978 // Note: xtmp1 hold -1 in all its lanes after above call. 6979 6980 // Compute mask based on first input polarity. 6981 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true); 6982 6983 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6984 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6985 6986 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to 6987 // set bits in first input polarity mask holds a min value. 6988 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc); 6989 // Blend destination lanes with saturated values using overflow detection mask. 
6990 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc); 6991 } 6992 6993 6994 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6995 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6996 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) { 6997 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6998 // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness. 6999 // Overflow detection is based on Hacker's Delight, section 2-13. 7000 if (ideal_opc == Op_SaturatingAddV) { 7001 // res = src1 + src2 7002 vpadd(elem_bt, dst, src1, src2, vlen_enc); 7003 // Overflow occurs if the result's polarity does not match the (equal) polarity of the inputs. 7004 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 7005 vpxor(xtmp1, dst, src1, vlen_enc); 7006 vpxor(xtmp2, dst, src2, vlen_enc); 7007 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 7008 } else { 7009 assert(ideal_opc == Op_SaturatingSubV, ""); 7010 // res = src1 - src2 7011 vpsub(elem_bt, dst, src1, src2, vlen_enc); 7012 // Overflow occurs when the inputs have opposite polarity and the 7013 // result's polarity does not match the first input's polarity. 7014 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 7015 vpxor(xtmp1, src1, src2, vlen_enc); 7016 vpxor(xtmp2, dst, src1, vlen_enc); 7017 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 7018 } 7019 7020 // Sign-extend to compute the overflow detection mask. 7021 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc); 7022 7023 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc); 7024 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc); 7025 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 7026 7027 // Compose the saturating min/max vector using the first input polarity mask. 7028 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc); 7029 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc); 7030 7031 // Blend the result with the saturating vector using the overflow detection mask.
7032 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7033 } 7034 7035 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7036 switch(elem_bt) { 7037 case T_BYTE: 7038 if (ideal_opc == Op_SaturatingAddV) { 7039 vpaddsb(dst, src1, src2, vlen_enc); 7040 } else { 7041 assert(ideal_opc == Op_SaturatingSubV, ""); 7042 vpsubsb(dst, src1, src2, vlen_enc); 7043 } 7044 break; 7045 case T_SHORT: 7046 if (ideal_opc == Op_SaturatingAddV) { 7047 vpaddsw(dst, src1, src2, vlen_enc); 7048 } else { 7049 assert(ideal_opc == Op_SaturatingSubV, ""); 7050 vpsubsw(dst, src1, src2, vlen_enc); 7051 } 7052 break; 7053 default: 7054 fatal("Unsupported type %s", type2name(elem_bt)); 7055 break; 7056 } 7057 } 7058 7059 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7060 switch(elem_bt) { 7061 case T_BYTE: 7062 if (ideal_opc == Op_SaturatingAddV) { 7063 vpaddusb(dst, src1, src2, vlen_enc); 7064 } else { 7065 assert(ideal_opc == Op_SaturatingSubV, ""); 7066 vpsubusb(dst, src1, src2, vlen_enc); 7067 } 7068 break; 7069 case T_SHORT: 7070 if (ideal_opc == Op_SaturatingAddV) { 7071 vpaddusw(dst, src1, src2, vlen_enc); 7072 } else { 7073 assert(ideal_opc == Op_SaturatingSubV, ""); 7074 vpsubusw(dst, src1, src2, vlen_enc); 7075 } 7076 break; 7077 default: 7078 fatal("Unsupported type %s", type2name(elem_bt)); 7079 break; 7080 } 7081 } 7082 7083 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7084 XMMRegister src2, int vlen_enc) { 7085 switch(elem_bt) { 7086 case T_BYTE: 7087 evpermi2b(dst, src1, src2, vlen_enc); 7088 break; 7089 case T_SHORT: 7090 evpermi2w(dst, src1, src2, vlen_enc); 7091 break; 7092 case T_INT: 7093 evpermi2d(dst, src1, src2, vlen_enc); 7094 break; 7095 case T_LONG: 7096 evpermi2q(dst, src1, src2, vlen_enc); 7097 break; 7098 case T_FLOAT: 7099 evpermi2ps(dst, src1, src2, vlen_enc); 7100 break; 7101 case T_DOUBLE: 7102 evpermi2pd(dst, src1, src2, vlen_enc); 7103 break; 7104 default: 7105 fatal("Unsupported type %s", type2name(elem_bt)); 7106 break; 7107 } 7108 } 7109 7110 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7111 if (is_unsigned) { 7112 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7113 } else { 7114 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7115 } 7116 } 7117 7118 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7119 if (is_unsigned) { 7120 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7121 } else { 7122 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7123 } 7124 }
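// For reference, a scalar sketch (illustrative only, not compiled) of the overflow
// handling used by vector_addsub_dq_saturating_{evex,avx} above, following
// Hacker's Delight, section 2-13:
//   int32_t sat_add(int32_t a, int32_t b) {
//     int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);   // wrapping add
//     bool overflow = (((res ^ a) & (res ^ b)) < 0);         // sign bit of the detector
//     // The saturation value depends only on the sign of the first input:
//     int32_t sat = (a < 0) ? INT32_MIN : INT32_MAX;
//     return overflow ? sat : res;
//   }
// Subtraction is analogous with overflow = (((a ^ b) & (res ^ a)) < 0).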