1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 53 if (C->clinit_barrier_on_entry()) { 54 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 56 57 Label L_skip_barrier; 58 Register klass = rscratch1; 59 60 mov_metadata(klass, C->method()->holder()->constant_encoding()); 61 clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 62 63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 64 65 bind(L_skip_barrier); 66 } 67 68 int framesize = C->output()->frame_size_in_bytes(); 69 int bangsize = C->output()->bang_size_in_bytes(); 70 bool fp_mode_24b = false; 71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 72 73 // WARNING: Initial instruction MUST be 5 bytes or longer so that 74 // NativeJump::patch_verified_entry will be able to patch out the entry 75 // code safely. The push to verify stack depth is ok at 5 bytes, 76 // the frame allocation can be either 3 or 6 bytes. So if we don't do 77 // stack bang then we must use the 6 byte frame allocation even if 78 // we have no frame. :-( 79 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 80 81 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 82 // Remove word for return addr 83 framesize -= wordSize; 84 stack_bang_size -= wordSize; 85 86 // Calls to C2R adapters often do not accept exceptional returns. 
87 // We require that their callers must bang for them. But be careful, because 88 // some VM calls (such as call site linkage) can use several kilobytes of 89 // stack. But the stack safety zone should account for that. 90 // See bugs 4446381, 4468289, 4497237. 91 if (stack_bang_size > 0) { 92 generate_stack_overflow_check(stack_bang_size); 93 94 // We always push rbp, so that on return to interpreter rbp, will be 95 // restored correctly and we can correct the stack. 96 push(rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 mov(rbp, rsp); 100 } 101 // Remove word for ebp 102 framesize -= wordSize; 103 104 // Create frame 105 if (framesize) { 106 subptr(rsp, framesize); 107 } 108 } else { 109 // Create frame (force generation of a 4 byte immediate value) 110 subptr_imm32(rsp, framesize); 111 112 // Save RBP register now. 113 framesize -= wordSize; 114 movptr(Address(rsp, framesize), rbp); 115 // Save caller's stack pointer into RBP if the frame pointer is preserved. 116 if (PreserveFramePointer) { 117 movptr(rbp, rsp); 118 if (framesize > 0) { 119 addptr(rbp, framesize); 120 } 121 } 122 } 123 124 if (C->needs_stack_repair()) { 125 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 126 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 127 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 128 } 129 130 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 131 framesize -= wordSize; 132 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 133 } 134 135 #ifndef _LP64 136 // If method sets FPU control word do it now 137 if (fp_mode_24b) { 138 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 139 } 140 if (UseSSE >= 2 && VerifyFPU) { 141 verify_FPU(0, "FPU stack must be clean on entry"); 142 } 143 #endif 144 145 #ifdef ASSERT 146 if (VerifyStackAtCalls) { 147 Label L; 148 push(rax); 149 mov(rax, rsp); 150 andptr(rax, StackAlignmentInBytes-1); 151 cmpptr(rax, StackAlignmentInBytes-wordSize); 152 pop(rax); 153 jcc(Assembler::equal, L); 154 STOP("Stack is not properly aligned!"); 155 bind(L); 156 } 157 #endif 158 } 159 160 void C2_MacroAssembler::entry_barrier() { 161 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 162 #ifdef _LP64 163 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 164 Label dummy_slow_path; 165 Label dummy_continuation; 166 Label* slow_path = &dummy_slow_path; 167 Label* continuation = &dummy_continuation; 168 if (!Compile::current()->output()->in_scratch_emit_size()) { 169 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 170 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 171 Compile::current()->output()->add_stub(stub); 172 slow_path = &stub->entry(); 173 continuation = &stub->continuation(); 174 } 175 bs->nmethod_entry_barrier(this, slow_path, continuation); 176 #else 177 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
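  // As a rough conceptual sketch (not the exact code the BarrierSetAssembler emits),
  // the barrier compares a guard value embedded in this nmethod against the current
  // disarm value and calls into the runtime when they differ:
  //
  //   if (nmethod_guard_value != current_disarm_value) {
  //     slow_path();        // runtime fixes up / re-arms the nmethod
  //   }
  //   continuation:
  //
  // On x86_64 the slow path lives in the C2EntryBarrierStub added above; in the
  // x86_32 case below it is emitted inline by the barrier set assembler.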
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
247 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 248 // 249 // * Intrinsify notify() and notifyAll() for the common cases where the 250 // object is locked by the calling thread but the waitlist is empty. 251 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 252 // 253 // * use jccb and jmpb instead of jcc and jmp to improve code density. 254 // But beware of excessive branch density on AMD Opterons. 255 // 256 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 257 // or failure of the fast path. If the fast path fails then we pass 258 // control to the slow path, typically in C. In fast_lock and 259 // fast_unlock we often branch to DONE_LABEL, just to find that C2 260 // will emit a conditional branch immediately after the node. 261 // So we have branches to branches and lots of ICC.ZF games. 262 // Instead, it might be better to have C2 pass a "FailureLabel" 263 // into fast_lock and fast_unlock. In the case of success, control 264 // will drop through the node. ICC.ZF is undefined at exit. 265 // In the case of failure, the node will branch directly to the 266 // FailureLabel 267 268 269 // obj: object to lock 270 // box: on-stack box address (displaced header location) - KILLED 271 // rax,: tmp -- KILLED 272 // scr: tmp -- KILLED 273 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 274 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 275 Metadata* method_data) { 276 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 277 // Ensure the register assignments are disjoint 278 assert(tmpReg == rax, ""); 279 assert(cx1Reg == noreg, ""); 280 assert(cx2Reg == noreg, ""); 281 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 282 283 // Possible cases that we'll encounter in fast_lock 284 // ------------------------------------------------ 285 // * Inflated 286 // -- unlocked 287 // -- Locked 288 // = by self 289 // = by other 290 // * neutral 291 // * stack-locked 292 // -- by self 293 // = sp-proximity test hits 294 // = sp-proximity test generates false-negative 295 // -- by other 296 // 297 298 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 299 300 if (DiagnoseSyncOnValueBasedClasses != 0) { 301 load_klass(tmpReg, objReg, scrReg); 302 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 303 jcc(Assembler::notZero, DONE_LABEL); 304 } 305 306 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 307 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 308 jcc(Assembler::notZero, IsInflated); 309 310 if (LockingMode == LM_MONITOR) { 311 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 312 testptr(objReg, objReg); 313 } else { 314 assert(LockingMode == LM_LEGACY, "must be"); 315 // Attempt stack-locking ... 316 orptr (tmpReg, markWord::unlocked_value); 317 if (EnableValhalla) { 318 // Mask inline_type bit such that we go to the slow path if object is an inline type 319 andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place)); 320 } 321 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 322 lock(); 323 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 324 jcc(Assembler::equal, COUNT); // Success 325 326 // Recursive locking. 327 // The object is stack-locked: markword contains stack pointer to BasicLock. 
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // Just take slow path to avoid dealing with 64 bit atomic instructions here.
  orl(boxReg, 1);  // set ICC.ZF=0 to indicate failure
#else
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
#ifdef _LP64
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
#endif
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
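//
// For reference, a rough sketch (not the actual ADLC output) of how C2 consumes the
// ZF protocol described above: cmpFastLock/cmpFastUnlock leave ZF set on success, and
// the conditional branch C2 emits right after the node routes ZF == 0 to the slow path:
//
//   fast_lock(obj, box, rax, scr, ...)    // sets ZF
//   jne   slow_path                       // ZF == 0 -> runtime monitorenter
//   ...                                   // ZF == 1 -> lock acquired, fall through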
406 // 407 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 408 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 409 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 410 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 411 // Arguably given that the spec legislates the JNI case as undefined our implementation 412 // could reasonably *avoid* checking owner in fast_unlock(). 413 // In the interest of performance we elide m->Owner==Self check in unlock. 414 // A perfectly viable alternative is to elide the owner check except when 415 // Xcheck:jni is enabled. 416 417 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 418 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 419 assert(boxReg == rax, ""); 420 assert_different_registers(objReg, boxReg, tmpReg); 421 422 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 423 424 if (LockingMode == LM_LEGACY) { 425 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 426 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 427 } 428 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 429 if (LockingMode != LM_MONITOR) { 430 testptr(tmpReg, markWord::monitor_value); // Inflated? 431 jcc(Assembler::zero, Stacked); 432 } 433 434 // It's inflated. 435 436 #ifndef _LP64 437 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 438 orl(boxReg, 1); // set ICC.ZF=0 to indicate failure 439 jmpb(DONE_LABEL); 440 #else 441 // Despite our balanced locking property we still check that m->_owner == Self 442 // as java routines or native JNI code called by this thread might 443 // have released the lock. 444 // 445 // If there's no contention try a 1-0 exit. That is, exit without 446 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 447 // we detect and recover from the race that the 1-0 exit admits. 448 // 449 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 450 // before it STs null into _owner, releasing the lock. Updates 451 // to data protected by the critical section must be visible before 452 // we drop the lock (and thus before any other thread could acquire 453 // the lock and observe the fields protected by the lock). 454 // IA32's memory-model is SPO, so STs are ordered with respect to 455 // each other and there's no need for an explicit barrier (fence). 456 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 457 Label LSuccess, LNotRecursive; 458 459 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 460 jccb(Assembler::equal, LNotRecursive); 461 462 // Recursive inflated unlock 463 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 464 jmpb(LSuccess); 465 466 bind(LNotRecursive); 467 468 // Set owner to null. 469 // Release to satisfy the JMM 470 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 471 // We need a full fence after clearing owner to avoid stranding. 472 // StoreLoad achieves this. 473 membar(StoreLoad); 474 475 // Check if the entry_list is empty. 476 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD); 477 jccb(Assembler::zero, LSuccess); // If so we are done. 478 479 // Check if there is a successor. 
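  // A non-null successor has already been chosen to re-contend for the lock, so no
  // hand-off is required and we can simply exit. Otherwise fall through: the slow path
  // (SharedRuntime::monitor_exit_helper(), see below) will try to reacquire the monitor
  // and exit it properly so the waiters we just observed on the entry_list are not stranded.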
480 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 481 jccb(Assembler::notZero, LSuccess); // If so we are done. 482 483 // Save the monitor pointer in the current thread, so we can try to 484 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 485 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 486 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 487 488 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 489 jmpb (DONE_LABEL); 490 491 bind (LSuccess); 492 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 493 jmpb (DONE_LABEL); 494 #endif // _LP64 495 496 if (LockingMode == LM_LEGACY) { 497 bind (Stacked); 498 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 499 lock(); 500 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 501 // Intentional fall-thru into DONE_LABEL 502 } 503 504 bind(DONE_LABEL); 505 506 // ZFlag == 1 count in fast path 507 // ZFlag == 0 count in slow path 508 jccb(Assembler::notZero, NO_COUNT); 509 510 bind(COUNT); 511 512 if (LockingMode == LM_LEGACY) { 513 // Count monitors in fast path 514 #ifdef _LP64 515 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 516 #endif 517 } 518 519 xorl(tmpReg, tmpReg); // Set ZF == 1 520 521 bind(NO_COUNT); 522 } 523 524 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 525 Register t, Register thread) { 526 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 527 assert(rax_reg == rax, "Used for CAS"); 528 assert_different_registers(obj, box, rax_reg, t, thread); 529 530 // Handle inflated monitor. 531 Label inflated; 532 // Finish fast lock successfully. ZF value is irrelevant. 533 Label locked; 534 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 535 Label slow_path; 536 537 if (UseObjectMonitorTable) { 538 // Clear cache in case fast locking succeeds. 539 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 540 } 541 542 if (DiagnoseSyncOnValueBasedClasses != 0) { 543 load_klass(rax_reg, obj, t); 544 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 545 jcc(Assembler::notZero, slow_path); 546 } 547 548 const Register mark = t; 549 550 { // Lightweight Lock 551 552 Label push; 553 554 const Register top = UseObjectMonitorTable ? rax_reg : box; 555 556 // Load the mark. 557 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 558 559 // Prefetch top. 560 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 561 562 // Check for monitor (0b10). 563 testptr(mark, markWord::monitor_value); 564 jcc(Assembler::notZero, inflated); 565 566 // Check if lock-stack is full. 567 cmpl(top, LockStack::end_offset() - 1); 568 jcc(Assembler::greater, slow_path); 569 570 // Check if recursive. 571 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 572 jccb(Assembler::equal, push); 573 574 // Try to lock. Transition lock bits 0b01 => 0b00 575 movptr(rax_reg, mark); 576 orptr(rax_reg, markWord::unlocked_value); 577 andptr(mark, ~(int32_t)markWord::unlocked_value); 578 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 579 jcc(Assembler::notEqual, slow_path); 580 581 if (UseObjectMonitorTable) { 582 // Need to reload top, clobbered by CAS. 583 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 584 } 585 bind(push); 586 // After successful lock, push object on lock-stack. 
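    // Note that top is a byte offset from the thread base (see the cmpl against
    // LockStack::end_offset() above), so Address(thread, top) addresses the first
    // free lock-stack slot.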
587 movptr(Address(thread, top), obj); 588 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 589 jmpb(locked); 590 } 591 592 { // Handle inflated monitor. 593 bind(inflated); 594 595 #ifndef _LP64 596 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 597 orl(box, 1); // set ICC.ZF=0 to indicate failure 598 jmpb(slow_path); 599 #else 600 const Register monitor = t; 601 602 if (!UseObjectMonitorTable) { 603 assert(mark == monitor, "should be the same here"); 604 } else { 605 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 606 // Fetch ObjectMonitor* from the cache or take the slow-path. 607 Label monitor_found; 608 609 // Load cache address 610 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 611 612 const int num_unrolled = 2; 613 for (int i = 0; i < num_unrolled; i++) { 614 cmpptr(obj, Address(t)); 615 jccb(Assembler::equal, monitor_found); 616 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 617 } 618 619 Label loop; 620 621 // Search for obj in cache. 622 bind(loop); 623 624 // Check for match. 625 cmpptr(obj, Address(t)); 626 jccb(Assembler::equal, monitor_found); 627 628 // Search until null encountered, guaranteed _null_sentinel at end. 629 cmpptr(Address(t), 1); 630 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 631 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 632 jmpb(loop); 633 634 // Cache hit. 635 bind(monitor_found); 636 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 637 } 638 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 639 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 640 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 641 642 Label monitor_locked; 643 // Lock the monitor. 644 645 if (UseObjectMonitorTable) { 646 // Cache the monitor for unlock before trashing box. On failure to acquire 647 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 648 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 649 } 650 651 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 652 xorptr(rax_reg, rax_reg); 653 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 654 lock(); cmpxchgptr(box, owner_address); 655 jccb(Assembler::equal, monitor_locked); 656 657 // Check if recursive. 658 cmpptr(box, rax_reg); 659 jccb(Assembler::notEqual, slow_path); 660 661 // Recursive. 662 increment(recursions_address); 663 664 bind(monitor_locked); 665 #endif // _LP64 666 } 667 668 bind(locked); 669 // Set ZF = 1 670 xorl(rax_reg, rax_reg); 671 672 #ifdef ASSERT 673 // Check that locked label is reached with ZF set. 674 Label zf_correct; 675 Label zf_bad_zero; 676 jcc(Assembler::zero, zf_correct); 677 jmp(zf_bad_zero); 678 #endif 679 680 bind(slow_path); 681 #ifdef ASSERT 682 // Check that slow_path label is reached with ZF not set. 683 jcc(Assembler::notZero, zf_correct); 684 stop("Fast Lock ZF != 0"); 685 bind(zf_bad_zero); 686 stop("Fast Lock ZF != 1"); 687 bind(zf_correct); 688 #endif 689 // C2 uses the value of ZF to determine the continuation. 
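  //
  // Summary of the fast path above in pseudo code (the ObjectMonitorTable probing and
  // the assertion plumbing are omitted):
  //
  //   mark = obj->mark();
  //   if (mark is a monitor)                  goto inflated;
  //   if (lock-stack is full)                 goto slow_path;   // ZF == 0
  //   if (lock-stack top == obj)              goto push;        // recursive
  //   if (!CAS(mark: unlocked -> locked))     goto slow_path;
  //  push:
  //   push obj on the lock-stack;             goto locked;      // ZF == 1
  //  inflated:
  //   CAS owner null -> our owner_id, or bump recursions if we already own it,
  //   otherwise goto slow_path.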
690 } 691 692 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 693 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 694 assert(reg_rax == rax, "Used for CAS"); 695 assert_different_registers(obj, reg_rax, t); 696 697 // Handle inflated monitor. 698 Label inflated, inflated_check_lock_stack; 699 // Finish fast unlock successfully. MUST jump with ZF == 1 700 Label unlocked, slow_path; 701 702 const Register mark = t; 703 const Register monitor = t; 704 const Register top = UseObjectMonitorTable ? t : reg_rax; 705 const Register box = reg_rax; 706 707 Label dummy; 708 C2FastUnlockLightweightStub* stub = nullptr; 709 710 if (!Compile::current()->output()->in_scratch_emit_size()) { 711 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 712 Compile::current()->output()->add_stub(stub); 713 } 714 715 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 716 717 { // Lightweight Unlock 718 719 // Load top. 720 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 721 722 if (!UseObjectMonitorTable) { 723 // Prefetch mark. 724 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 725 } 726 727 // Check if obj is top of lock-stack. 728 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 729 // Top of lock stack was not obj. Must be monitor. 730 jcc(Assembler::notEqual, inflated_check_lock_stack); 731 732 // Pop lock-stack. 733 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 734 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 735 736 // Check if recursive. 737 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 738 jcc(Assembler::equal, unlocked); 739 740 // We elide the monitor check, let the CAS fail instead. 741 742 if (UseObjectMonitorTable) { 743 // Load mark. 744 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 745 } 746 747 // Try to unlock. Transition lock bits 0b00 => 0b01 748 movptr(reg_rax, mark); 749 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 750 orptr(mark, markWord::unlocked_value); 751 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 752 jcc(Assembler::notEqual, push_and_slow_path); 753 jmp(unlocked); 754 } 755 756 757 { // Handle inflated monitor. 758 bind(inflated_check_lock_stack); 759 #ifdef ASSERT 760 Label check_done; 761 subl(top, oopSize); 762 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 763 jcc(Assembler::below, check_done); 764 cmpptr(obj, Address(thread, top)); 765 jccb(Assembler::notEqual, inflated_check_lock_stack); 766 stop("Fast Unlock lock on stack"); 767 bind(check_done); 768 if (UseObjectMonitorTable) { 769 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 770 } 771 testptr(mark, markWord::monitor_value); 772 jccb(Assembler::notZero, inflated); 773 stop("Fast Unlock not monitor"); 774 #endif 775 776 bind(inflated); 777 778 #ifndef _LP64 779 // Just take slow path to avoid dealing with 64 bit atomic instructions here. 780 orl(t, 1); // set ICC.ZF=0 to indicate failure 781 jmpb(slow_path); 782 #else 783 if (!UseObjectMonitorTable) { 784 assert(mark == monitor, "should be the same here"); 785 } else { 786 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 
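      // fast_lock_lightweight cached the ObjectMonitor* in the BasicLock when the lock
      // was acquired (see above); anything below alignof(ObjectMonitor*) is not a valid
      // monitor pointer and sends us to the slow path.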
787 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 788 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 789 cmpptr(monitor, alignof(ObjectMonitor*)); 790 jcc(Assembler::below, slow_path); 791 } 792 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 793 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 794 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 795 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 796 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 797 798 Label recursive; 799 800 // Check if recursive. 801 cmpptr(recursions_address, 0); 802 jccb(Assembler::notZero, recursive); 803 804 // Set owner to null. 805 // Release to satisfy the JMM 806 movptr(owner_address, NULL_WORD); 807 // We need a full fence after clearing owner to avoid stranding. 808 // StoreLoad achieves this. 809 membar(StoreLoad); 810 811 // Check if the entry_list is empty. 812 cmpptr(entry_list_address, NULL_WORD); 813 jccb(Assembler::zero, unlocked); // If so we are done. 814 815 // Check if there is a successor. 816 cmpptr(succ_address, NULL_WORD); 817 jccb(Assembler::notZero, unlocked); // If so we are done. 818 819 // Save the monitor pointer in the current thread, so we can try to 820 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 821 if (!UseObjectMonitorTable) { 822 andptr(monitor, ~(int32_t)markWord::monitor_value); 823 } 824 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 825 826 orl(t, 1); // Fast Unlock ZF = 0 827 jmpb(slow_path); 828 829 // Recursive unlock. 830 bind(recursive); 831 decrement(recursions_address); 832 #endif // _LP64 833 } 834 835 bind(unlocked); 836 xorl(t, t); // Fast Unlock ZF = 1 837 838 #ifdef ASSERT 839 // Check that unlocked label is reached with ZF set. 840 Label zf_correct; 841 Label zf_bad_zero; 842 jcc(Assembler::zero, zf_correct); 843 jmp(zf_bad_zero); 844 #endif 845 846 bind(slow_path); 847 if (stub != nullptr) { 848 bind(stub->slow_path_continuation()); 849 } 850 #ifdef ASSERT 851 // Check that stub->continuation() label is reached with ZF not set. 852 jcc(Assembler::notZero, zf_correct); 853 stop("Fast Unlock ZF != 0"); 854 bind(zf_bad_zero); 855 stop("Fast Unlock ZF != 1"); 856 bind(zf_correct); 857 #endif 858 // C2 uses the value of ZF to determine the continuation. 
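  //
  // Summary of the fast path above in pseudo code (the ObjectMonitorTable lookup and
  // the assertion plumbing are omitted):
  //
  //   if (lock-stack top != obj)              goto inflated;
  //   pop the lock-stack;
  //   if (new lock-stack top == obj)          goto unlocked;    // recursive, ZF == 1
  //   if (!CAS(mark: locked -> unlocked))     goto push_and_slow_path;
  //   goto unlocked;
  //  inflated:
  //   if (recursions != 0) { recursions--;    goto unlocked; }
  //   owner = null; StoreLoad fence;
  //   if (entry_list != null && succ == null) goto slow_path;   // ZF == 0, reacquire in the runtime
  //   goto unlocked;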
859 } 860 861 //------------------------------------------------------------------------------------------- 862 // Generic instructions support for use in .ad files C2 code generation 863 864 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 865 if (dst != src) { 866 movdqu(dst, src); 867 } 868 if (opcode == Op_AbsVD) { 869 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 870 } else { 871 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 872 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 873 } 874 } 875 876 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 877 if (opcode == Op_AbsVD) { 878 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 879 } else { 880 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 881 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 882 } 883 } 884 885 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 886 if (dst != src) { 887 movdqu(dst, src); 888 } 889 if (opcode == Op_AbsVF) { 890 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 891 } else { 892 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 893 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 894 } 895 } 896 897 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 898 if (opcode == Op_AbsVF) { 899 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 900 } else { 901 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 902 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 903 } 904 } 905 906 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 907 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 908 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 909 910 if (opcode == Op_MinV) { 911 if (elem_bt == T_BYTE) { 912 pminsb(dst, src); 913 } else if (elem_bt == T_SHORT) { 914 pminsw(dst, src); 915 } else if (elem_bt == T_INT) { 916 pminsd(dst, src); 917 } else { 918 assert(elem_bt == T_LONG, "required"); 919 assert(tmp == xmm0, "required"); 920 assert_different_registers(dst, src, tmp); 921 movdqu(xmm0, dst); 922 pcmpgtq(xmm0, src); 923 blendvpd(dst, src); // xmm0 as mask 924 } 925 } else { // opcode == Op_MaxV 926 if (elem_bt == T_BYTE) { 927 pmaxsb(dst, src); 928 } else if (elem_bt == T_SHORT) { 929 pmaxsw(dst, src); 930 } else if (elem_bt == T_INT) { 931 pmaxsd(dst, src); 932 } else { 933 assert(elem_bt == T_LONG, "required"); 934 assert(tmp == xmm0, "required"); 935 assert_different_registers(dst, src, tmp); 936 movdqu(xmm0, src); 937 pcmpgtq(xmm0, dst); 938 blendvpd(dst, src); // xmm0 as mask 939 } 940 } 941 } 942 943 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 944 XMMRegister src1, Address src2, int vlen_enc) { 945 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 946 if (opcode == Op_UMinV) { 947 switch(elem_bt) { 948 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 949 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 950 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 951 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 952 default: 
fatal("Unsupported type %s", type2name(elem_bt)); break; 953 } 954 } else { 955 assert(opcode == Op_UMaxV, "required"); 956 switch(elem_bt) { 957 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 958 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 959 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 960 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 961 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 962 } 963 } 964 } 965 966 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 967 // For optimality, leverage a full vector width of 512 bits 968 // for operations over smaller vector sizes on AVX512 targets. 969 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 970 if (opcode == Op_UMaxV) { 971 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 972 } else { 973 assert(opcode == Op_UMinV, "required"); 974 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 975 } 976 } else { 977 // T1 = -1 978 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 979 // T1 = -1 << 63 980 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 981 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 982 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 983 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 984 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 985 // Mask = T2 > T1 986 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 987 if (opcode == Op_UMaxV) { 988 // Res = Mask ? Src2 : Src1 989 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 990 } else { 991 // Res = Mask ? Src1 : Src2 992 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 993 } 994 } 995 } 996 997 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 998 XMMRegister src1, XMMRegister src2, int vlen_enc) { 999 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 1000 if (opcode == Op_UMinV) { 1001 switch(elem_bt) { 1002 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 1003 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 1004 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 1005 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 1006 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1007 } 1008 } else { 1009 assert(opcode == Op_UMaxV, "required"); 1010 switch(elem_bt) { 1011 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 1012 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 1013 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 1014 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 1015 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1016 } 1017 } 1018 } 1019 1020 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1021 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1022 int vlen_enc) { 1023 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1024 1025 if (opcode == Op_MinV) { 1026 if (elem_bt == T_BYTE) { 1027 vpminsb(dst, src1, src2, vlen_enc); 1028 } else if (elem_bt == T_SHORT) { 1029 vpminsw(dst, src1, src2, vlen_enc); 1030 } else if (elem_bt == T_INT) { 1031 vpminsd(dst, src1, src2, vlen_enc); 1032 } else { 1033 assert(elem_bt == T_LONG, "required"); 1034 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1035 vpminsq(dst, src1, src2, vlen_enc); 1036 } else { 1037 assert_different_registers(dst, src1, src2); 1038 vpcmpgtq(dst, src1, src2, vlen_enc); 1039 vblendvpd(dst, src1, src2, dst, vlen_enc); 1040 
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ?
atmp : Tmp 1095 */ 1096 1097 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1098 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1099 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1100 XMMRegister mask; 1101 1102 if (!is_double_word && is_min) { 1103 mask = a; 1104 vblend = &MacroAssembler::vblendvps; 1105 vmaxmin = &MacroAssembler::vminps; 1106 vcmp = &MacroAssembler::vcmpps; 1107 } else if (!is_double_word && !is_min) { 1108 mask = b; 1109 vblend = &MacroAssembler::vblendvps; 1110 vmaxmin = &MacroAssembler::vmaxps; 1111 vcmp = &MacroAssembler::vcmpps; 1112 } else if (is_double_word && is_min) { 1113 mask = a; 1114 vblend = &MacroAssembler::vblendvpd; 1115 vmaxmin = &MacroAssembler::vminpd; 1116 vcmp = &MacroAssembler::vcmppd; 1117 } else { 1118 assert(is_double_word && !is_min, "sanity"); 1119 mask = b; 1120 vblend = &MacroAssembler::vblendvpd; 1121 vmaxmin = &MacroAssembler::vmaxpd; 1122 vcmp = &MacroAssembler::vcmppd; 1123 } 1124 1125 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1126 XMMRegister maxmin, scratch; 1127 if (dst == btmp) { 1128 maxmin = btmp; 1129 scratch = tmp; 1130 } else { 1131 maxmin = tmp; 1132 scratch = btmp; 1133 } 1134 1135 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1136 if (precompute_mask && !is_double_word) { 1137 vpsrad(tmp, mask, 32, vlen_enc); 1138 mask = tmp; 1139 } else if (precompute_mask && is_double_word) { 1140 vpxor(tmp, tmp, tmp, vlen_enc); 1141 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1142 mask = tmp; 1143 } 1144 1145 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1146 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1147 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1148 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1149 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1150 } 1151 1152 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1153 XMMRegister dst, XMMRegister a, XMMRegister b, 1154 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1155 int vlen_enc) { 1156 assert(UseAVX > 2, "required"); 1157 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1158 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1159 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1160 assert_different_registers(dst, a, atmp, btmp); 1161 assert_different_registers(dst, b, atmp, btmp); 1162 1163 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1164 bool is_double_word = is_double_word_type(elem_bt); 1165 bool merge = true; 1166 1167 if (!is_double_word && is_min) { 1168 evpmovd2m(ktmp, a, vlen_enc); 1169 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1170 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1171 vminps(dst, atmp, btmp, vlen_enc); 1172 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1173 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1174 } else if (!is_double_word && !is_min) { 1175 evpmovd2m(ktmp, b, vlen_enc); 1176 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1177 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1178 vmaxps(dst, atmp, btmp, vlen_enc); 1179 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1180 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1181 } else if (is_double_word && is_min) { 1182 evpmovq2m(ktmp, a, vlen_enc); 1183 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1184 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1185 
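    // Same scheme as vminmax_fp above: take the min of the bias-corrected operands,
    // then use an unordered compare on atmp to patch the lanes where the first
    // (post-blend) operand was NaN.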
vminpd(dst, atmp, btmp, vlen_enc); 1186 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1187 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1188 } else { 1189 assert(is_double_word && !is_min, "sanity"); 1190 evpmovq2m(ktmp, b, vlen_enc); 1191 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1192 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1193 vmaxpd(dst, atmp, btmp, vlen_enc); 1194 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1195 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1196 } 1197 } 1198 1199 // Float/Double signum 1200 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1201 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1202 1203 Label DONE_LABEL; 1204 1205 if (opcode == Op_SignumF) { 1206 assert(UseSSE > 0, "required"); 1207 ucomiss(dst, zero); 1208 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1209 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1210 movflt(dst, one); 1211 jcc(Assembler::above, DONE_LABEL); 1212 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1213 } else if (opcode == Op_SignumD) { 1214 assert(UseSSE > 1, "required"); 1215 ucomisd(dst, zero); 1216 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1217 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1218 movdbl(dst, one); 1219 jcc(Assembler::above, DONE_LABEL); 1220 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1221 } 1222 1223 bind(DONE_LABEL); 1224 } 1225 1226 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1227 if (sign) { 1228 pmovsxbw(dst, src); 1229 } else { 1230 pmovzxbw(dst, src); 1231 } 1232 } 1233 1234 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1235 if (sign) { 1236 vpmovsxbw(dst, src, vector_len); 1237 } else { 1238 vpmovzxbw(dst, src, vector_len); 1239 } 1240 } 1241 1242 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1243 if (sign) { 1244 vpmovsxbd(dst, src, vector_len); 1245 } else { 1246 vpmovzxbd(dst, src, vector_len); 1247 } 1248 } 1249 1250 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1251 if (sign) { 1252 vpmovsxwd(dst, src, vector_len); 1253 } else { 1254 vpmovzxwd(dst, src, vector_len); 1255 } 1256 } 1257 1258 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1259 int shift, int vector_len) { 1260 if (opcode == Op_RotateLeftV) { 1261 if (etype == T_INT) { 1262 evprold(dst, src, shift, vector_len); 1263 } else { 1264 assert(etype == T_LONG, "expected type T_LONG"); 1265 evprolq(dst, src, shift, vector_len); 1266 } 1267 } else { 1268 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1269 if (etype == T_INT) { 1270 evprord(dst, src, shift, vector_len); 1271 } else { 1272 assert(etype == T_LONG, "expected type T_LONG"); 1273 evprorq(dst, src, shift, vector_len); 1274 } 1275 } 1276 } 1277 1278 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1279 XMMRegister shift, int vector_len) { 1280 if (opcode == Op_RotateLeftV) { 1281 if (etype == T_INT) { 1282 evprolvd(dst, src, shift, vector_len); 1283 } else { 1284 assert(etype == 
T_LONG, "expected type T_LONG"); 1285 evprolvq(dst, src, shift, vector_len); 1286 } 1287 } else { 1288 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1289 if (etype == T_INT) { 1290 evprorvd(dst, src, shift, vector_len); 1291 } else { 1292 assert(etype == T_LONG, "expected type T_LONG"); 1293 evprorvq(dst, src, shift, vector_len); 1294 } 1295 } 1296 } 1297 1298 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1299 if (opcode == Op_RShiftVI) { 1300 psrad(dst, shift); 1301 } else if (opcode == Op_LShiftVI) { 1302 pslld(dst, shift); 1303 } else { 1304 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1305 psrld(dst, shift); 1306 } 1307 } 1308 1309 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1310 switch (opcode) { 1311 case Op_RShiftVI: psrad(dst, shift); break; 1312 case Op_LShiftVI: pslld(dst, shift); break; 1313 case Op_URShiftVI: psrld(dst, shift); break; 1314 1315 default: assert(false, "%s", NodeClassNames[opcode]); 1316 } 1317 } 1318 1319 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1320 if (opcode == Op_RShiftVI) { 1321 vpsrad(dst, nds, shift, vector_len); 1322 } else if (opcode == Op_LShiftVI) { 1323 vpslld(dst, nds, shift, vector_len); 1324 } else { 1325 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1326 vpsrld(dst, nds, shift, vector_len); 1327 } 1328 } 1329 1330 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1331 switch (opcode) { 1332 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1333 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1334 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1335 1336 default: assert(false, "%s", NodeClassNames[opcode]); 1337 } 1338 } 1339 1340 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1341 switch (opcode) { 1342 case Op_RShiftVB: // fall-through 1343 case Op_RShiftVS: psraw(dst, shift); break; 1344 1345 case Op_LShiftVB: // fall-through 1346 case Op_LShiftVS: psllw(dst, shift); break; 1347 1348 case Op_URShiftVS: // fall-through 1349 case Op_URShiftVB: psrlw(dst, shift); break; 1350 1351 default: assert(false, "%s", NodeClassNames[opcode]); 1352 } 1353 } 1354 1355 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1356 switch (opcode) { 1357 case Op_RShiftVB: // fall-through 1358 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1359 1360 case Op_LShiftVB: // fall-through 1361 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1362 1363 case Op_URShiftVS: // fall-through 1364 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1365 1366 default: assert(false, "%s", NodeClassNames[opcode]); 1367 } 1368 } 1369 1370 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1371 switch (opcode) { 1372 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1373 case Op_LShiftVL: psllq(dst, shift); break; 1374 case Op_URShiftVL: psrlq(dst, shift); break; 1375 1376 default: assert(false, "%s", NodeClassNames[opcode]); 1377 } 1378 } 1379 1380 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1381 if (opcode == Op_RShiftVL) { 1382 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1383 } else if (opcode == Op_LShiftVL) { 1384 
psllq(dst, shift); 1385 } else { 1386 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1387 psrlq(dst, shift); 1388 } 1389 } 1390 1391 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1392 switch (opcode) { 1393 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1394 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1395 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1396 1397 default: assert(false, "%s", NodeClassNames[opcode]); 1398 } 1399 } 1400 1401 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1402 if (opcode == Op_RShiftVL) { 1403 evpsraq(dst, nds, shift, vector_len); 1404 } else if (opcode == Op_LShiftVL) { 1405 vpsllq(dst, nds, shift, vector_len); 1406 } else { 1407 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1408 vpsrlq(dst, nds, shift, vector_len); 1409 } 1410 } 1411 1412 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1413 switch (opcode) { 1414 case Op_RShiftVB: // fall-through 1415 case Op_RShiftVS: // fall-through 1416 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1417 1418 case Op_LShiftVB: // fall-through 1419 case Op_LShiftVS: // fall-through 1420 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1421 1422 case Op_URShiftVB: // fall-through 1423 case Op_URShiftVS: // fall-through 1424 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1425 1426 default: assert(false, "%s", NodeClassNames[opcode]); 1427 } 1428 } 1429 1430 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1431 switch (opcode) { 1432 case Op_RShiftVB: // fall-through 1433 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1434 1435 case Op_LShiftVB: // fall-through 1436 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1437 1438 case Op_URShiftVB: // fall-through 1439 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1440 1441 default: assert(false, "%s", NodeClassNames[opcode]); 1442 } 1443 } 1444 1445 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1446 assert(UseAVX >= 2, "required"); 1447 switch (opcode) { 1448 case Op_RShiftVL: { 1449 if (UseAVX > 2) { 1450 assert(tmp == xnoreg, "not used"); 1451 if (!VM_Version::supports_avx512vl()) { 1452 vlen_enc = Assembler::AVX_512bit; 1453 } 1454 evpsravq(dst, src, shift, vlen_enc); 1455 } else { 1456 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1457 vpsrlvq(dst, src, shift, vlen_enc); 1458 vpsrlvq(tmp, tmp, shift, vlen_enc); 1459 vpxor(dst, dst, tmp, vlen_enc); 1460 vpsubq(dst, dst, tmp, vlen_enc); 1461 } 1462 break; 1463 } 1464 case Op_LShiftVL: { 1465 assert(tmp == xnoreg, "not used"); 1466 vpsllvq(dst, src, shift, vlen_enc); 1467 break; 1468 } 1469 case Op_URShiftVL: { 1470 assert(tmp == xnoreg, "not used"); 1471 vpsrlvq(dst, src, shift, vlen_enc); 1472 break; 1473 } 1474 default: assert(false, "%s", NodeClassNames[opcode]); 1475 } 1476 } 1477 1478 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1479 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1480 assert(opcode == Op_LShiftVB || 1481 opcode == Op_RShiftVB || 1482 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1483 bool sign = (opcode != Op_URShiftVB); 1484 assert(vector_len == 0, "required"); 1485 vextendbd(sign, dst, src, 1); 1486 vpmovzxbd(vtmp, shift, 1); 1487 varshiftd(opcode, dst, dst, vtmp, 1); 1488 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1489 vextracti128_high(vtmp, dst); 1490 vpackusdw(dst, dst, vtmp, 0); 1491 } 1492 1493 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1494 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1495 assert(opcode == Op_LShiftVB || 1496 opcode == Op_RShiftVB || 1497 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1498 bool sign = (opcode != Op_URShiftVB); 1499 int ext_vector_len = vector_len + 1; 1500 vextendbw(sign, dst, src, ext_vector_len); 1501 vpmovzxbw(vtmp, shift, ext_vector_len); 1502 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1503 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1504 if (vector_len == 0) { 1505 vextracti128_high(vtmp, dst); 1506 vpackuswb(dst, dst, vtmp, vector_len); 1507 } else { 1508 vextracti64x4_high(vtmp, dst); 1509 vpackuswb(dst, dst, vtmp, vector_len); 1510 vpermq(dst, dst, 0xD8, vector_len); 1511 } 1512 } 1513 1514 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1515 switch(typ) { 1516 case T_BYTE: 1517 pinsrb(dst, val, idx); 1518 break; 1519 case T_SHORT: 1520 pinsrw(dst, val, idx); 1521 break; 1522 case T_INT: 1523 pinsrd(dst, val, idx); 1524 break; 1525 case T_LONG: 1526 pinsrq(dst, val, idx); 1527 break; 1528 default: 1529 assert(false,"Should not reach here."); 1530 break; 1531 } 1532 } 1533 1534 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1535 switch(typ) { 1536 case T_BYTE: 1537 vpinsrb(dst, src, val, idx); 1538 break; 1539 case T_SHORT: 1540 vpinsrw(dst, src, val, idx); 1541 break; 1542 case T_INT: 1543 vpinsrd(dst, src, val, idx); 1544 break; 1545 case T_LONG: 1546 vpinsrq(dst, src, val, idx); 1547 break; 1548 default: 1549 assert(false,"Should not reach here."); 1550 break; 1551 } 1552 } 1553 1554 #ifdef _LP64 1555 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1556 XMMRegister dst, Register base, 1557 Register idx_base, 1558 Register offset, Register mask, 1559 Register mask_idx, Register rtmp, 1560 int vlen_enc) { 1561 vpxor(dst, dst, dst, vlen_enc); 1562 if (elem_bt == T_SHORT) { 1563 for (int i = 0; i < 4; i++) { 1564 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1565 Label skip_load; 1566 btq(mask, mask_idx); 1567 jccb(Assembler::carryClear, skip_load); 1568 movl(rtmp, Address(idx_base, i * 4)); 1569 if (offset != noreg) { 1570 addl(rtmp, offset); 1571 } 1572 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1573 bind(skip_load); 1574 incq(mask_idx); 1575 } 1576 } else { 1577 assert(elem_bt == T_BYTE, ""); 1578 for (int i = 0; i < 8; i++) { 1579 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1580 Label skip_load; 1581 btq(mask, mask_idx); 1582 jccb(Assembler::carryClear, skip_load); 1583 movl(rtmp, Address(idx_base, i * 4)); 1584 if (offset != noreg) { 1585 addl(rtmp, offset); 1586 } 1587 pinsrb(dst, Address(base, rtmp), i); 1588 bind(skip_load); 1589 incq(mask_idx); 1590 } 1591 } 1592 } 1593 #endif // _LP64 1594 1595 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1596 Register base, Register idx_base, 1597 Register offset, Register rtmp, 1598 int vlen_enc) { 1599 vpxor(dst, dst, dst, vlen_enc); 1600 if (elem_bt == T_SHORT) { 1601 for (int i = 0; i < 4; i++) { 1602 // dst[i] = src[offset + idx_base[i]] 1603 movl(rtmp, Address(idx_base, i * 4)); 1604 if (offset != noreg) { 1605 addl(rtmp, offset); 1606 } 1607 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1608 } 1609 } else { 1610 assert(elem_bt == T_BYTE, ""); 1611 for (int i = 0; i < 8; i++) { 1612 // dst[i] = src[offset + idx_base[i]] 1613 movl(rtmp, Address(idx_base, i * 4)); 1614 if (offset != noreg) { 1615 addl(rtmp, offset); 1616 } 1617 pinsrb(dst, Address(base, rtmp), i); 1618 } 1619 } 1620 } 1621 1622 /* 1623 * Gather using a hybrid algorithm: first partially unroll the scalar loop 1624 * to accumulate values from the gather indices into a quad-word (64-bit) slice. 1625 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1626 * permutation to place the slice into the appropriate vector lane 1627 * locations in the destination vector. The following pseudo code describes the 1628 * algorithm in detail: 1629 * 1630 * DST_VEC = ZERO_VEC 1631 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1632 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1633 * FOREACH_ITER: 1634 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1635 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1636 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1637 * PERM_INDEX = PERM_INDEX - TWO_VEC 1638 * 1639 * With each iteration, the doubleword permute indices (0,1) corresponding 1640 * to the gathered quadword get right shifted by two lane positions. 1641 * 1642 */ 1643 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1644 Register base, Register idx_base, 1645 Register offset, Register mask, 1646 XMMRegister xtmp1, XMMRegister xtmp2, 1647 XMMRegister temp_dst, Register rtmp, 1648 Register mask_idx, Register length, 1649 int vector_len, int vlen_enc) { 1650 Label GATHER8_LOOP; 1651 assert(is_subword_type(elem_ty), ""); 1652 movl(length, vector_len); 1653 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1654 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1655 vallones(xtmp2, vlen_enc); 1656 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1657 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1658 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1659 1660 bind(GATHER8_LOOP); 1661 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1662 if (mask == noreg) { 1663 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1664 } else { 1665 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1666 } 1667 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1668 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ?
vlen_enc : Assembler::AVX_256bit); 1669 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1670 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1671 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1672 vpor(dst, dst, temp_dst, vlen_enc); 1673 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1674 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1675 jcc(Assembler::notEqual, GATHER8_LOOP); 1676 } 1677 1678 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1679 switch(typ) { 1680 case T_INT: 1681 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1682 break; 1683 case T_FLOAT: 1684 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1685 break; 1686 case T_LONG: 1687 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1688 break; 1689 case T_DOUBLE: 1690 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1691 break; 1692 default: 1693 assert(false,"Should not reach here."); 1694 break; 1695 } 1696 } 1697 1698 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1699 switch(typ) { 1700 case T_INT: 1701 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1702 break; 1703 case T_FLOAT: 1704 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1705 break; 1706 case T_LONG: 1707 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1708 break; 1709 case T_DOUBLE: 1710 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1711 break; 1712 default: 1713 assert(false,"Should not reach here."); 1714 break; 1715 } 1716 } 1717 1718 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1719 switch(typ) { 1720 case T_INT: 1721 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1722 break; 1723 case T_FLOAT: 1724 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1725 break; 1726 case T_LONG: 1727 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1728 break; 1729 case T_DOUBLE: 1730 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1731 break; 1732 default: 1733 assert(false,"Should not reach here."); 1734 break; 1735 } 1736 } 1737 1738 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1739 if (vlen_in_bytes <= 16) { 1740 pxor (dst, dst); 1741 psubb(dst, src); 1742 switch (elem_bt) { 1743 case T_BYTE: /* nothing to do */ break; 1744 case T_SHORT: pmovsxbw(dst, dst); break; 1745 case T_INT: pmovsxbd(dst, dst); break; 1746 case T_FLOAT: pmovsxbd(dst, dst); break; 1747 case T_LONG: pmovsxbq(dst, dst); break; 1748 case T_DOUBLE: pmovsxbq(dst, dst); break; 1749 1750 default: assert(false, "%s", type2name(elem_bt)); 1751 } 1752 } else { 1753 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1754 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1755 1756 vpxor (dst, dst, dst, vlen_enc); 1757 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1758 1759 switch (elem_bt) { 1760 case T_BYTE: /* nothing to do */ break; 1761 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1762 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1763 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1764 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1765 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1766 1767 default: assert(false, "%s", type2name(elem_bt)); 1768 } 1769 } 1770 } 1771 1772 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1773 if (novlbwdq) { 1774 vpmovsxbd(xtmp, src, vlen_enc); 1775 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1776 Assembler::eq, true, vlen_enc, noreg); 1777 } else { 1778 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1779 vpsubb(xtmp, xtmp, src, vlen_enc); 1780 evpmovb2m(dst, xtmp, vlen_enc); 1781 } 1782 } 1783 1784 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1785 if (is_integral_type(bt)) { 1786 switch (vlen_in_bytes) { 1787 case 4: movdl(dst, src); break; 1788 case 8: movq(dst, src); break; 1789 case 16: movdqu(dst, src); break; 1790 case 32: vmovdqu(dst, src); break; 1791 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1792 default: ShouldNotReachHere(); 1793 } 1794 } else { 1795 switch (vlen_in_bytes) { 1796 case 4: movflt(dst, src); break; 1797 case 8: movdbl(dst, src); break; 1798 case 16: movups(dst, src); break; 1799 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1800 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1801 default: ShouldNotReachHere(); 1802 } 1803 } 1804 } 1805 1806 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1807 assert(rscratch != noreg || always_reachable(src), "missing"); 1808 1809 if (reachable(src)) { 1810 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1811 } else { 1812 lea(rscratch, src); 1813 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1814 } 1815 } 1816 1817 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1818 int vlen_enc = vector_length_encoding(vlen); 1819 if (VM_Version::supports_avx()) { 1820 if (bt == T_LONG) { 1821 if (VM_Version::supports_avx2()) { 1822 vpbroadcastq(dst, src, vlen_enc); 1823 } else { 1824 vmovddup(dst, src, vlen_enc); 1825 } 1826 } else if (bt == T_DOUBLE) { 1827 if (vlen_enc != Assembler::AVX_128bit) { 1828 vbroadcastsd(dst, src, vlen_enc, noreg); 1829 } else { 1830 vmovddup(dst, src, vlen_enc); 1831 } 1832 } else { 1833 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1834 vpbroadcastd(dst, src, vlen_enc); 1835 } else { 1836 vbroadcastss(dst, src, vlen_enc); 1837 } 1838 } 1839 } else if (VM_Version::supports_sse3()) { 1840 movddup(dst, src); 1841 } else { 1842 load_vector(bt, dst, src, vlen); 1843 } 1844 } 1845 1846 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1847 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1848 int offset = exact_log2(type2aelembytes(bt)) << 6; 1849 if (is_floating_point_type(bt)) { 1850 offset += 128; 1851 } 1852 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1853 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1854 } 1855 1856 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
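// Reading aid (informal sketch, not authoritative): the reduce*/unorderedReduce* helpers below
// all follow the same folding scheme -- repeatedly extract the upper half of the vector
// (vextracti64x4_high / vextracti128_high / pshufd) and combine it with the lower half using
// the element-wise operation, until a single lane remains; that lane is then combined with the
// scalar accumulator (src1 for the integer variants, dst for the ordered floating-point
// variants). Integer add reductions use phaddd/phaddw for the pairwise sums. For example, an
// 8-lane int reduction folds 8 -> 4 -> 2 -> 1 lanes and then folds in src1.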
1857 1858 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1859 int vector_len = Assembler::AVX_128bit; 1860 1861 switch (opcode) { 1862 case Op_AndReductionV: pand(dst, src); break; 1863 case Op_OrReductionV: por (dst, src); break; 1864 case Op_XorReductionV: pxor(dst, src); break; 1865 case Op_MinReductionV: 1866 switch (typ) { 1867 case T_BYTE: pminsb(dst, src); break; 1868 case T_SHORT: pminsw(dst, src); break; 1869 case T_INT: pminsd(dst, src); break; 1870 case T_LONG: assert(UseAVX > 2, "required"); 1871 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1872 default: assert(false, "wrong type"); 1873 } 1874 break; 1875 case Op_MaxReductionV: 1876 switch (typ) { 1877 case T_BYTE: pmaxsb(dst, src); break; 1878 case T_SHORT: pmaxsw(dst, src); break; 1879 case T_INT: pmaxsd(dst, src); break; 1880 case T_LONG: assert(UseAVX > 2, "required"); 1881 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1882 default: assert(false, "wrong type"); 1883 } 1884 break; 1885 case Op_AddReductionVF: addss(dst, src); break; 1886 case Op_AddReductionVD: addsd(dst, src); break; 1887 case Op_AddReductionVI: 1888 switch (typ) { 1889 case T_BYTE: paddb(dst, src); break; 1890 case T_SHORT: paddw(dst, src); break; 1891 case T_INT: paddd(dst, src); break; 1892 default: assert(false, "wrong type"); 1893 } 1894 break; 1895 case Op_AddReductionVL: paddq(dst, src); break; 1896 case Op_MulReductionVF: mulss(dst, src); break; 1897 case Op_MulReductionVD: mulsd(dst, src); break; 1898 case Op_MulReductionVI: 1899 switch (typ) { 1900 case T_SHORT: pmullw(dst, src); break; 1901 case T_INT: pmulld(dst, src); break; 1902 default: assert(false, "wrong type"); 1903 } 1904 break; 1905 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1906 evpmullq(dst, dst, src, vector_len); break; 1907 default: assert(false, "wrong opcode"); 1908 } 1909 } 1910 1911 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1912 switch (opcode) { 1913 case Op_AddReductionVF: addps(dst, src); break; 1914 case Op_AddReductionVD: addpd(dst, src); break; 1915 case Op_MulReductionVF: mulps(dst, src); break; 1916 case Op_MulReductionVD: mulpd(dst, src); break; 1917 default: assert(false, "%s", NodeClassNames[opcode]); 1918 } 1919 } 1920 1921 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1922 int vector_len = Assembler::AVX_256bit; 1923 1924 switch (opcode) { 1925 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1926 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1927 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1928 case Op_MinReductionV: 1929 switch (typ) { 1930 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1931 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1932 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1933 case T_LONG: assert(UseAVX > 2, "required"); 1934 vpminsq(dst, src1, src2, vector_len); break; 1935 default: assert(false, "wrong type"); 1936 } 1937 break; 1938 case Op_MaxReductionV: 1939 switch (typ) { 1940 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1941 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1942 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1943 case T_LONG: assert(UseAVX > 2, "required"); 1944 vpmaxsq(dst, src1, src2, vector_len); break; 1945 default: assert(false, "wrong type"); 1946 } 
1947 break; 1948 case Op_AddReductionVI: 1949 switch (typ) { 1950 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1951 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1952 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1953 default: assert(false, "wrong type"); 1954 } 1955 break; 1956 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1957 case Op_MulReductionVI: 1958 switch (typ) { 1959 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1960 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1961 default: assert(false, "wrong type"); 1962 } 1963 break; 1964 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1965 default: assert(false, "wrong opcode"); 1966 } 1967 } 1968 1969 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1970 int vector_len = Assembler::AVX_256bit; 1971 1972 switch (opcode) { 1973 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1974 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1975 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1976 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1977 default: assert(false, "%s", NodeClassNames[opcode]); 1978 } 1979 } 1980 1981 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1982 XMMRegister dst, XMMRegister src, 1983 XMMRegister vtmp1, XMMRegister vtmp2) { 1984 switch (opcode) { 1985 case Op_AddReductionVF: 1986 case Op_MulReductionVF: 1987 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1988 break; 1989 1990 case Op_AddReductionVD: 1991 case Op_MulReductionVD: 1992 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1993 break; 1994 1995 default: assert(false, "wrong opcode"); 1996 } 1997 } 1998 1999 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2000 XMMRegister dst, XMMRegister src, 2001 XMMRegister vtmp1, XMMRegister vtmp2) { 2002 switch (opcode) { 2003 case Op_AddReductionVF: 2004 case Op_MulReductionVF: 2005 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2006 break; 2007 2008 case Op_AddReductionVD: 2009 case Op_MulReductionVD: 2010 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2011 break; 2012 2013 default: assert(false, "%s", NodeClassNames[opcode]); 2014 } 2015 } 2016 2017 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2018 Register dst, Register src1, XMMRegister src2, 2019 XMMRegister vtmp1, XMMRegister vtmp2) { 2020 switch (vlen) { 2021 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2022 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2023 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2024 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2025 2026 default: assert(false, "wrong vector length"); 2027 } 2028 } 2029 2030 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2031 Register dst, Register src1, XMMRegister src2, 2032 XMMRegister vtmp1, XMMRegister vtmp2) { 2033 switch (vlen) { 2034 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2035 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2036 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2037 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2038 2039 default: assert(false, "wrong vector length"); 2040 } 2041 } 2042 2043 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2044 Register dst, Register src1, XMMRegister src2, 
2045 XMMRegister vtmp1, XMMRegister vtmp2) { 2046 switch (vlen) { 2047 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2048 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2049 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2050 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2051 2052 default: assert(false, "wrong vector length"); 2053 } 2054 } 2055 2056 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2057 Register dst, Register src1, XMMRegister src2, 2058 XMMRegister vtmp1, XMMRegister vtmp2) { 2059 switch (vlen) { 2060 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2061 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2062 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2063 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2064 2065 default: assert(false, "wrong vector length"); 2066 } 2067 } 2068 2069 #ifdef _LP64 2070 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2071 Register dst, Register src1, XMMRegister src2, 2072 XMMRegister vtmp1, XMMRegister vtmp2) { 2073 switch (vlen) { 2074 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2075 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2076 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2077 2078 default: assert(false, "wrong vector length"); 2079 } 2080 } 2081 #endif // _LP64 2082 2083 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2084 switch (vlen) { 2085 case 2: 2086 assert(vtmp2 == xnoreg, ""); 2087 reduce2F(opcode, dst, src, vtmp1); 2088 break; 2089 case 4: 2090 assert(vtmp2 == xnoreg, ""); 2091 reduce4F(opcode, dst, src, vtmp1); 2092 break; 2093 case 8: 2094 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2095 break; 2096 case 16: 2097 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2098 break; 2099 default: assert(false, "wrong vector length"); 2100 } 2101 } 2102 2103 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2104 switch (vlen) { 2105 case 2: 2106 assert(vtmp2 == xnoreg, ""); 2107 reduce2D(opcode, dst, src, vtmp1); 2108 break; 2109 case 4: 2110 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2111 break; 2112 case 8: 2113 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2114 break; 2115 default: assert(false, "wrong vector length"); 2116 } 2117 } 2118 2119 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2120 switch (vlen) { 2121 case 2: 2122 assert(vtmp1 == xnoreg, ""); 2123 assert(vtmp2 == xnoreg, ""); 2124 unorderedReduce2F(opcode, dst, src); 2125 break; 2126 case 4: 2127 assert(vtmp2 == xnoreg, ""); 2128 unorderedReduce4F(opcode, dst, src, vtmp1); 2129 break; 2130 case 8: 2131 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2132 break; 2133 case 16: 2134 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2135 break; 2136 default: assert(false, "wrong vector length"); 2137 } 2138 } 2139 2140 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2141 switch (vlen) { 2142 case 2: 2143 assert(vtmp1 == xnoreg, ""); 2144 assert(vtmp2 == xnoreg, ""); 2145 unorderedReduce2D(opcode, dst, src); 2146 break; 2147 case 4: 2148 assert(vtmp2 == xnoreg, ""); 2149 unorderedReduce4D(opcode, dst, src, vtmp1); 2150 break; 2151 case 8: 
2152 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2153 break; 2154 default: assert(false, "wrong vector length"); 2155 } 2156 } 2157 2158 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2159 if (opcode == Op_AddReductionVI) { 2160 if (vtmp1 != src2) { 2161 movdqu(vtmp1, src2); 2162 } 2163 phaddd(vtmp1, vtmp1); 2164 } else { 2165 pshufd(vtmp1, src2, 0x1); 2166 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2167 } 2168 movdl(vtmp2, src1); 2169 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2170 movdl(dst, vtmp1); 2171 } 2172 2173 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2174 if (opcode == Op_AddReductionVI) { 2175 if (vtmp1 != src2) { 2176 movdqu(vtmp1, src2); 2177 } 2178 phaddd(vtmp1, src2); 2179 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2180 } else { 2181 pshufd(vtmp2, src2, 0xE); 2182 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2183 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2184 } 2185 } 2186 2187 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2188 if (opcode == Op_AddReductionVI) { 2189 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2190 vextracti128_high(vtmp2, vtmp1); 2191 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2192 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2193 } else { 2194 vextracti128_high(vtmp1, src2); 2195 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2196 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2197 } 2198 } 2199 2200 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2201 vextracti64x4_high(vtmp2, src2); 2202 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2203 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2204 } 2205 2206 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2207 pshufd(vtmp2, src2, 0x1); 2208 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2209 movdqu(vtmp1, vtmp2); 2210 psrldq(vtmp1, 2); 2211 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2212 movdqu(vtmp2, vtmp1); 2213 psrldq(vtmp2, 1); 2214 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2215 movdl(vtmp2, src1); 2216 pmovsxbd(vtmp1, vtmp1); 2217 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2218 pextrb(dst, vtmp1, 0x0); 2219 movsbl(dst, dst); 2220 } 2221 2222 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2223 pshufd(vtmp1, src2, 0xE); 2224 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2225 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2226 } 2227 2228 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2229 vextracti128_high(vtmp2, src2); 2230 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2231 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2232 } 2233 2234 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2235 vextracti64x4_high(vtmp1, src2); 2236 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2237 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2238 } 2239 2240 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2241 pmovsxbw(vtmp2, src2); 2242 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2243 } 2244 2245 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2246 if (UseAVX > 1) { 2247 int vector_len = Assembler::AVX_256bit; 2248 vpmovsxbw(vtmp1, src2, vector_len); 2249 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2250 } else { 2251 pmovsxbw(vtmp2, src2); 2252 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2253 pshufd(vtmp2, src2, 0x1); 2254 pmovsxbw(vtmp2, src2); 2255 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2256 } 2257 } 2258 2259 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2260 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2261 int vector_len = Assembler::AVX_512bit; 2262 vpmovsxbw(vtmp1, src2, vector_len); 2263 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2264 } else { 2265 assert(UseAVX >= 2,"Should not reach here."); 2266 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2267 vextracti128_high(vtmp2, src2); 2268 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2269 } 2270 } 2271 2272 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2273 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2274 vextracti64x4_high(vtmp2, src2); 2275 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2276 } 2277 2278 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2279 if (opcode == Op_AddReductionVI) { 2280 if (vtmp1 != src2) { 2281 movdqu(vtmp1, src2); 2282 } 2283 phaddw(vtmp1, vtmp1); 2284 phaddw(vtmp1, vtmp1); 2285 } else { 2286 pshufd(vtmp2, src2, 0x1); 2287 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2288 movdqu(vtmp1, vtmp2); 2289 psrldq(vtmp1, 2); 2290 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2291 } 2292 movdl(vtmp2, src1); 2293 pmovsxwd(vtmp1, vtmp1); 2294 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2295 pextrw(dst, vtmp1, 0x0); 2296 movswl(dst, dst); 2297 } 2298 2299 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2300 if (opcode == Op_AddReductionVI) { 2301 if (vtmp1 != src2) { 2302 movdqu(vtmp1, src2); 2303 } 2304 phaddw(vtmp1, src2); 2305 } else { 2306 pshufd(vtmp1, src2, 0xE); 2307 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2308 } 2309 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2310 } 2311 2312 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2313 if (opcode == Op_AddReductionVI) { 2314 int vector_len = Assembler::AVX_256bit; 2315 vphaddw(vtmp2, src2, src2, vector_len); 2316 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2317 } else { 2318 vextracti128_high(vtmp2, src2); 2319 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2320 } 2321 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2322 } 2323 2324 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2325 int vector_len = Assembler::AVX_256bit; 2326 vextracti64x4_high(vtmp1, src2); 2327 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2328 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2329 } 2330 2331 #ifdef _LP64 2332 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2333 pshufd(vtmp2, src2, 0xE); 2334 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2335 movdq(vtmp1, src1); 2336 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2337 movdq(dst, vtmp1); 2338 } 2339 2340 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2341 vextracti128_high(vtmp1, src2); 2342 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2343 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2344 } 2345 2346 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2347 vextracti64x4_high(vtmp2, src2); 2348 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2349 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2350 } 2351 2352 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2353 mov64(temp, -1L); 2354 bzhiq(temp, temp, len); 2355 kmovql(dst, temp); 2356 } 2357 #endif // _LP64 2358 2359 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2360 reduce_operation_128(T_FLOAT, opcode, dst, src); 2361 pshufd(vtmp, src, 0x1); 2362 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2363 } 2364 2365 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2366 reduce2F(opcode, dst, src, vtmp); 2367 pshufd(vtmp, src, 0x2); 2368 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2369 pshufd(vtmp, src, 0x3); 2370 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2371 } 2372 2373 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2374 reduce4F(opcode, dst, src, vtmp2); 2375 vextractf128_high(vtmp2, src); 2376 reduce4F(opcode, dst, vtmp2, vtmp1); 2377 } 2378 2379 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2380 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2381 vextracti64x4_high(vtmp1, src); 2382 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2383 } 2384 2385 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2386 pshufd(dst, src, 0x1); 2387 reduce_operation_128(T_FLOAT, opcode, dst, src); 2388 } 2389 2390 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2391 pshufd(vtmp, src, 0xE); 2392 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2393 unorderedReduce2F(opcode, dst, vtmp); 2394 } 2395 2396 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2397 vextractf128_high(vtmp1, src); 2398 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2399 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2400 } 2401 2402 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2403 vextractf64x4_high(vtmp2, src); 2404 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2405 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2406 } 2407 2408 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2409 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2410 pshufd(vtmp, src, 0xE); 2411 
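// Reading aid (informal): imm8 0xE selects dwords {2,3} into the low half of vtmp,
// i.e. it moves the upper double of src into the low lane so the two doubles can be
// combined by the element-wise operation below.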
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2412 } 2413 2414 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2415 reduce2D(opcode, dst, src, vtmp2); 2416 vextractf128_high(vtmp2, src); 2417 reduce2D(opcode, dst, vtmp2, vtmp1); 2418 } 2419 2420 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2421 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2422 vextracti64x4_high(vtmp1, src); 2423 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2424 } 2425 2426 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2427 pshufd(dst, src, 0xE); 2428 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2429 } 2430 2431 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2432 vextractf128_high(vtmp, src); 2433 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2434 unorderedReduce2D(opcode, dst, vtmp); 2435 } 2436 2437 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2438 vextractf64x4_high(vtmp2, src); 2439 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2440 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2441 } 2442 2443 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2444 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2445 } 2446 2447 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2448 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2449 } 2450 2451 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2452 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2453 } 2454 2455 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2456 int vec_enc) { 2457 switch(elem_bt) { 2458 case T_INT: 2459 case T_FLOAT: 2460 vmaskmovps(dst, src, mask, vec_enc); 2461 break; 2462 case T_LONG: 2463 case T_DOUBLE: 2464 vmaskmovpd(dst, src, mask, vec_enc); 2465 break; 2466 default: 2467 fatal("Unsupported type %s", type2name(elem_bt)); 2468 break; 2469 } 2470 } 2471 2472 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2473 int vec_enc) { 2474 switch(elem_bt) { 2475 case T_INT: 2476 case T_FLOAT: 2477 vmaskmovps(dst, src, mask, vec_enc); 2478 break; 2479 case T_LONG: 2480 case T_DOUBLE: 2481 vmaskmovpd(dst, src, mask, vec_enc); 2482 break; 2483 default: 2484 fatal("Unsupported type %s", type2name(elem_bt)); 2485 break; 2486 } 2487 } 2488 2489 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2490 XMMRegister dst, XMMRegister src, 2491 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2492 XMMRegister xmm_0, XMMRegister xmm_1) { 2493 const int permconst[] = {1, 14}; 2494 XMMRegister wsrc = src; 2495 XMMRegister wdst = xmm_0; 2496 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2497 2498 int vlen_enc = Assembler::AVX_128bit; 2499 if (vlen == 16) { 2500 vlen_enc = Assembler::AVX_256bit; 2501 } 2502 2503 for (int i = log2(vlen) - 1; i >=0; i--) { 2504 if (i == 0 && !is_dst_valid) { 2505 wdst = dst; 2506 } 2507 if (i == 3) { 2508 vextracti64x4_high(wtmp, wsrc); 2509 } else if (i == 2) { 2510 vextracti128_high(wtmp, wsrc); 2511 } else { // i = [0,1] 2512 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2513 } 2514 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2515 wsrc = wdst; 2516 vlen_enc = Assembler::AVX_128bit; 2517 } 2518 if (is_dst_valid) { 2519 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2520 } 2521 } 2522 2523 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2524 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2525 XMMRegister xmm_0, XMMRegister xmm_1) { 2526 XMMRegister wsrc = src; 2527 XMMRegister wdst = xmm_0; 2528 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2529 int vlen_enc = Assembler::AVX_128bit; 2530 if (vlen == 8) { 2531 vlen_enc = Assembler::AVX_256bit; 2532 } 2533 for (int i = log2(vlen) - 1; i >=0; i--) { 2534 if (i == 0 && !is_dst_valid) { 2535 wdst = dst; 2536 } 2537 if (i == 1) { 2538 vextracti128_high(wtmp, wsrc); 2539 } else if (i == 2) { 2540 vextracti64x4_high(wtmp, wsrc); 2541 } else { 2542 assert(i == 0, "%d", i); 2543 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2544 } 2545 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2546 wsrc = wdst; 2547 vlen_enc = Assembler::AVX_128bit; 2548 } 2549 if (is_dst_valid) { 2550 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2551 } 2552 } 2553 2554 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2555 switch (bt) { 2556 case T_BYTE: pextrb(dst, src, idx); break; 2557 case T_SHORT: pextrw(dst, src, idx); break; 2558 case T_INT: pextrd(dst, src, idx); break; 2559 case T_LONG: pextrq(dst, src, idx); break; 2560 2561 default: 2562 assert(false,"Should not reach here."); 2563 break; 2564 } 2565 } 2566 2567 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2568 int esize = type2aelembytes(typ); 2569 int elem_per_lane = 16/esize; 2570 int lane = elemindex / elem_per_lane; 2571 int eindex = elemindex % elem_per_lane; 2572 2573 if (lane >= 2) { 2574 assert(UseAVX > 2, "required"); 2575 vextractf32x4(dst, src, lane & 3); 2576 return dst; 2577 } else if (lane > 0) { 2578 assert(UseAVX > 0, "required"); 2579 vextractf128(dst, src, lane); 2580 return dst; 2581 } else { 2582 return src; 2583 } 2584 } 2585 2586 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2587 if (typ == T_BYTE) { 2588 movsbl(dst, dst); 2589 } else if (typ == T_SHORT) { 2590 movswl(dst, dst); 2591 } 2592 } 2593 2594 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2595 int esize = type2aelembytes(typ); 2596 int elem_per_lane = 16/esize; 2597 int eindex = elemindex % elem_per_lane; 2598 assert(is_integral_type(typ),"required"); 2599 2600 if (eindex == 0) { 2601 if (typ == T_LONG) { 2602 movq(dst, src); 2603 } else { 2604 movdl(dst, src); 2605 movsxl(typ, dst); 2606 } 2607 } else { 2608 extract(typ, dst, src, eindex); 2609 movsxl(typ, dst); 2610 } 2611 } 2612 2613 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
2614 int esize = type2aelembytes(typ); 2615 int elem_per_lane = 16/esize; 2616 int eindex = elemindex % elem_per_lane; 2617 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2618 2619 if (eindex == 0) { 2620 movq(dst, src); 2621 } else { 2622 if (typ == T_FLOAT) { 2623 if (UseAVX == 0) { 2624 movdqu(dst, src); 2625 shufps(dst, dst, eindex); 2626 } else { 2627 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2628 } 2629 } else { 2630 if (UseAVX == 0) { 2631 movdqu(dst, src); 2632 psrldq(dst, eindex*esize); 2633 } else { 2634 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2635 } 2636 movq(dst, dst); 2637 } 2638 } 2639 // Zero upper bits 2640 if (typ == T_FLOAT) { 2641 if (UseAVX == 0) { 2642 assert(vtmp != xnoreg, "required."); 2643 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2644 pand(dst, vtmp); 2645 } else { 2646 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2647 } 2648 } 2649 } 2650 2651 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2652 switch(typ) { 2653 case T_BYTE: 2654 case T_BOOLEAN: 2655 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2656 break; 2657 case T_SHORT: 2658 case T_CHAR: 2659 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2660 break; 2661 case T_INT: 2662 case T_FLOAT: 2663 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2664 break; 2665 case T_LONG: 2666 case T_DOUBLE: 2667 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2668 break; 2669 default: 2670 assert(false,"Should not reach here."); 2671 break; 2672 } 2673 } 2674 2675 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2676 assert(rscratch != noreg || always_reachable(src2), "missing"); 2677 2678 switch(typ) { 2679 case T_BOOLEAN: 2680 case T_BYTE: 2681 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2682 break; 2683 case T_CHAR: 2684 case T_SHORT: 2685 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2686 break; 2687 case T_INT: 2688 case T_FLOAT: 2689 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2690 break; 2691 case T_LONG: 2692 case T_DOUBLE: 2693 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2694 break; 2695 default: 2696 assert(false,"Should not reach here."); 2697 break; 2698 } 2699 } 2700 2701 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2702 switch(typ) { 2703 case T_BYTE: 2704 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2705 break; 2706 case T_SHORT: 2707 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2708 break; 2709 case T_INT: 2710 case T_FLOAT: 2711 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2712 break; 2713 case T_LONG: 2714 case T_DOUBLE: 2715 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2716 break; 2717 default: 2718 assert(false,"Should not reach here."); 2719 break; 2720 } 2721 } 2722 2723 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2724 
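// Reading aid (informal): vectortest only sets condition flags; the caller emits the branch.
// Sub-16-byte vectors are first widened by duplicating their low part with pshufd (see below),
// then the test is done with vtestps for 4-byte-and-wider elements when AVX is available, and
// with (v)ptest otherwise.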
assert(vlen_in_bytes <= 32, ""); 2725 int esize = type2aelembytes(bt); 2726 if (vlen_in_bytes == 32) { 2727 assert(vtmp == xnoreg, "required."); 2728 if (esize >= 4) { 2729 vtestps(src1, src2, AVX_256bit); 2730 } else { 2731 vptest(src1, src2, AVX_256bit); 2732 } 2733 return; 2734 } 2735 if (vlen_in_bytes < 16) { 2736 // Duplicate the lower part to fill the whole register, 2737 // Don't need to do so for src2 2738 assert(vtmp != xnoreg, "required"); 2739 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2740 pshufd(vtmp, src1, shuffle_imm); 2741 } else { 2742 assert(vtmp == xnoreg, "required"); 2743 vtmp = src1; 2744 } 2745 if (esize >= 4 && VM_Version::supports_avx()) { 2746 vtestps(vtmp, src2, AVX_128bit); 2747 } else { 2748 ptest(vtmp, src2); 2749 } 2750 } 2751 2752 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2753 #ifdef ASSERT 2754 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2755 bool is_bw_supported = VM_Version::supports_avx512bw(); 2756 if (is_bw && !is_bw_supported) { 2757 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2758 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2759 "XMM register should be 0-15"); 2760 } 2761 #endif // ASSERT 2762 switch (elem_bt) { 2763 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2764 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2765 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2766 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2767 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2768 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2769 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2770 } 2771 } 2772 2773 #ifdef _LP64 2774 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2775 assert(UseAVX >= 2, "required"); 2776 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2777 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2778 if ((UseAVX > 2) && 2779 (!is_bw || VM_Version::supports_avx512bw()) && 2780 (!is_vl || VM_Version::supports_avx512vl())) { 2781 switch (elem_bt) { 2782 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2783 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2784 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2785 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2786 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2787 } 2788 } else { 2789 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2790 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2791 switch (elem_bt) { 2792 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2793 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2794 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2795 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2796 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2797 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2798 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2799 } 2800 } 2801 } 2802 #endif 2803 2804 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2805 switch (to_elem_bt) { 2806 case T_SHORT: 2807 vpmovsxbw(dst, src, vlen_enc); 2808 break; 2809 case T_INT: 2810 
vpmovsxbd(dst, src, vlen_enc); 2811 break; 2812 case T_FLOAT: 2813 vpmovsxbd(dst, src, vlen_enc); 2814 vcvtdq2ps(dst, dst, vlen_enc); 2815 break; 2816 case T_LONG: 2817 vpmovsxbq(dst, src, vlen_enc); 2818 break; 2819 case T_DOUBLE: { 2820 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2821 vpmovsxbd(dst, src, mid_vlen_enc); 2822 vcvtdq2pd(dst, dst, vlen_enc); 2823 break; 2824 } 2825 default: 2826 fatal("Unsupported type %s", type2name(to_elem_bt)); 2827 break; 2828 } 2829 } 2830 2831 //------------------------------------------------------------------------------------------- 2832 2833 // IndexOf for constant substrings with size >= 8 chars 2834 // which don't need to be loaded through stack. 2835 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2836 Register cnt1, Register cnt2, 2837 int int_cnt2, Register result, 2838 XMMRegister vec, Register tmp, 2839 int ae) { 2840 ShortBranchVerifier sbv(this); 2841 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2842 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2843 2844 // This method uses the pcmpestri instruction with bound registers 2845 // inputs: 2846 // xmm - substring 2847 // rax - substring length (elements count) 2848 // mem - scanned string 2849 // rdx - string length (elements count) 2850 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2851 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2852 // outputs: 2853 // rcx - matched index in string 2854 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2855 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2856 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2857 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2858 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2859 2860 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2861 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2862 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2863 2864 // Note, inline_string_indexOf() generates checks: 2865 // if (substr.count > string.count) return -1; 2866 // if (substr.count == 0) return 0; 2867 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2868 2869 // Load substring. 2870 if (ae == StrIntrinsicNode::UL) { 2871 pmovzxbw(vec, Address(str2, 0)); 2872 } else { 2873 movdqu(vec, Address(str2, 0)); 2874 } 2875 movl(cnt2, int_cnt2); 2876 movptr(result, str1); // string addr 2877 2878 if (int_cnt2 > stride) { 2879 jmpb(SCAN_TO_SUBSTR); 2880 2881 // Reload substr for rescan, this code 2882 // is executed only for large substrings (> 8 chars) 2883 bind(RELOAD_SUBSTR); 2884 if (ae == StrIntrinsicNode::UL) { 2885 pmovzxbw(vec, Address(str2, 0)); 2886 } else { 2887 movdqu(vec, Address(str2, 0)); 2888 } 2889 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2890 2891 bind(RELOAD_STR); 2892 // We came here after the beginning of the substring was 2893 // matched but the rest of it was not so we need to search 2894 // again. Start from the next element after the previous match. 2895 2896 // cnt2 is number of substring remaining elements and 2897 // cnt1 is number of string remaining elements when cmp failed.
2898 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2899 subl(cnt1, cnt2); 2900 addl(cnt1, int_cnt2); 2901 movl(cnt2, int_cnt2); // Now restore cnt2 2902 2903 decrementl(cnt1); // Shift to next element 2904 cmpl(cnt1, cnt2); 2905 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2906 2907 addptr(result, (1<<scale1)); 2908 2909 } // (int_cnt2 > 8) 2910 2911 // Scan string for start of substr in 16-byte vectors 2912 bind(SCAN_TO_SUBSTR); 2913 pcmpestri(vec, Address(result, 0), mode); 2914 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2915 subl(cnt1, stride); 2916 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2917 cmpl(cnt1, cnt2); 2918 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2919 addptr(result, 16); 2920 jmpb(SCAN_TO_SUBSTR); 2921 2922 // Found a potential substr 2923 bind(FOUND_CANDIDATE); 2924 // Matched whole vector if first element matched (tmp(rcx) == 0). 2925 if (int_cnt2 == stride) { 2926 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2927 } else { // int_cnt2 > 8 2928 jccb(Assembler::overflow, FOUND_SUBSTR); 2929 } 2930 // After pcmpestri tmp(rcx) contains matched element index 2931 // Compute start addr of substr 2932 lea(result, Address(result, tmp, scale1)); 2933 2934 // Make sure string is still long enough 2935 subl(cnt1, tmp); 2936 cmpl(cnt1, cnt2); 2937 if (int_cnt2 == stride) { 2938 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2939 } else { // int_cnt2 > 8 2940 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2941 } 2942 // Left less than substring. 2943 2944 bind(RET_NOT_FOUND); 2945 movl(result, -1); 2946 jmp(EXIT); 2947 2948 if (int_cnt2 > stride) { 2949 // This code is optimized for the case when whole substring 2950 // is matched if its head is matched. 2951 bind(MATCH_SUBSTR_HEAD); 2952 pcmpestri(vec, Address(result, 0), mode); 2953 // Reload only the string if it does not match 2954 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2955 2956 Label CONT_SCAN_SUBSTR; 2957 // Compare the rest of substring (> 8 chars). 2958 bind(FOUND_SUBSTR); 2959 // First 8 chars are already matched.
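// Reading aid (informal): the next two instructions turn cnt2 into a negative count of the
// not-yet-compared substring elements (stride - int_cnt2). The scan loop below steps it
// towards zero by 'stride' and also uses it as a negative offset from the substring/string
// tails when forming the compare addresses.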
2960 negptr(cnt2); 2961 addptr(cnt2, stride); 2962 2963 bind(SCAN_SUBSTR); 2964 subl(cnt1, stride); 2965 cmpl(cnt2, -stride); // Do not read beyond substring 2966 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2967 // Back-up strings to avoid reading beyond substring: 2968 // cnt1 = cnt1 - cnt2 + 8 2969 addl(cnt1, cnt2); // cnt2 is negative 2970 addl(cnt1, stride); 2971 movl(cnt2, stride); negptr(cnt2); 2972 bind(CONT_SCAN_SUBSTR); 2973 if (int_cnt2 < (int)G) { 2974 int tail_off1 = int_cnt2<<scale1; 2975 int tail_off2 = int_cnt2<<scale2; 2976 if (ae == StrIntrinsicNode::UL) { 2977 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2978 } else { 2979 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2980 } 2981 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2982 } else { 2983 // calculate index in register to avoid integer overflow (int_cnt2*2) 2984 movl(tmp, int_cnt2); 2985 addptr(tmp, cnt2); 2986 if (ae == StrIntrinsicNode::UL) { 2987 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2988 } else { 2989 movdqu(vec, Address(str2, tmp, scale2, 0)); 2990 } 2991 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2992 } 2993 // Need to reload strings pointers if not matched whole vector 2994 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2995 addptr(cnt2, stride); 2996 jcc(Assembler::negative, SCAN_SUBSTR); 2997 // Fall through if found full substring 2998 2999 } // (int_cnt2 > 8) 3000 3001 bind(RET_FOUND); 3002 // Found result if we matched full small substring. 3003 // Compute substr offset 3004 subptr(result, str1); 3005 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3006 shrl(result, 1); // index 3007 } 3008 bind(EXIT); 3009 3010 } // string_indexofC8 3011 3012 // Small strings are loaded through stack if they cross page boundary. 3013 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3014 Register cnt1, Register cnt2, 3015 int int_cnt2, Register result, 3016 XMMRegister vec, Register tmp, 3017 int ae) { 3018 ShortBranchVerifier sbv(this); 3019 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3020 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3021 3022 // 3023 // int_cnt2 is length of small (< 8 chars) constant substring 3024 // or (-1) for non constant substring in which case its length 3025 // is in cnt2 register. 3026 // 3027 // Note, inline_string_indexOf() generates checks: 3028 // if (substr.count > string.count) return -1; 3029 // if (substr.count == 0) return 0; 3030 // 3031 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3032 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3033 // This method uses the pcmpestri instruction with bound registers 3034 // inputs: 3035 // xmm - substring 3036 // rax - substring length (elements count) 3037 // mem - scanned string 3038 // rdx - string length (elements count) 3039 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3040 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3041 // outputs: 3042 // rcx - matched index in string 3043 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3044 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3045 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3046 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3047 3048 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3049 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3050 FOUND_CANDIDATE; 3051 3052 { //======================================================== 3053 // We don't know where these strings are located 3054 // and we can't read beyond them. Load them through stack. 3055 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3056 3057 movptr(tmp, rsp); // save old SP 3058 3059 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3060 if (int_cnt2 == (1>>scale2)) { // One byte 3061 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3062 load_unsigned_byte(result, Address(str2, 0)); 3063 movdl(vec, result); // move 32 bits 3064 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3065 // Not enough header space in 32-bit VM: 12+3 = 15. 3066 movl(result, Address(str2, -1)); 3067 shrl(result, 8); 3068 movdl(vec, result); // move 32 bits 3069 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3070 load_unsigned_short(result, Address(str2, 0)); 3071 movdl(vec, result); // move 32 bits 3072 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3073 movdl(vec, Address(str2, 0)); // move 32 bits 3074 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3075 movq(vec, Address(str2, 0)); // move 64 bits 3076 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3077 // Array header size is 12 bytes in 32-bit VM 3078 // + 6 bytes for 3 chars == 18 bytes, 3079 // enough space to load vec and shift. 3080 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3081 if (ae == StrIntrinsicNode::UL) { 3082 int tail_off = int_cnt2-8; 3083 pmovzxbw(vec, Address(str2, tail_off)); 3084 psrldq(vec, -2*tail_off); 3085 } 3086 else { 3087 int tail_off = int_cnt2*(1<<scale2); 3088 movdqu(vec, Address(str2, tail_off-16)); 3089 psrldq(vec, 16-tail_off); 3090 } 3091 } 3092 } else { // not constant substring 3093 cmpl(cnt2, stride); 3094 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3095 3096 // We can read beyond string if srt+16 does not cross page boundary 3097 // since heaps are aligned and mapped by pages. 3098 assert(os::vm_page_size() < (int)G, "default page should be small"); 3099 movl(result, str2); // We need only low 32 bits 3100 andl(result, ((int)os::vm_page_size()-1)); 3101 cmpl(result, ((int)os::vm_page_size()-16)); 3102 jccb(Assembler::belowEqual, CHECK_STR); 3103 3104 // Move small strings to stack to allow load 16 bytes into vec. 3105 subptr(rsp, 16); 3106 int stk_offset = wordSize-(1<<scale2); 3107 push(cnt2); 3108 3109 bind(COPY_SUBSTR); 3110 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3111 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3112 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3113 } else if (ae == StrIntrinsicNode::UU) { 3114 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3115 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3116 } 3117 decrement(cnt2); 3118 jccb(Assembler::notZero, COPY_SUBSTR); 3119 3120 pop(cnt2); 3121 movptr(str2, rsp); // New substring address 3122 } // non constant 3123 3124 bind(CHECK_STR); 3125 cmpl(cnt1, stride); 3126 jccb(Assembler::aboveEqual, BIG_STRINGS); 3127 3128 // Check cross page boundary. 
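  // (Illustrative) A 16-byte load starting at str1 stays within one page iff
  //   (str1 & (page_size - 1)) <= page_size - 16.
  // For example, with 4096-byte pages an offset of 4080 still reads bytes
  // 4080..4095 of the same page, while 4081..4095 would cross into the next page,
  // in which case the string is copied to the stack below before loading.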
3129 movl(result, str1); // We need only low 32 bits 3130 andl(result, ((int)os::vm_page_size()-1)); 3131 cmpl(result, ((int)os::vm_page_size()-16)); 3132 jccb(Assembler::belowEqual, BIG_STRINGS); 3133 3134 subptr(rsp, 16); 3135 int stk_offset = -(1<<scale1); 3136 if (int_cnt2 < 0) { // not constant 3137 push(cnt2); 3138 stk_offset += wordSize; 3139 } 3140 movl(cnt2, cnt1); 3141 3142 bind(COPY_STR); 3143 if (ae == StrIntrinsicNode::LL) { 3144 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3145 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3146 } else { 3147 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3148 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3149 } 3150 decrement(cnt2); 3151 jccb(Assembler::notZero, COPY_STR); 3152 3153 if (int_cnt2 < 0) { // not constant 3154 pop(cnt2); 3155 } 3156 movptr(str1, rsp); // New string address 3157 3158 bind(BIG_STRINGS); 3159 // Load substring. 3160 if (int_cnt2 < 0) { // -1 3161 if (ae == StrIntrinsicNode::UL) { 3162 pmovzxbw(vec, Address(str2, 0)); 3163 } else { 3164 movdqu(vec, Address(str2, 0)); 3165 } 3166 push(cnt2); // substr count 3167 push(str2); // substr addr 3168 push(str1); // string addr 3169 } else { 3170 // Small (< 8 chars) constant substrings are loaded already. 3171 movl(cnt2, int_cnt2); 3172 } 3173 push(tmp); // original SP 3174 3175 } // Finished loading 3176 3177 //======================================================== 3178 // Start search 3179 // 3180 3181 movptr(result, str1); // string addr 3182 3183 if (int_cnt2 < 0) { // Only for non constant substring 3184 jmpb(SCAN_TO_SUBSTR); 3185 3186 // SP saved at sp+0 3187 // String saved at sp+1*wordSize 3188 // Substr saved at sp+2*wordSize 3189 // Substr count saved at sp+3*wordSize 3190 3191 // Reload substr for rescan, this code 3192 // is executed only for large substrings (> 8 chars) 3193 bind(RELOAD_SUBSTR); 3194 movptr(str2, Address(rsp, 2*wordSize)); 3195 movl(cnt2, Address(rsp, 3*wordSize)); 3196 if (ae == StrIntrinsicNode::UL) { 3197 pmovzxbw(vec, Address(str2, 0)); 3198 } else { 3199 movdqu(vec, Address(str2, 0)); 3200 } 3201 // We came here after the beginning of the substring was 3202 // matched but the rest of it was not so we need to search 3203 // again. Start from the next element after the previous match. 3204 subptr(str1, result); // Restore counter 3205 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3206 shrl(str1, 1); 3207 } 3208 addl(cnt1, str1); 3209 decrementl(cnt1); // Shift to next element 3210 cmpl(cnt1, cnt2); 3211 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3212 3213 addptr(result, (1<<scale1)); 3214 } // non constant 3215 3216 // Scan string for start of substr in 16-byte vectors 3217 bind(SCAN_TO_SUBSTR); 3218 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3219 pcmpestri(vec, Address(result, 0), mode); 3220 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3221 subl(cnt1, stride); 3222 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3223 cmpl(cnt1, cnt2); 3224 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3225 addptr(result, 16); 3226 3227 bind(ADJUST_STR); 3228 cmpl(cnt1, stride); // Do not read beyond string 3229 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3230 // Back-up string to avoid reading beyond string. 
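  // (Illustrative) 'result' points at the next unscanned element and fewer than
  // 'stride' elements remain (cnt1 < stride here). Re-pointing it to
  //   result + cnt1*element_size - 16
  // makes the next 16-byte pcmpestri read end exactly at the last element of the
  // string, and cnt1 is reset to a full 'stride'; elements at the front of the
  // shifted window are simply compared again.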
3231 lea(result, Address(result, cnt1, scale1, -16)); 3232 movl(cnt1, stride); 3233 jmpb(SCAN_TO_SUBSTR); 3234 3235 // Found a potential substr 3236 bind(FOUND_CANDIDATE); 3237 // After pcmpestri tmp(rcx) contains matched element index 3238 3239 // Make sure string is still long enough 3240 subl(cnt1, tmp); 3241 cmpl(cnt1, cnt2); 3242 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3243 // Left less then substring. 3244 3245 bind(RET_NOT_FOUND); 3246 movl(result, -1); 3247 jmp(CLEANUP); 3248 3249 bind(FOUND_SUBSTR); 3250 // Compute start addr of substr 3251 lea(result, Address(result, tmp, scale1)); 3252 if (int_cnt2 > 0) { // Constant substring 3253 // Repeat search for small substring (< 8 chars) 3254 // from new point without reloading substring. 3255 // Have to check that we don't read beyond string. 3256 cmpl(tmp, stride-int_cnt2); 3257 jccb(Assembler::greater, ADJUST_STR); 3258 // Fall through if matched whole substring. 3259 } else { // non constant 3260 assert(int_cnt2 == -1, "should be != 0"); 3261 3262 addl(tmp, cnt2); 3263 // Found result if we matched whole substring. 3264 cmpl(tmp, stride); 3265 jcc(Assembler::lessEqual, RET_FOUND); 3266 3267 // Repeat search for small substring (<= 8 chars) 3268 // from new point 'str1' without reloading substring. 3269 cmpl(cnt2, stride); 3270 // Have to check that we don't read beyond string. 3271 jccb(Assembler::lessEqual, ADJUST_STR); 3272 3273 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3274 // Compare the rest of substring (> 8 chars). 3275 movptr(str1, result); 3276 3277 cmpl(tmp, cnt2); 3278 // First 8 chars are already matched. 3279 jccb(Assembler::equal, CHECK_NEXT); 3280 3281 bind(SCAN_SUBSTR); 3282 pcmpestri(vec, Address(str1, 0), mode); 3283 // Need to reload strings pointers if not matched whole vector 3284 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3285 3286 bind(CHECK_NEXT); 3287 subl(cnt2, stride); 3288 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3289 addptr(str1, 16); 3290 if (ae == StrIntrinsicNode::UL) { 3291 addptr(str2, 8); 3292 } else { 3293 addptr(str2, 16); 3294 } 3295 subl(cnt1, stride); 3296 cmpl(cnt2, stride); // Do not read beyond substring 3297 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3298 // Back-up strings to avoid reading beyond substring. 
3299 3300 if (ae == StrIntrinsicNode::UL) { 3301 lea(str2, Address(str2, cnt2, scale2, -8)); 3302 lea(str1, Address(str1, cnt2, scale1, -16)); 3303 } else { 3304 lea(str2, Address(str2, cnt2, scale2, -16)); 3305 lea(str1, Address(str1, cnt2, scale1, -16)); 3306 } 3307 subl(cnt1, cnt2); 3308 movl(cnt2, stride); 3309 addl(cnt1, stride); 3310 bind(CONT_SCAN_SUBSTR); 3311 if (ae == StrIntrinsicNode::UL) { 3312 pmovzxbw(vec, Address(str2, 0)); 3313 } else { 3314 movdqu(vec, Address(str2, 0)); 3315 } 3316 jmp(SCAN_SUBSTR); 3317 3318 bind(RET_FOUND_LONG); 3319 movptr(str1, Address(rsp, wordSize)); 3320 } // non constant 3321 3322 bind(RET_FOUND); 3323 // Compute substr offset 3324 subptr(result, str1); 3325 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3326 shrl(result, 1); // index 3327 } 3328 bind(CLEANUP); 3329 pop(rsp); // restore SP 3330 3331 } // string_indexof 3332 3333 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3334 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3335 ShortBranchVerifier sbv(this); 3336 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3337 3338 int stride = 8; 3339 3340 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3341 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3342 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3343 FOUND_SEQ_CHAR, DONE_LABEL; 3344 3345 movptr(result, str1); 3346 if (UseAVX >= 2) { 3347 cmpl(cnt1, stride); 3348 jcc(Assembler::less, SCAN_TO_CHAR); 3349 cmpl(cnt1, 2*stride); 3350 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3351 movdl(vec1, ch); 3352 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3353 vpxor(vec2, vec2); 3354 movl(tmp, cnt1); 3355 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3356 andl(cnt1,0x0000000F); //tail count (in chars) 3357 3358 bind(SCAN_TO_16_CHAR_LOOP); 3359 vmovdqu(vec3, Address(result, 0)); 3360 vpcmpeqw(vec3, vec3, vec1, 1); 3361 vptest(vec2, vec3); 3362 jcc(Assembler::carryClear, FOUND_CHAR); 3363 addptr(result, 32); 3364 subl(tmp, 2*stride); 3365 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3366 jmp(SCAN_TO_8_CHAR); 3367 bind(SCAN_TO_8_CHAR_INIT); 3368 movdl(vec1, ch); 3369 pshuflw(vec1, vec1, 0x00); 3370 pshufd(vec1, vec1, 0); 3371 pxor(vec2, vec2); 3372 } 3373 bind(SCAN_TO_8_CHAR); 3374 cmpl(cnt1, stride); 3375 jcc(Assembler::less, SCAN_TO_CHAR); 3376 if (UseAVX < 2) { 3377 movdl(vec1, ch); 3378 pshuflw(vec1, vec1, 0x00); 3379 pshufd(vec1, vec1, 0); 3380 pxor(vec2, vec2); 3381 } 3382 movl(tmp, cnt1); 3383 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3384 andl(cnt1,0x00000007); //tail count (in chars) 3385 3386 bind(SCAN_TO_8_CHAR_LOOP); 3387 movdqu(vec3, Address(result, 0)); 3388 pcmpeqw(vec3, vec1); 3389 ptest(vec2, vec3); 3390 jcc(Assembler::carryClear, FOUND_CHAR); 3391 addptr(result, 16); 3392 subl(tmp, stride); 3393 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3394 bind(SCAN_TO_CHAR); 3395 testl(cnt1, cnt1); 3396 jcc(Assembler::zero, RET_NOT_FOUND); 3397 bind(SCAN_TO_CHAR_LOOP); 3398 load_unsigned_short(tmp, Address(result, 0)); 3399 cmpl(ch, tmp); 3400 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3401 addptr(result, 2); 3402 subl(cnt1, 1); 3403 jccb(Assembler::zero, RET_NOT_FOUND); 3404 jmp(SCAN_TO_CHAR_LOOP); 3405 3406 bind(RET_NOT_FOUND); 3407 movl(result, -1); 3408 jmpb(DONE_LABEL); 3409 3410 bind(FOUND_CHAR); 3411 if (UseAVX >= 2) { 3412 vpmovmskb(tmp, vec3); 3413 } else { 3414 pmovmskb(tmp, vec3); 3415 } 3416 bsfl(ch, tmp); 3417 addptr(result, ch); 3418 3419 bind(FOUND_SEQ_CHAR); 3420 
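  // (Illustrative) 'result' holds the address of the matching char; subtracting the
  // base pointer gives a byte offset, and the shift right by 1 converts it to a
  // UTF-16 char index (2 bytes per char).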
subptr(result, str1); 3421 shrl(result, 1); 3422 3423 bind(DONE_LABEL); 3424 } // string_indexof_char 3425 3426 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3427 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3428 ShortBranchVerifier sbv(this); 3429 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3430 3431 int stride = 16; 3432 3433 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3434 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3435 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3436 FOUND_SEQ_CHAR, DONE_LABEL; 3437 3438 movptr(result, str1); 3439 if (UseAVX >= 2) { 3440 cmpl(cnt1, stride); 3441 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3442 cmpl(cnt1, stride*2); 3443 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3444 movdl(vec1, ch); 3445 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3446 vpxor(vec2, vec2); 3447 movl(tmp, cnt1); 3448 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3449 andl(cnt1,0x0000001F); //tail count (in chars) 3450 3451 bind(SCAN_TO_32_CHAR_LOOP); 3452 vmovdqu(vec3, Address(result, 0)); 3453 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3454 vptest(vec2, vec3); 3455 jcc(Assembler::carryClear, FOUND_CHAR); 3456 addptr(result, 32); 3457 subl(tmp, stride*2); 3458 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3459 jmp(SCAN_TO_16_CHAR); 3460 3461 bind(SCAN_TO_16_CHAR_INIT); 3462 movdl(vec1, ch); 3463 pxor(vec2, vec2); 3464 pshufb(vec1, vec2); 3465 } 3466 3467 bind(SCAN_TO_16_CHAR); 3468 cmpl(cnt1, stride); 3469 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3470 if (UseAVX < 2) { 3471 movdl(vec1, ch); 3472 pxor(vec2, vec2); 3473 pshufb(vec1, vec2); 3474 } 3475 movl(tmp, cnt1); 3476 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3477 andl(cnt1,0x0000000F); //tail count (in bytes) 3478 3479 bind(SCAN_TO_16_CHAR_LOOP); 3480 movdqu(vec3, Address(result, 0)); 3481 pcmpeqb(vec3, vec1); 3482 ptest(vec2, vec3); 3483 jcc(Assembler::carryClear, FOUND_CHAR); 3484 addptr(result, 16); 3485 subl(tmp, stride); 3486 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
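  // (Illustrative) Fewer than 16 bytes remain at this point; the tail is scanned one
  // byte at a time below, and RET_NOT_FOUND yields -1 if the character is absent.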
3487 3488 bind(SCAN_TO_CHAR_INIT); 3489 testl(cnt1, cnt1); 3490 jcc(Assembler::zero, RET_NOT_FOUND); 3491 bind(SCAN_TO_CHAR_LOOP); 3492 load_unsigned_byte(tmp, Address(result, 0)); 3493 cmpl(ch, tmp); 3494 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3495 addptr(result, 1); 3496 subl(cnt1, 1); 3497 jccb(Assembler::zero, RET_NOT_FOUND); 3498 jmp(SCAN_TO_CHAR_LOOP); 3499 3500 bind(RET_NOT_FOUND); 3501 movl(result, -1); 3502 jmpb(DONE_LABEL); 3503 3504 bind(FOUND_CHAR); 3505 if (UseAVX >= 2) { 3506 vpmovmskb(tmp, vec3); 3507 } else { 3508 pmovmskb(tmp, vec3); 3509 } 3510 bsfl(ch, tmp); 3511 addptr(result, ch); 3512 3513 bind(FOUND_SEQ_CHAR); 3514 subptr(result, str1); 3515 3516 bind(DONE_LABEL); 3517 } // stringL_indexof_char 3518 3519 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3520 switch (eltype) { 3521 case T_BOOLEAN: return sizeof(jboolean); 3522 case T_BYTE: return sizeof(jbyte); 3523 case T_SHORT: return sizeof(jshort); 3524 case T_CHAR: return sizeof(jchar); 3525 case T_INT: return sizeof(jint); 3526 default: 3527 ShouldNotReachHere(); 3528 return -1; 3529 } 3530 } 3531 3532 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3533 switch (eltype) { 3534 // T_BOOLEAN used as surrogate for unsigned byte 3535 case T_BOOLEAN: movzbl(dst, src); break; 3536 case T_BYTE: movsbl(dst, src); break; 3537 case T_SHORT: movswl(dst, src); break; 3538 case T_CHAR: movzwl(dst, src); break; 3539 case T_INT: movl(dst, src); break; 3540 default: 3541 ShouldNotReachHere(); 3542 } 3543 } 3544 3545 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3546 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3547 } 3548 3549 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3550 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3551 } 3552 3553 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3554 const int vlen = Assembler::AVX_256bit; 3555 switch (eltype) { 3556 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3557 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3558 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3559 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3560 case T_INT: 3561 // do nothing 3562 break; 3563 default: 3564 ShouldNotReachHere(); 3565 } 3566 } 3567 3568 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3569 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3570 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3571 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3572 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3573 BasicType eltype) { 3574 ShortBranchVerifier sbv(this); 3575 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3576 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3577 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3578 3579 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3580 SHORT_UNROLLED_LOOP_EXIT, 3581 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3582 UNROLLED_VECTOR_LOOP_BEGIN, 3583 END; 3584 switch (eltype) { 3585 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3586 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3587 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3588 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3589 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3590 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3591 } 3592 3593 // For "renaming" for readibility of the code 3594 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3595 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3596 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3597 3598 const int elsize = arrays_hashcode_elsize(eltype); 3599 3600 /* 3601 if (cnt1 >= 2) { 3602 if (cnt1 >= 32) { 3603 UNROLLED VECTOR LOOP 3604 } 3605 UNROLLED SCALAR LOOP 3606 } 3607 SINGLE SCALAR 3608 */ 3609 3610 cmpl(cnt1, 32); 3611 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3612 3613 // cnt1 >= 32 && generate_vectorized_loop 3614 xorl(index, index); 3615 3616 // vresult = IntVector.zero(I256); 3617 for (int idx = 0; idx < 4; idx++) { 3618 vpxor(vresult[idx], vresult[idx]); 3619 } 3620 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3621 Register bound = tmp2; 3622 Register next = tmp3; 3623 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3624 movl(next, Address(tmp2, 0)); 3625 movdl(vnext, next); 3626 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3627 3628 // index = 0; 3629 // bound = cnt1 & ~(32 - 1); 3630 movl(bound, cnt1); 3631 andl(bound, ~(32 - 1)); 3632 // for (; index < bound; index += 32) { 3633 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3634 // result *= next; 3635 imull(result, next); 3636 // loop fission to upfront the cost of fetching from memory, OOO execution 3637 // can then hopefully do a better job of prefetching 3638 for (int idx = 0; idx < 4; idx++) { 3639 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3640 } 3641 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3642 for (int idx = 0; idx < 4; idx++) { 3643 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3644 arrays_hashcode_elvcast(vtmp[idx], eltype); 3645 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3646 } 3647 // index += 32; 3648 addl(index, 32); 3649 // index < bound; 3650 cmpl(index, bound); 3651 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3652 // } 3653 3654 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3655 subl(cnt1, bound); 3656 // release bound 3657 3658 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3659 for (int idx = 0; idx < 4; idx++) { 3660 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3661 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3662 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3663 } 3664 // result += vresult.reduceLanes(ADD); 3665 for (int idx = 0; idx < 4; idx++) { 3666 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3667 } 3668 3669 // } else if (cnt1 < 32) { 3670 3671 bind(SHORT_UNROLLED_BEGIN); 3672 // int i = 1; 3673 movl(index, 1); 3674 cmpl(index, cnt1); 3675 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3676 3677 // for (; i < cnt1 ; i += 2) { 3678 bind(SHORT_UNROLLED_LOOP_BEGIN); 3679 movl(tmp3, 961); 3680 imull(result, tmp3); 3681 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3682 movl(tmp3, tmp2); 3683 shll(tmp3, 5); 3684 subl(tmp3, tmp2); 3685 addl(result, tmp3); 3686 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3687 addl(result, tmp3); 3688 addl(index, 2); 3689 cmpl(index, cnt1); 3690 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3691 3692 // } 3693 // if (i >= cnt1) { 3694 bind(SHORT_UNROLLED_LOOP_EXIT); 3695 jccb(Assembler::greater, END); 3696 movl(tmp2, result); 3697 shll(result, 5); 3698 subl(result, tmp2); 3699 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3700 addl(result, tmp3); 3701 // } 3702 bind(END); 3703 3704 BLOCK_COMMENT("} // arrays_hashcode"); 3705 3706 } // arrays_hashcode 3707 3708 // helper function for string_compare 3709 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3710 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3711 Address::ScaleFactor scale2, Register index, int ae) { 3712 if (ae == StrIntrinsicNode::LL) { 3713 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3714 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3715 } else if (ae == StrIntrinsicNode::UU) { 3716 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3717 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3718 } else { 3719 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3720 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3721 } 3722 } 3723 3724 // Compare strings, used for char[] and byte[]. 3725 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3726 Register cnt1, Register cnt2, Register result, 3727 XMMRegister vec1, int ae, KRegister mask) { 3728 ShortBranchVerifier sbv(this); 3729 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3730 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3731 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3732 int stride2x2 = 0x40; 3733 Address::ScaleFactor scale = Address::no_scale; 3734 Address::ScaleFactor scale1 = Address::no_scale; 3735 Address::ScaleFactor scale2 = Address::no_scale; 3736 3737 if (ae != StrIntrinsicNode::LL) { 3738 stride2x2 = 0x20; 3739 } 3740 3741 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3742 shrl(cnt2, 1); 3743 } 3744 // Compute the minimum of the string lengths and the 3745 // difference of the string lengths (stack). 3746 // Do the conditional move stuff 3747 movl(result, cnt1); 3748 subl(cnt1, cnt2); 3749 push(cnt1); 3750 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3751 3752 // Is the minimum length zero? 
3753 testl(cnt2, cnt2); 3754 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3755 if (ae == StrIntrinsicNode::LL) { 3756 // Load first bytes 3757 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3758 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3759 } else if (ae == StrIntrinsicNode::UU) { 3760 // Load first characters 3761 load_unsigned_short(result, Address(str1, 0)); 3762 load_unsigned_short(cnt1, Address(str2, 0)); 3763 } else { 3764 load_unsigned_byte(result, Address(str1, 0)); 3765 load_unsigned_short(cnt1, Address(str2, 0)); 3766 } 3767 subl(result, cnt1); 3768 jcc(Assembler::notZero, POP_LABEL); 3769 3770 if (ae == StrIntrinsicNode::UU) { 3771 // Divide length by 2 to get number of chars 3772 shrl(cnt2, 1); 3773 } 3774 cmpl(cnt2, 1); 3775 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3776 3777 // Check if the strings start at the same location and setup scale and stride 3778 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3779 cmpptr(str1, str2); 3780 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3781 if (ae == StrIntrinsicNode::LL) { 3782 scale = Address::times_1; 3783 stride = 16; 3784 } else { 3785 scale = Address::times_2; 3786 stride = 8; 3787 } 3788 } else { 3789 scale1 = Address::times_1; 3790 scale2 = Address::times_2; 3791 // scale not used 3792 stride = 8; 3793 } 3794 3795 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3796 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3797 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3798 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3799 Label COMPARE_TAIL_LONG; 3800 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3801 3802 int pcmpmask = 0x19; 3803 if (ae == StrIntrinsicNode::LL) { 3804 pcmpmask &= ~0x01; 3805 } 3806 3807 // Setup to compare 16-chars (32-bytes) vectors, 3808 // start from first character again because it has aligned address. 3809 if (ae == StrIntrinsicNode::LL) { 3810 stride2 = 32; 3811 } else { 3812 stride2 = 16; 3813 } 3814 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3815 adr_stride = stride << scale; 3816 } else { 3817 adr_stride1 = 8; //stride << scale1; 3818 adr_stride2 = 16; //stride << scale2; 3819 } 3820 3821 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3822 // rax and rdx are used by pcmpestri as elements counters 3823 movl(result, cnt2); 3824 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3825 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3826 3827 // fast path : compare first 2 8-char vectors. 
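  // (Illustrative) The fast path below probes the first two 'stride'-element blocks
  // with pcmpestri before entering the wide-vector loop. pcmpmask 0x19 selects
  // "equal each" comparison on unsigned shorts with negated result (0x18, i.e.
  // unsigned bytes, for LL), so CF == 1 reports a mismatch and rcx holds the index
  // of the first mismatching element.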
3828 bind(COMPARE_16_CHARS); 3829 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3830 movdqu(vec1, Address(str1, 0)); 3831 } else { 3832 pmovzxbw(vec1, Address(str1, 0)); 3833 } 3834 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3835 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3836 3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3838 movdqu(vec1, Address(str1, adr_stride)); 3839 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3840 } else { 3841 pmovzxbw(vec1, Address(str1, adr_stride1)); 3842 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3843 } 3844 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3845 addl(cnt1, stride); 3846 3847 // Compare the characters at index in cnt1 3848 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3849 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3850 subl(result, cnt2); 3851 jmp(POP_LABEL); 3852 3853 // Setup the registers to start vector comparison loop 3854 bind(COMPARE_WIDE_VECTORS); 3855 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3856 lea(str1, Address(str1, result, scale)); 3857 lea(str2, Address(str2, result, scale)); 3858 } else { 3859 lea(str1, Address(str1, result, scale1)); 3860 lea(str2, Address(str2, result, scale2)); 3861 } 3862 subl(result, stride2); 3863 subl(cnt2, stride2); 3864 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3865 negptr(result); 3866 3867 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3868 bind(COMPARE_WIDE_VECTORS_LOOP); 3869 3870 #ifdef _LP64 3871 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3872 cmpl(cnt2, stride2x2); 3873 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3874 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3875 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3876 3877 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3878 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3879 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3880 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3881 } else { 3882 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3883 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3884 } 3885 kortestql(mask, mask); 3886 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3887 addptr(result, stride2x2); // update since we already compared at this addr 3888 subl(cnt2, stride2x2); // and sub the size too 3889 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3890 3891 vpxor(vec1, vec1); 3892 jmpb(COMPARE_WIDE_TAIL); 3893 }//if (VM_Version::supports_avx512vlbw()) 3894 #endif // _LP64 3895 3896 3897 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3898 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3899 vmovdqu(vec1, Address(str1, result, scale)); 3900 vpxor(vec1, Address(str2, result, scale)); 3901 } else { 3902 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3903 vpxor(vec1, Address(str2, result, scale2)); 3904 } 3905 vptest(vec1, vec1); 3906 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3907 addptr(result, stride2); 3908 subl(cnt2, stride2); 3909 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3910 // clean upper bits of YMM registers 
3911 vpxor(vec1, vec1); 3912 3913 // compare wide vectors tail 3914 bind(COMPARE_WIDE_TAIL); 3915 testptr(result, result); 3916 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3917 3918 movl(result, stride2); 3919 movl(cnt2, result); 3920 negptr(result); 3921 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3922 3923 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3924 bind(VECTOR_NOT_EQUAL); 3925 // clean upper bits of YMM registers 3926 vpxor(vec1, vec1); 3927 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3928 lea(str1, Address(str1, result, scale)); 3929 lea(str2, Address(str2, result, scale)); 3930 } else { 3931 lea(str1, Address(str1, result, scale1)); 3932 lea(str2, Address(str2, result, scale2)); 3933 } 3934 jmp(COMPARE_16_CHARS); 3935 3936 // Compare tail chars, length between 1 to 15 chars 3937 bind(COMPARE_TAIL_LONG); 3938 movl(cnt2, result); 3939 cmpl(cnt2, stride); 3940 jcc(Assembler::less, COMPARE_SMALL_STR); 3941 3942 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3943 movdqu(vec1, Address(str1, 0)); 3944 } else { 3945 pmovzxbw(vec1, Address(str1, 0)); 3946 } 3947 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3948 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3949 subptr(cnt2, stride); 3950 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3951 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3952 lea(str1, Address(str1, result, scale)); 3953 lea(str2, Address(str2, result, scale)); 3954 } else { 3955 lea(str1, Address(str1, result, scale1)); 3956 lea(str2, Address(str2, result, scale2)); 3957 } 3958 negptr(cnt2); 3959 jmpb(WHILE_HEAD_LABEL); 3960 3961 bind(COMPARE_SMALL_STR); 3962 } else if (UseSSE42Intrinsics) { 3963 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3964 int pcmpmask = 0x19; 3965 // Setup to compare 8-char (16-byte) vectors, 3966 // start from first character again because it has aligned address. 
3967 movl(result, cnt2); 3968 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3969 if (ae == StrIntrinsicNode::LL) { 3970 pcmpmask &= ~0x01; 3971 } 3972 jcc(Assembler::zero, COMPARE_TAIL); 3973 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3974 lea(str1, Address(str1, result, scale)); 3975 lea(str2, Address(str2, result, scale)); 3976 } else { 3977 lea(str1, Address(str1, result, scale1)); 3978 lea(str2, Address(str2, result, scale2)); 3979 } 3980 negptr(result); 3981 3982 // pcmpestri 3983 // inputs: 3984 // vec1- substring 3985 // rax - negative string length (elements count) 3986 // mem - scanned string 3987 // rdx - string length (elements count) 3988 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3989 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3990 // outputs: 3991 // rcx - first mismatched element index 3992 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3993 3994 bind(COMPARE_WIDE_VECTORS); 3995 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3996 movdqu(vec1, Address(str1, result, scale)); 3997 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3998 } else { 3999 pmovzxbw(vec1, Address(str1, result, scale1)); 4000 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4001 } 4002 // After pcmpestri cnt1(rcx) contains mismatched element index 4003 4004 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4005 addptr(result, stride); 4006 subptr(cnt2, stride); 4007 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4008 4009 // compare wide vectors tail 4010 testptr(result, result); 4011 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4012 4013 movl(cnt2, stride); 4014 movl(result, stride); 4015 negptr(result); 4016 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4017 movdqu(vec1, Address(str1, result, scale)); 4018 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4019 } else { 4020 pmovzxbw(vec1, Address(str1, result, scale1)); 4021 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4022 } 4023 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4024 4025 // Mismatched characters in the vectors 4026 bind(VECTOR_NOT_EQUAL); 4027 addptr(cnt1, result); 4028 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4029 subl(result, cnt2); 4030 jmpb(POP_LABEL); 4031 4032 bind(COMPARE_TAIL); // limit is zero 4033 movl(cnt2, result); 4034 // Fallthru to tail compare 4035 } 4036 // Shift str2 and str1 to the end of the arrays, negate min 4037 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4038 lea(str1, Address(str1, cnt2, scale)); 4039 lea(str2, Address(str2, cnt2, scale)); 4040 } else { 4041 lea(str1, Address(str1, cnt2, scale1)); 4042 lea(str2, Address(str2, cnt2, scale2)); 4043 } 4044 decrementl(cnt2); // first character was compared already 4045 negptr(cnt2); 4046 4047 // Compare the rest of the elements 4048 bind(WHILE_HEAD_LABEL); 4049 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4050 subl(result, cnt1); 4051 jccb(Assembler::notZero, POP_LABEL); 4052 increment(cnt2); 4053 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4054 4055 // Strings are equal up to min length. Return the length difference. 
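  // (Illustrative) The value popped below is the cnt1 - cnt2 difference pushed in
  // the prologue; for UU those counts were still byte lengths, so the difference is
  // halved to return a char count.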
4056 bind(LENGTH_DIFF_LABEL); 4057 pop(result); 4058 if (ae == StrIntrinsicNode::UU) { 4059 // Divide diff by 2 to get number of chars 4060 sarl(result, 1); 4061 } 4062 jmpb(DONE_LABEL); 4063 4064 #ifdef _LP64 4065 if (VM_Version::supports_avx512vlbw()) { 4066 4067 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4068 4069 kmovql(cnt1, mask); 4070 notq(cnt1); 4071 bsfq(cnt2, cnt1); 4072 if (ae != StrIntrinsicNode::LL) { 4073 // Divide diff by 2 to get number of chars 4074 sarl(cnt2, 1); 4075 } 4076 addq(result, cnt2); 4077 if (ae == StrIntrinsicNode::LL) { 4078 load_unsigned_byte(cnt1, Address(str2, result)); 4079 load_unsigned_byte(result, Address(str1, result)); 4080 } else if (ae == StrIntrinsicNode::UU) { 4081 load_unsigned_short(cnt1, Address(str2, result, scale)); 4082 load_unsigned_short(result, Address(str1, result, scale)); 4083 } else { 4084 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4085 load_unsigned_byte(result, Address(str1, result, scale1)); 4086 } 4087 subl(result, cnt1); 4088 jmpb(POP_LABEL); 4089 }//if (VM_Version::supports_avx512vlbw()) 4090 #endif // _LP64 4091 4092 // Discard the stored length difference 4093 bind(POP_LABEL); 4094 pop(cnt1); 4095 4096 // That's it 4097 bind(DONE_LABEL); 4098 if(ae == StrIntrinsicNode::UL) { 4099 negl(result); 4100 } 4101 4102 } 4103 4104 // Search for Non-ASCII character (Negative byte value) in a byte array, 4105 // return the index of the first such character, otherwise the length 4106 // of the array segment searched. 4107 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4108 // @IntrinsicCandidate 4109 // public static int countPositives(byte[] ba, int off, int len) { 4110 // for (int i = off; i < off + len; i++) { 4111 // if (ba[i] < 0) { 4112 // return i - off; 4113 // } 4114 // } 4115 // return len; 4116 // } 4117 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4118 Register result, Register tmp1, 4119 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4120 // rsi: byte array 4121 // rcx: len 4122 // rax: result 4123 ShortBranchVerifier sbv(this); 4124 assert_different_registers(ary1, len, result, tmp1); 4125 assert_different_registers(vec1, vec2); 4126 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4127 4128 movl(result, len); // copy 4129 // len == 0 4130 testl(len, len); 4131 jcc(Assembler::zero, DONE); 4132 4133 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4134 VM_Version::supports_avx512vlbw() && 4135 VM_Version::supports_bmi2()) { 4136 4137 Label test_64_loop, test_tail, BREAK_LOOP; 4138 movl(tmp1, len); 4139 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4140 4141 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4142 andl(len, 0xffffffc0); // vector count (in chars) 4143 jccb(Assembler::zero, test_tail); 4144 4145 lea(ary1, Address(ary1, len, Address::times_1)); 4146 negptr(len); 4147 4148 bind(test_64_loop); 4149 // Check whether our 64 elements of size byte contain negatives 4150 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4151 kortestql(mask1, mask1); 4152 jcc(Assembler::notZero, BREAK_LOOP); 4153 4154 addptr(len, 64); 4155 jccb(Assembler::notZero, test_64_loop); 4156 4157 bind(test_tail); 4158 // bail out when there is nothing to be done 4159 testl(tmp1, -1); 4160 jcc(Assembler::zero, DONE); 4161 4162 4163 // check the tail for absense of negatives 4164 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4165 #ifdef _LP64 4166 { 4167 
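  // (Illustrative) ~(~0 << tmp1) leaves exactly the low 'tmp1' bits set, e.g.
  // tmp1 == 5 -> 0b11111, so only the tail bytes take part in the masked compare
  // below.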
Register tmp3_aliased = len; 4168 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4169 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4170 notq(tmp3_aliased); 4171 kmovql(mask2, tmp3_aliased); 4172 } 4173 #else 4174 Label k_init; 4175 jmp(k_init); 4176 4177 // We could not read 64-bits from a general purpose register thus we move 4178 // data required to compose 64 1's to the instruction stream 4179 // We emit 64 byte wide series of elements from 0..63 which later on would 4180 // be used as a compare targets with tail count contained in tmp1 register. 4181 // Result would be a k register having tmp1 consecutive number or 1 4182 // counting from least significant bit. 4183 address tmp = pc(); 4184 emit_int64(0x0706050403020100); 4185 emit_int64(0x0F0E0D0C0B0A0908); 4186 emit_int64(0x1716151413121110); 4187 emit_int64(0x1F1E1D1C1B1A1918); 4188 emit_int64(0x2726252423222120); 4189 emit_int64(0x2F2E2D2C2B2A2928); 4190 emit_int64(0x3736353433323130); 4191 emit_int64(0x3F3E3D3C3B3A3938); 4192 4193 bind(k_init); 4194 lea(len, InternalAddress(tmp)); 4195 // create mask to test for negative byte inside a vector 4196 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4197 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4198 4199 #endif 4200 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4201 ktestq(mask1, mask2); 4202 jcc(Assembler::zero, DONE); 4203 4204 // do a full check for negative registers in the tail 4205 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4206 // ary1 already pointing to the right place 4207 jmpb(TAIL_START); 4208 4209 bind(BREAK_LOOP); 4210 // At least one byte in the last 64 byte block was negative. 4211 // Set up to look at the last 64 bytes as if they were a tail 4212 lea(ary1, Address(ary1, len, Address::times_1)); 4213 addptr(result, len); 4214 // Ignore the very last byte: if all others are positive, 4215 // it must be negative, so we can skip right to the 2+1 byte 4216 // end comparison at this point 4217 orl(result, 63); 4218 movl(len, 63); 4219 // Fallthru to tail compare 4220 } else { 4221 4222 if (UseAVX >= 2 && UseSSE >= 2) { 4223 // With AVX2, use 32-byte vector compare 4224 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4225 4226 // Compare 32-byte vectors 4227 testl(len, 0xffffffe0); // vector count (in bytes) 4228 jccb(Assembler::zero, TAIL_START); 4229 4230 andl(len, 0xffffffe0); 4231 lea(ary1, Address(ary1, len, Address::times_1)); 4232 negptr(len); 4233 4234 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4235 movdl(vec2, tmp1); 4236 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4237 4238 bind(COMPARE_WIDE_VECTORS); 4239 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4240 vptest(vec1, vec2); 4241 jccb(Assembler::notZero, BREAK_LOOP); 4242 addptr(len, 32); 4243 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4244 4245 testl(result, 0x0000001f); // any bytes remaining? 4246 jcc(Assembler::zero, DONE); 4247 4248 // Quick test using the already prepared vector mask 4249 movl(len, result); 4250 andl(len, 0x0000001f); 4251 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4252 vptest(vec1, vec2); 4253 jcc(Assembler::zero, DONE); 4254 // There are zeros, jump to the tail to determine exactly where 4255 jmpb(TAIL_START); 4256 4257 bind(BREAK_LOOP); 4258 // At least one byte in the last 32-byte vector is negative. 
4259 // Set up to look at the last 32 bytes as if they were a tail 4260 lea(ary1, Address(ary1, len, Address::times_1)); 4261 addptr(result, len); 4262 // Ignore the very last byte: if all others are positive, 4263 // it must be negative, so we can skip right to the 2+1 byte 4264 // end comparison at this point 4265 orl(result, 31); 4266 movl(len, 31); 4267 // Fallthru to tail compare 4268 } else if (UseSSE42Intrinsics) { 4269 // With SSE4.2, use double quad vector compare 4270 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4271 4272 // Compare 16-byte vectors 4273 testl(len, 0xfffffff0); // vector count (in bytes) 4274 jcc(Assembler::zero, TAIL_START); 4275 4276 andl(len, 0xfffffff0); 4277 lea(ary1, Address(ary1, len, Address::times_1)); 4278 negptr(len); 4279 4280 movl(tmp1, 0x80808080); 4281 movdl(vec2, tmp1); 4282 pshufd(vec2, vec2, 0); 4283 4284 bind(COMPARE_WIDE_VECTORS); 4285 movdqu(vec1, Address(ary1, len, Address::times_1)); 4286 ptest(vec1, vec2); 4287 jccb(Assembler::notZero, BREAK_LOOP); 4288 addptr(len, 16); 4289 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4290 4291 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4292 jcc(Assembler::zero, DONE); 4293 4294 // Quick test using the already prepared vector mask 4295 movl(len, result); 4296 andl(len, 0x0000000f); // tail count (in bytes) 4297 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4298 ptest(vec1, vec2); 4299 jcc(Assembler::zero, DONE); 4300 jmpb(TAIL_START); 4301 4302 bind(BREAK_LOOP); 4303 // At least one byte in the last 16-byte vector is negative. 4304 // Set up and look at the last 16 bytes as if they were a tail 4305 lea(ary1, Address(ary1, len, Address::times_1)); 4306 addptr(result, len); 4307 // Ignore the very last byte: if all others are positive, 4308 // it must be negative, so we can skip right to the 2+1 byte 4309 // end comparison at this point 4310 orl(result, 15); 4311 movl(len, 15); 4312 // Fallthru to tail compare 4313 } 4314 } 4315 4316 bind(TAIL_START); 4317 // Compare 4-byte vectors 4318 andl(len, 0xfffffffc); // vector count (in bytes) 4319 jccb(Assembler::zero, COMPARE_CHAR); 4320 4321 lea(ary1, Address(ary1, len, Address::times_1)); 4322 negptr(len); 4323 4324 bind(COMPARE_VECTORS); 4325 movl(tmp1, Address(ary1, len, Address::times_1)); 4326 andl(tmp1, 0x80808080); 4327 jccb(Assembler::notZero, TAIL_ADJUST); 4328 addptr(len, 4); 4329 jccb(Assembler::notZero, COMPARE_VECTORS); 4330 4331 // Compare trailing char (final 2-3 bytes), if any 4332 bind(COMPARE_CHAR); 4333 4334 testl(result, 0x2); // tail char 4335 jccb(Assembler::zero, COMPARE_BYTE); 4336 load_unsigned_short(tmp1, Address(ary1, 0)); 4337 andl(tmp1, 0x00008080); 4338 jccb(Assembler::notZero, CHAR_ADJUST); 4339 lea(ary1, Address(ary1, 2)); 4340 4341 bind(COMPARE_BYTE); 4342 testl(result, 0x1); // tail byte 4343 jccb(Assembler::zero, DONE); 4344 load_unsigned_byte(tmp1, Address(ary1, 0)); 4345 testl(tmp1, 0x00000080); 4346 jccb(Assembler::zero, DONE); 4347 subptr(result, 1); 4348 jmpb(DONE); 4349 4350 bind(TAIL_ADJUST); 4351 // there are negative bits in the last 4 byte block. 4352 // Adjust result and check the next three bytes 4353 addptr(result, len); 4354 orl(result, 3); 4355 lea(ary1, Address(ary1, len, Address::times_1)); 4356 jmpb(COMPARE_CHAR); 4357 4358 bind(CHAR_ADJUST); 4359 // We are looking at a char + optional byte tail, and found that one 4360 // of the bytes in the char is negative. Adjust the result, check the 4361 // first byte and readjust if needed. 
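  // (Illustrative) The andl below leaves 'result' at the offset of the char under
  // test; if that char's first (lowest-addressed) byte already has its sign bit
  // set, this offset is the final count, otherwise the negative byte is the second
  // one and the count is bumped by one.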
4362 andl(result, 0xfffffffc); 4363 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4364 jccb(Assembler::notZero, DONE); 4365 addptr(result, 1); 4366 4367 // That's it 4368 bind(DONE); 4369 if (UseAVX >= 2 && UseSSE >= 2) { 4370 // clean upper bits of YMM registers 4371 vpxor(vec1, vec1); 4372 vpxor(vec2, vec2); 4373 } 4374 } 4375 4376 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4377 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4378 Register limit, Register result, Register chr, 4379 XMMRegister vec1, XMMRegister vec2, bool is_char, 4380 KRegister mask, bool expand_ary2) { 4381 // for expand_ary2, limit is the (smaller) size of the second array. 4382 ShortBranchVerifier sbv(this); 4383 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4384 4385 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4386 "Expansion only implemented for AVX2"); 4387 4388 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4389 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4390 4391 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4392 int scaleIncr = expand_ary2 ? 8 : 16; 4393 4394 if (is_array_equ) { 4395 // Check the input args 4396 cmpoop(ary1, ary2); 4397 jcc(Assembler::equal, TRUE_LABEL); 4398 4399 // Need additional checks for arrays_equals. 4400 testptr(ary1, ary1); 4401 jcc(Assembler::zero, FALSE_LABEL); 4402 testptr(ary2, ary2); 4403 jcc(Assembler::zero, FALSE_LABEL); 4404 4405 // Check the lengths 4406 movl(limit, Address(ary1, length_offset)); 4407 cmpl(limit, Address(ary2, length_offset)); 4408 jcc(Assembler::notEqual, FALSE_LABEL); 4409 } 4410 4411 // count == 0 4412 testl(limit, limit); 4413 jcc(Assembler::zero, TRUE_LABEL); 4414 4415 if (is_array_equ) { 4416 // Load array address 4417 lea(ary1, Address(ary1, base_offset)); 4418 lea(ary2, Address(ary2, base_offset)); 4419 } 4420 4421 if (is_array_equ && is_char) { 4422 // arrays_equals when used for char[]. 
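  // (Illustrative) char elements are two bytes wide, so the element count is
  // doubled here to get the byte count used by the vector loops that follow.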
4423 shll(limit, 1); // byte count != 0 4424 } 4425 movl(result, limit); // copy 4426 4427 if (UseAVX >= 2) { 4428 // With AVX2, use 32-byte vector compare 4429 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4430 4431 // Compare 32-byte vectors 4432 if (expand_ary2) { 4433 andl(result, 0x0000000f); // tail count (in bytes) 4434 andl(limit, 0xfffffff0); // vector count (in bytes) 4435 jcc(Assembler::zero, COMPARE_TAIL); 4436 } else { 4437 andl(result, 0x0000001f); // tail count (in bytes) 4438 andl(limit, 0xffffffe0); // vector count (in bytes) 4439 jcc(Assembler::zero, COMPARE_TAIL_16); 4440 } 4441 4442 lea(ary1, Address(ary1, limit, scaleFactor)); 4443 lea(ary2, Address(ary2, limit, Address::times_1)); 4444 negptr(limit); 4445 4446 #ifdef _LP64 4447 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4448 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4449 4450 cmpl(limit, -64); 4451 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4452 4453 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4454 4455 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4456 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4457 kortestql(mask, mask); 4458 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4459 addptr(limit, 64); // update since we already compared at this addr 4460 cmpl(limit, -64); 4461 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4462 4463 // At this point we may still need to compare -limit+result bytes. 4464 // We could execute the next two instruction and just continue via non-wide path: 4465 // cmpl(limit, 0); 4466 // jcc(Assembler::equal, COMPARE_TAIL); // true 4467 // But since we stopped at the points ary{1,2}+limit which are 4468 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4469 // (|limit| <= 32 and result < 32), 4470 // we may just compare the last 64 bytes. 
4471 // 4472 addptr(result, -64); // it is safe, bc we just came from this area 4473 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4474 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4475 kortestql(mask, mask); 4476 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4477 4478 jmp(TRUE_LABEL); 4479 4480 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4481 4482 }//if (VM_Version::supports_avx512vlbw()) 4483 #endif //_LP64 4484 bind(COMPARE_WIDE_VECTORS); 4485 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4486 if (expand_ary2) { 4487 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4488 } else { 4489 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4490 } 4491 vpxor(vec1, vec2); 4492 4493 vptest(vec1, vec1); 4494 jcc(Assembler::notZero, FALSE_LABEL); 4495 addptr(limit, scaleIncr * 2); 4496 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4497 4498 testl(result, result); 4499 jcc(Assembler::zero, TRUE_LABEL); 4500 4501 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4502 if (expand_ary2) { 4503 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4504 } else { 4505 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4506 } 4507 vpxor(vec1, vec2); 4508 4509 vptest(vec1, vec1); 4510 jcc(Assembler::notZero, FALSE_LABEL); 4511 jmp(TRUE_LABEL); 4512 4513 bind(COMPARE_TAIL_16); // limit is zero 4514 movl(limit, result); 4515 4516 // Compare 16-byte chunks 4517 andl(result, 0x0000000f); // tail count (in bytes) 4518 andl(limit, 0xfffffff0); // vector count (in bytes) 4519 jcc(Assembler::zero, COMPARE_TAIL); 4520 4521 lea(ary1, Address(ary1, limit, scaleFactor)); 4522 lea(ary2, Address(ary2, limit, Address::times_1)); 4523 negptr(limit); 4524 4525 bind(COMPARE_WIDE_VECTORS_16); 4526 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4527 if (expand_ary2) { 4528 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4529 } else { 4530 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4531 } 4532 pxor(vec1, vec2); 4533 4534 ptest(vec1, vec1); 4535 jcc(Assembler::notZero, FALSE_LABEL); 4536 addptr(limit, scaleIncr); 4537 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4538 4539 bind(COMPARE_TAIL); // limit is zero 4540 movl(limit, result); 4541 // Fallthru to tail compare 4542 } else if (UseSSE42Intrinsics) { 4543 // With SSE4.2, use double quad vector compare 4544 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4545 4546 // Compare 16-byte vectors 4547 andl(result, 0x0000000f); // tail count (in bytes) 4548 andl(limit, 0xfffffff0); // vector count (in bytes) 4549 jcc(Assembler::zero, COMPARE_TAIL); 4550 4551 lea(ary1, Address(ary1, limit, Address::times_1)); 4552 lea(ary2, Address(ary2, limit, Address::times_1)); 4553 negptr(limit); 4554 4555 bind(COMPARE_WIDE_VECTORS); 4556 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4557 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4558 pxor(vec1, vec2); 4559 4560 ptest(vec1, vec1); 4561 jcc(Assembler::notZero, FALSE_LABEL); 4562 addptr(limit, 16); 4563 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4564 4565 testl(result, result); 4566 jcc(Assembler::zero, TRUE_LABEL); 4567 4568 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4569 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4570 pxor(vec1, vec2); 4571 4572 ptest(vec1, vec1); 4573 jccb(Assembler::notZero, FALSE_LABEL); 4574 jmpb(TRUE_LABEL); 4575 4576 bind(COMPARE_TAIL); // limit is zero 4577 
movl(limit, result); 4578 // Fallthru to tail compare 4579 } 4580 4581 // Compare 4-byte vectors 4582 if (expand_ary2) { 4583 testl(result, result); 4584 jccb(Assembler::zero, TRUE_LABEL); 4585 } else { 4586 andl(limit, 0xfffffffc); // vector count (in bytes) 4587 jccb(Assembler::zero, COMPARE_CHAR); 4588 } 4589 4590 lea(ary1, Address(ary1, limit, scaleFactor)); 4591 lea(ary2, Address(ary2, limit, Address::times_1)); 4592 negptr(limit); 4593 4594 bind(COMPARE_VECTORS); 4595 if (expand_ary2) { 4596 // There are no "vector" operations for bytes to shorts 4597 movzbl(chr, Address(ary2, limit, Address::times_1)); 4598 cmpw(Address(ary1, limit, Address::times_2), chr); 4599 jccb(Assembler::notEqual, FALSE_LABEL); 4600 addptr(limit, 1); 4601 jcc(Assembler::notZero, COMPARE_VECTORS); 4602 jmp(TRUE_LABEL); 4603 } else { 4604 movl(chr, Address(ary1, limit, Address::times_1)); 4605 cmpl(chr, Address(ary2, limit, Address::times_1)); 4606 jccb(Assembler::notEqual, FALSE_LABEL); 4607 addptr(limit, 4); 4608 jcc(Assembler::notZero, COMPARE_VECTORS); 4609 } 4610 4611 // Compare trailing char (final 2 bytes), if any 4612 bind(COMPARE_CHAR); 4613 testl(result, 0x2); // tail char 4614 jccb(Assembler::zero, COMPARE_BYTE); 4615 load_unsigned_short(chr, Address(ary1, 0)); 4616 load_unsigned_short(limit, Address(ary2, 0)); 4617 cmpl(chr, limit); 4618 jccb(Assembler::notEqual, FALSE_LABEL); 4619 4620 if (is_array_equ && is_char) { 4621 bind(COMPARE_BYTE); 4622 } else { 4623 lea(ary1, Address(ary1, 2)); 4624 lea(ary2, Address(ary2, 2)); 4625 4626 bind(COMPARE_BYTE); 4627 testl(result, 0x1); // tail byte 4628 jccb(Assembler::zero, TRUE_LABEL); 4629 load_unsigned_byte(chr, Address(ary1, 0)); 4630 load_unsigned_byte(limit, Address(ary2, 0)); 4631 cmpl(chr, limit); 4632 jccb(Assembler::notEqual, FALSE_LABEL); 4633 } 4634 bind(TRUE_LABEL); 4635 movl(result, 1); // return true 4636 jmpb(DONE); 4637 4638 bind(FALSE_LABEL); 4639 xorl(result, result); // return false 4640 4641 // That's it 4642 bind(DONE); 4643 if (UseAVX >= 2) { 4644 // clean upper bits of YMM registers 4645 vpxor(vec1, vec1); 4646 vpxor(vec2, vec2); 4647 } 4648 } 4649 4650 #ifdef _LP64 4651 4652 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4653 #define __ masm. 
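  // (Illustrative) Java narrowing conversions require NaN -> 0, values above the
  // target range -> MAX_VALUE and values below it -> MIN_VALUE, while
  // cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 or
  // 0x8000000000000000L) in all of those cases. convertF2I() compares against that
  // sentinel and, on a match, this stub hands the original operand to the matching
  // fixup routine via the stack.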
4654 Register dst = stub.data<0>(); 4655 XMMRegister src = stub.data<1>(); 4656 address target = stub.data<2>(); 4657 __ bind(stub.entry()); 4658 __ subptr(rsp, 8); 4659 __ movdbl(Address(rsp), src); 4660 __ call(RuntimeAddress(target)); 4661 __ pop(dst); 4662 __ jmp(stub.continuation()); 4663 #undef __ 4664 } 4665 4666 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4667 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4668 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4669 4670 address slowpath_target; 4671 if (dst_bt == T_INT) { 4672 if (src_bt == T_FLOAT) { 4673 cvttss2sil(dst, src); 4674 cmpl(dst, 0x80000000); 4675 slowpath_target = StubRoutines::x86::f2i_fixup(); 4676 } else { 4677 cvttsd2sil(dst, src); 4678 cmpl(dst, 0x80000000); 4679 slowpath_target = StubRoutines::x86::d2i_fixup(); 4680 } 4681 } else { 4682 if (src_bt == T_FLOAT) { 4683 cvttss2siq(dst, src); 4684 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4685 slowpath_target = StubRoutines::x86::f2l_fixup(); 4686 } else { 4687 cvttsd2siq(dst, src); 4688 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4689 slowpath_target = StubRoutines::x86::d2l_fixup(); 4690 } 4691 } 4692 4693 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4694 jcc(Assembler::equal, stub->entry()); 4695 bind(stub->continuation()); 4696 } 4697 4698 #endif // _LP64 4699 4700 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4701 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4702 switch(ideal_opc) { 4703 case Op_LShiftVS: 4704 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4705 case Op_LShiftVI: 4706 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4707 case Op_LShiftVL: 4708 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4709 case Op_RShiftVS: 4710 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4711 case Op_RShiftVI: 4712 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4713 case Op_RShiftVL: 4714 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4715 case Op_URShiftVS: 4716 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4717 case Op_URShiftVI: 4718 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4719 case Op_URShiftVL: 4720 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4721 case Op_RotateRightV: 4722 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4723 case Op_RotateLeftV: 4724 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4725 default: 4726 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4727 break; 4728 } 4729 } 4730 4731 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4732 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4733 if (is_unsigned) { 4734 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4735 } else { 4736 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4737 } 4738 } 4739 4740 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4741 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4742 switch (elem_bt) { 4743 case T_BYTE: 4744 if (ideal_opc 
== Op_SaturatingAddV) { 4745 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4746 } else { 4747 assert(ideal_opc == Op_SaturatingSubV, ""); 4748 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4749 } 4750 break; 4751 case T_SHORT: 4752 if (ideal_opc == Op_SaturatingAddV) { 4753 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4754 } else { 4755 assert(ideal_opc == Op_SaturatingSubV, ""); 4756 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4757 } 4758 break; 4759 default: 4760 fatal("Unsupported type %s", type2name(elem_bt)); 4761 break; 4762 } 4763 } 4764 4765 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4766 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4767 switch (elem_bt) { 4768 case T_BYTE: 4769 if (ideal_opc == Op_SaturatingAddV) { 4770 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4771 } else { 4772 assert(ideal_opc == Op_SaturatingSubV, ""); 4773 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4774 } 4775 break; 4776 case T_SHORT: 4777 if (ideal_opc == Op_SaturatingAddV) { 4778 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4779 } else { 4780 assert(ideal_opc == Op_SaturatingSubV, ""); 4781 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4782 } 4783 break; 4784 default: 4785 fatal("Unsupported type %s", type2name(elem_bt)); 4786 break; 4787 } 4788 } 4789 4790 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4791 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4792 if (is_unsigned) { 4793 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4794 } else { 4795 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4796 } 4797 } 4798 4799 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4800 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4801 switch (elem_bt) { 4802 case T_BYTE: 4803 if (ideal_opc == Op_SaturatingAddV) { 4804 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4805 } else { 4806 assert(ideal_opc == Op_SaturatingSubV, ""); 4807 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4808 } 4809 break; 4810 case T_SHORT: 4811 if (ideal_opc == Op_SaturatingAddV) { 4812 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4813 } else { 4814 assert(ideal_opc == Op_SaturatingSubV, ""); 4815 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4816 } 4817 break; 4818 default: 4819 fatal("Unsupported type %s", type2name(elem_bt)); 4820 break; 4821 } 4822 } 4823 4824 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4825 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4826 switch (elem_bt) { 4827 case T_BYTE: 4828 if (ideal_opc == Op_SaturatingAddV) { 4829 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4830 } else { 4831 assert(ideal_opc == Op_SaturatingSubV, ""); 4832 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4833 } 4834 break; 4835 case T_SHORT: 4836 if (ideal_opc == Op_SaturatingAddV) { 4837 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4838 } else { 4839 assert(ideal_opc == Op_SaturatingSubV, ""); 4840 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4841 } 4842 break; 4843 default: 4844 fatal("Unsupported type %s", type2name(elem_bt)); 4845 break; 4846 } 4847 } 4848 4849 void C2_MacroAssembler::evmasked_op(int 
ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4850 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4851 bool is_varshift) { 4852 switch (ideal_opc) { 4853 case Op_AddVB: 4854 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4855 case Op_AddVS: 4856 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4857 case Op_AddVI: 4858 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4859 case Op_AddVL: 4860 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4861 case Op_AddVF: 4862 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4863 case Op_AddVD: 4864 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4865 case Op_SubVB: 4866 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4867 case Op_SubVS: 4868 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4869 case Op_SubVI: 4870 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4871 case Op_SubVL: 4872 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4873 case Op_SubVF: 4874 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4875 case Op_SubVD: 4876 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4877 case Op_MulVS: 4878 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_MulVI: 4880 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4881 case Op_MulVL: 4882 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_MulVF: 4884 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4885 case Op_MulVD: 4886 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4887 case Op_DivVF: 4888 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4889 case Op_DivVD: 4890 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4891 case Op_SqrtVF: 4892 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4893 case Op_SqrtVD: 4894 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4895 case Op_AbsVB: 4896 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4897 case Op_AbsVS: 4898 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4899 case Op_AbsVI: 4900 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4901 case Op_AbsVL: 4902 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4903 case Op_FmaVF: 4904 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4905 case Op_FmaVD: 4906 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4907 case Op_VectorRearrange: 4908 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4909 case Op_LShiftVS: 4910 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4911 case Op_LShiftVI: 4912 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4913 case Op_LShiftVL: 4914 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4915 case Op_RShiftVS: 4916 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4917 case Op_RShiftVI: 4918 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4919 case Op_RShiftVL: 4920 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4921 case Op_URShiftVS: 4922 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4923 case Op_URShiftVI: 4924 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4925 case Op_URShiftVL: 4926 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4927 case Op_RotateLeftV: 4928 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4929 case Op_RotateRightV: 4930 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4931 case Op_MaxV: 4932 
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4933 case Op_MinV: 4934 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4935 case Op_UMinV: 4936 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4937 case Op_UMaxV: 4938 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4939 case Op_XorV: 4940 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4941 case Op_OrV: 4942 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4943 case Op_AndV: 4944 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4945 default: 4946 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4947 break; 4948 } 4949 } 4950 4951 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4952 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4953 switch (ideal_opc) { 4954 case Op_AddVB: 4955 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4956 case Op_AddVS: 4957 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4958 case Op_AddVI: 4959 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4960 case Op_AddVL: 4961 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4962 case Op_AddVF: 4963 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4964 case Op_AddVD: 4965 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4966 case Op_SubVB: 4967 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4968 case Op_SubVS: 4969 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4970 case Op_SubVI: 4971 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4972 case Op_SubVL: 4973 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4974 case Op_SubVF: 4975 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4976 case Op_SubVD: 4977 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4978 case Op_MulVS: 4979 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4980 case Op_MulVI: 4981 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4982 case Op_MulVL: 4983 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4984 case Op_MulVF: 4985 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4986 case Op_MulVD: 4987 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4988 case Op_DivVF: 4989 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4990 case Op_DivVD: 4991 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4992 case Op_FmaVF: 4993 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4994 case Op_FmaVD: 4995 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4996 case Op_MaxV: 4997 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4998 case Op_MinV: 4999 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5000 case Op_UMaxV: 5001 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5002 case Op_UMinV: 5003 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5004 case Op_XorV: 5005 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5006 case Op_OrV: 5007 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5008 case Op_AndV: 5009 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5010 default: 5011 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5012 break; 5013 } 5014 } 5015 5016 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5017 KRegister src1, KRegister src2) { 5018 BasicType etype = T_ILLEGAL; 5019 switch(mask_len) { 5020 case 2: 5021 case 4: 5022 case 8: etype = T_BYTE; 
break; 5023 case 16: etype = T_SHORT; break; 5024 case 32: etype = T_INT; break; 5025 case 64: etype = T_LONG; break; 5026 default: fatal("Unsupported type"); break; 5027 } 5028 assert(etype != T_ILLEGAL, ""); 5029 switch(ideal_opc) { 5030 case Op_AndVMask: 5031 kand(etype, dst, src1, src2); break; 5032 case Op_OrVMask: 5033 kor(etype, dst, src1, src2); break; 5034 case Op_XorVMask: 5035 kxor(etype, dst, src1, src2); break; 5036 default: 5037 fatal("Unsupported masked operation"); break; 5038 } 5039 } 5040 5041 /* 5042 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5043 * If src is NaN, the result is 0. 5044 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5045 * the result is equal to the value of Integer.MIN_VALUE. 5046 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5047 * the result is equal to the value of Integer.MAX_VALUE. 5048 */ 5049 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5050 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5051 Register rscratch, AddressLiteral float_sign_flip, 5052 int vec_enc) { 5053 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5054 Label done; 5055 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5056 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5057 vptest(xtmp2, xtmp2, vec_enc); 5058 jccb(Assembler::equal, done); 5059 5060 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5061 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5062 5063 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5064 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5065 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5066 5067 // Recompute the mask for remaining special value. 5068 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5069 // Extract SRC values corresponding to TRUE mask lanes. 5070 vpand(xtmp4, xtmp2, src, vec_enc); 5071 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5072 // values are set. 
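  // At this point xtmp1 holds float_sign_flip ^ -1 = 0x7FFFFFFF (Integer.MAX_VALUE),
  // xtmp2 flags the special lanes that are not NaN, and xtmp4 holds the SRC bits of
  // those lanes. For illustration, with src = {NaN, -1e30f, 1e30f, 42.0f} the cast
  // produces dst = {0x80000000, 0x80000000, 0x80000000, 42}; the NaN lane has already
  // been zeroed above, and the blend below rewrites only the 1e30f lane to MAX_VALUE.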
5073 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5074 5075 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5076 bind(done); 5077 } 5078 5079 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5080 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5081 Register rscratch, AddressLiteral float_sign_flip, 5082 int vec_enc) { 5083 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5084 Label done; 5085 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5086 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5087 kortestwl(ktmp1, ktmp1); 5088 jccb(Assembler::equal, done); 5089 5090 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5091 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5092 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5093 5094 kxorwl(ktmp1, ktmp1, ktmp2); 5095 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5096 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5097 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5098 bind(done); 5099 } 5100 5101 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5102 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5103 Register rscratch, AddressLiteral double_sign_flip, 5104 int vec_enc) { 5105 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5106 5107 Label done; 5108 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5109 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5110 kortestwl(ktmp1, ktmp1); 5111 jccb(Assembler::equal, done); 5112 5113 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5114 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5115 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5116 5117 kxorwl(ktmp1, ktmp1, ktmp2); 5118 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5119 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5120 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5121 bind(done); 5122 } 5123 5124 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5125 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5126 Register rscratch, AddressLiteral float_sign_flip, 5127 int vec_enc) { 5128 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5129 Label done; 5130 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5131 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5132 kortestwl(ktmp1, ktmp1); 5133 jccb(Assembler::equal, done); 5134 5135 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5136 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5137 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5138 5139 kxorwl(ktmp1, ktmp1, ktmp2); 5140 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5141 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5142 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5143 bind(done); 5144 } 5145 5146 /* 5147 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5148 * If src is NaN, the result is 0. 5149 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5150 * the result is equal to the value of Long.MIN_VALUE. 5151 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5152 * the result is equal to the value of Long.MAX_VALUE. 
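 * For example, matching the Java narrowing conversion: (long)Double.NaN == 0L,
 * (long)-1.0e300 == Long.MIN_VALUE and (long)1.0e300 == Long.MAX_VALUE, while an
 * in-range value such as (long)42.9 == 42L is simply truncated towards zero.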
5153 */ 5154 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5155 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5156 Register rscratch, AddressLiteral double_sign_flip, 5157 int vec_enc) { 5158 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5159 5160 Label done; 5161 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5162 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5163 kortestwl(ktmp1, ktmp1); 5164 jccb(Assembler::equal, done); 5165 5166 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5167 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5168 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5169 5170 kxorwl(ktmp1, ktmp1, ktmp2); 5171 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5172 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5173 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5174 bind(done); 5175 } 5176 5177 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5178 XMMRegister xtmp, int index, int vec_enc) { 5179 assert(vec_enc < Assembler::AVX_512bit, ""); 5180 if (vec_enc == Assembler::AVX_256bit) { 5181 vextractf128_high(xtmp, src); 5182 vshufps(dst, src, xtmp, index, vec_enc); 5183 } else { 5184 vshufps(dst, src, zero, index, vec_enc); 5185 } 5186 } 5187 5188 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5189 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5190 AddressLiteral float_sign_flip, int src_vec_enc) { 5191 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5192 5193 Label done; 5194 // Compare the destination lanes with float_sign_flip 5195 // value to get mask for all special values. 5196 movdqu(xtmp1, float_sign_flip, rscratch); 5197 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5198 ptest(xtmp2, xtmp2); 5199 jccb(Assembler::equal, done); 5200 5201 // Flip float_sign_flip to get max integer value. 5202 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5203 pxor(xtmp1, xtmp4); 5204 5205 // Set detination lanes corresponding to unordered source lanes as zero. 5206 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5207 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5208 5209 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5210 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5211 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5212 5213 // Recompute the mask for remaining special value. 5214 pxor(xtmp2, xtmp3); 5215 // Extract mask corresponding to non-negative source lanes. 5216 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5217 5218 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5219 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5220 pand(xtmp3, xtmp2); 5221 5222 // Replace destination lanes holding special value(0x80000000) with max int 5223 // if corresponding source lane holds a +ve value. 
5224 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5225 bind(done); 5226 } 5227 5228 5229 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5230 XMMRegister xtmp, Register rscratch, int vec_enc) { 5231 switch(to_elem_bt) { 5232 case T_SHORT: 5233 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5234 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5235 vpackusdw(dst, dst, zero, vec_enc); 5236 if (vec_enc == Assembler::AVX_256bit) { 5237 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5238 } 5239 break; 5240 case T_BYTE: 5241 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5242 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5243 vpackusdw(dst, dst, zero, vec_enc); 5244 if (vec_enc == Assembler::AVX_256bit) { 5245 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5246 } 5247 vpackuswb(dst, dst, zero, vec_enc); 5248 break; 5249 default: assert(false, "%s", type2name(to_elem_bt)); 5250 } 5251 } 5252 5253 /* 5254 * Algorithm for vector D2L and F2I conversions:- 5255 * a) Perform vector D2L/F2I cast. 5256 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5257 * It signifies that source value could be any of the special floating point 5258 * values(NaN,-Inf,Inf,Max,-Min). 5259 * c) Set destination to zero if source is NaN value. 5260 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5261 */ 5262 5263 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5264 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5265 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5266 int to_elem_sz = type2aelembytes(to_elem_bt); 5267 assert(to_elem_sz <= 4, ""); 5268 vcvttps2dq(dst, src, vec_enc); 5269 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5270 if (to_elem_sz < 4) { 5271 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5272 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5273 } 5274 } 5275 5276 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5277 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5278 Register rscratch, int vec_enc) { 5279 int to_elem_sz = type2aelembytes(to_elem_bt); 5280 assert(to_elem_sz <= 4, ""); 5281 vcvttps2dq(dst, src, vec_enc); 5282 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5283 switch(to_elem_bt) { 5284 case T_INT: 5285 break; 5286 case T_SHORT: 5287 evpmovdw(dst, dst, vec_enc); 5288 break; 5289 case T_BYTE: 5290 evpmovdb(dst, dst, vec_enc); 5291 break; 5292 default: assert(false, "%s", type2name(to_elem_bt)); 5293 } 5294 } 5295 5296 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5297 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5298 Register rscratch, int vec_enc) { 5299 evcvttps2qq(dst, src, vec_enc); 5300 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5301 } 5302 5303 // Handling for downcasting from double to integer or sub-word types on AVX2. 5304 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5305 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5306 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5307 int to_elem_sz = type2aelembytes(to_elem_bt); 5308 assert(to_elem_sz < 8, ""); 5309 vcvttpd2dq(dst, src, vec_enc); 5310 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5311 float_sign_flip, vec_enc); 5312 if (to_elem_sz < 4) { 5313 // xtmp4 holds all zero lanes. 5314 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5315 } 5316 } 5317 5318 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5319 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5320 KRegister ktmp2, AddressLiteral sign_flip, 5321 Register rscratch, int vec_enc) { 5322 if (VM_Version::supports_avx512dq()) { 5323 evcvttpd2qq(dst, src, vec_enc); 5324 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5325 switch(to_elem_bt) { 5326 case T_LONG: 5327 break; 5328 case T_INT: 5329 evpmovsqd(dst, dst, vec_enc); 5330 break; 5331 case T_SHORT: 5332 evpmovsqd(dst, dst, vec_enc); 5333 evpmovdw(dst, dst, vec_enc); 5334 break; 5335 case T_BYTE: 5336 evpmovsqd(dst, dst, vec_enc); 5337 evpmovdb(dst, dst, vec_enc); 5338 break; 5339 default: assert(false, "%s", type2name(to_elem_bt)); 5340 } 5341 } else { 5342 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5343 vcvttpd2dq(dst, src, vec_enc); 5344 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5345 switch(to_elem_bt) { 5346 case T_INT: 5347 break; 5348 case T_SHORT: 5349 evpmovdw(dst, dst, vec_enc); 5350 break; 5351 case T_BYTE: 5352 evpmovdb(dst, dst, vec_enc); 5353 break; 5354 default: assert(false, "%s", type2name(to_elem_bt)); 5355 } 5356 } 5357 } 5358 5359 #ifdef _LP64 5360 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5361 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5362 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5363 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5364 // and re-instantiate original MXCSR.RC mode after that. 5365 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5366 5367 mov64(tmp, julong_cast(0.5L)); 5368 evpbroadcastq(xtmp1, tmp, vec_enc); 5369 vaddpd(xtmp1, src , xtmp1, vec_enc); 5370 evcvtpd2qq(dst, xtmp1, vec_enc); 5371 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5372 double_sign_flip, vec_enc);; 5373 5374 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5375 } 5376 5377 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5378 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5379 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5380 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5381 // and re-instantiate original MXCSR.RC mode after that. 
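  // For illustration, with round-towards-negative-infinity in effect the conversion
  // below computes floor(x + 0.5), which matches Math.round for the halfway cases:
  //   x =  2.3f -> 2.8f -> 2        x =  2.5f -> 3.0f -> 3
  //   x = -2.5f -> -2.0f -> -2      x = -2.7f -> -2.2f -> -3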
5382 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5383 5384 movl(tmp, jint_cast(0.5)); 5385 movq(xtmp1, tmp); 5386 vbroadcastss(xtmp1, xtmp1, vec_enc); 5387 vaddps(xtmp1, src , xtmp1, vec_enc); 5388 vcvtps2dq(dst, xtmp1, vec_enc); 5389 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5390 float_sign_flip, vec_enc); 5391 5392 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5393 } 5394 5395 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5396 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5397 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5398 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5399 // and re-instantiate original MXCSR.RC mode after that. 5400 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5401 5402 movl(tmp, jint_cast(0.5)); 5403 movq(xtmp1, tmp); 5404 vbroadcastss(xtmp1, xtmp1, vec_enc); 5405 vaddps(xtmp1, src , xtmp1, vec_enc); 5406 vcvtps2dq(dst, xtmp1, vec_enc); 5407 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5408 5409 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5410 } 5411 #endif // _LP64 5412 5413 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5414 BasicType from_elem_bt, BasicType to_elem_bt) { 5415 switch (from_elem_bt) { 5416 case T_BYTE: 5417 switch (to_elem_bt) { 5418 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5419 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5420 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5421 default: ShouldNotReachHere(); 5422 } 5423 break; 5424 case T_SHORT: 5425 switch (to_elem_bt) { 5426 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5427 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5428 default: ShouldNotReachHere(); 5429 } 5430 break; 5431 case T_INT: 5432 assert(to_elem_bt == T_LONG, ""); 5433 vpmovzxdq(dst, src, vlen_enc); 5434 break; 5435 default: 5436 ShouldNotReachHere(); 5437 } 5438 } 5439 5440 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5441 BasicType from_elem_bt, BasicType to_elem_bt) { 5442 switch (from_elem_bt) { 5443 case T_BYTE: 5444 switch (to_elem_bt) { 5445 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5446 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5447 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5448 default: ShouldNotReachHere(); 5449 } 5450 break; 5451 case T_SHORT: 5452 switch (to_elem_bt) { 5453 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5454 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5455 default: ShouldNotReachHere(); 5456 } 5457 break; 5458 case T_INT: 5459 assert(to_elem_bt == T_LONG, ""); 5460 vpmovsxdq(dst, src, vlen_enc); 5461 break; 5462 default: 5463 ShouldNotReachHere(); 5464 } 5465 } 5466 5467 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5468 BasicType dst_bt, BasicType src_bt, int vlen) { 5469 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5470 assert(vlen_enc != AVX_512bit, ""); 5471 5472 int dst_bt_size = type2aelembytes(dst_bt); 5473 int src_bt_size = type2aelembytes(src_bt); 5474 if (dst_bt_size > src_bt_size) { 5475 switch (dst_bt_size / src_bt_size) { 5476 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5477 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5478 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5479 default: ShouldNotReachHere(); 5480 } 5481 } else { 5482 assert(dst_bt_size < src_bt_size, ""); 5483 switch (src_bt_size / dst_bt_size) { 5484 case 2: { 5485 if (vlen_enc == AVX_128bit) { 5486 vpacksswb(dst, src, src, vlen_enc); 5487 } else { 5488 vpacksswb(dst, src, src, vlen_enc); 5489 vpermq(dst, dst, 0x08, vlen_enc); 5490 } 5491 break; 5492 } 5493 case 4: { 5494 if (vlen_enc == AVX_128bit) { 5495 vpackssdw(dst, src, src, vlen_enc); 5496 vpacksswb(dst, dst, dst, vlen_enc); 5497 } else { 5498 vpackssdw(dst, src, src, vlen_enc); 5499 vpermq(dst, dst, 0x08, vlen_enc); 5500 vpacksswb(dst, dst, dst, AVX_128bit); 5501 } 5502 break; 5503 } 5504 case 8: { 5505 if (vlen_enc == AVX_128bit) { 5506 vpshufd(dst, src, 0x08, vlen_enc); 5507 vpackssdw(dst, dst, dst, vlen_enc); 5508 vpacksswb(dst, dst, dst, vlen_enc); 5509 } else { 5510 vpshufd(dst, src, 0x08, vlen_enc); 5511 vpermq(dst, dst, 0x08, vlen_enc); 5512 vpackssdw(dst, dst, dst, AVX_128bit); 5513 vpacksswb(dst, dst, dst, AVX_128bit); 5514 } 5515 break; 5516 } 5517 default: ShouldNotReachHere(); 5518 } 5519 } 5520 } 5521 5522 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5523 bool merge, BasicType bt, int vlen_enc) { 5524 if (bt == T_INT) { 5525 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5526 } else { 5527 assert(bt == T_LONG, ""); 5528 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5529 } 5530 } 5531 5532 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5533 bool merge, BasicType bt, int vlen_enc) { 5534 if (bt == T_INT) { 5535 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5536 } else { 5537 assert(bt == T_LONG, ""); 5538 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5539 } 5540 } 5541 5542 #ifdef _LP64 5543 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5544 Register rtmp2, XMMRegister xtmp, int mask_len, 5545 int vec_enc) { 5546 int index = 0; 5547 int vindex = 0; 5548 mov64(rtmp1, 0x0101010101010101L); 5549 pdepq(rtmp1, src, rtmp1); 5550 if (mask_len > 8) { 5551 movq(rtmp2, src); 5552 vpxor(xtmp, xtmp, xtmp, vec_enc); 5553 movq(xtmp, rtmp1); 5554 } 5555 movq(dst, rtmp1); 5556 5557 mask_len -= 8; 5558 while (mask_len > 0) { 5559 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5560 index++; 5561 if ((index % 2) == 0) { 5562 pxor(xtmp, xtmp); 5563 } 5564 mov64(rtmp1, 0x0101010101010101L); 5565 shrq(rtmp2, 8); 5566 pdepq(rtmp1, rtmp2, rtmp1); 5567 pinsrq(xtmp, rtmp1, index % 2); 5568 vindex = index / 2; 5569 if (vindex) { 5570 // Write entire 16 byte vector when both 64 bit 5571 // lanes are update to save redundant instructions. 
5572 if (index % 2) { 5573 vinsertf128(dst, dst, xtmp, vindex); 5574 } 5575 } else { 5576 vmovdqu(dst, xtmp); 5577 } 5578 mask_len -= 8; 5579 } 5580 } 5581 5582 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5583 switch(opc) { 5584 case Op_VectorMaskTrueCount: 5585 popcntq(dst, tmp); 5586 break; 5587 case Op_VectorMaskLastTrue: 5588 if (VM_Version::supports_lzcnt()) { 5589 lzcntq(tmp, tmp); 5590 movl(dst, 63); 5591 subl(dst, tmp); 5592 } else { 5593 movl(dst, -1); 5594 bsrq(tmp, tmp); 5595 cmov32(Assembler::notZero, dst, tmp); 5596 } 5597 break; 5598 case Op_VectorMaskFirstTrue: 5599 if (VM_Version::supports_bmi1()) { 5600 if (masklen < 32) { 5601 orl(tmp, 1 << masklen); 5602 tzcntl(dst, tmp); 5603 } else if (masklen == 32) { 5604 tzcntl(dst, tmp); 5605 } else { 5606 assert(masklen == 64, ""); 5607 tzcntq(dst, tmp); 5608 } 5609 } else { 5610 if (masklen < 32) { 5611 orl(tmp, 1 << masklen); 5612 bsfl(dst, tmp); 5613 } else { 5614 assert(masklen == 32 || masklen == 64, ""); 5615 movl(dst, masklen); 5616 if (masklen == 32) { 5617 bsfl(tmp, tmp); 5618 } else { 5619 bsfq(tmp, tmp); 5620 } 5621 cmov32(Assembler::notZero, dst, tmp); 5622 } 5623 } 5624 break; 5625 case Op_VectorMaskToLong: 5626 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5627 break; 5628 default: assert(false, "Unhandled mask operation"); 5629 } 5630 } 5631 5632 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5633 int masklen, int masksize, int vec_enc) { 5634 assert(VM_Version::supports_popcnt(), ""); 5635 5636 if(VM_Version::supports_avx512bw()) { 5637 kmovql(tmp, mask); 5638 } else { 5639 assert(masklen <= 16, ""); 5640 kmovwl(tmp, mask); 5641 } 5642 5643 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5644 // operations needs to be clipped. 5645 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5646 andq(tmp, (1 << masklen) - 1); 5647 } 5648 5649 vector_mask_operation_helper(opc, dst, tmp, masklen); 5650 } 5651 5652 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5653 Register tmp, int masklen, BasicType bt, int vec_enc) { 5654 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5655 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5656 assert(VM_Version::supports_popcnt(), ""); 5657 5658 bool need_clip = false; 5659 switch(bt) { 5660 case T_BOOLEAN: 5661 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5662 vpxor(xtmp, xtmp, xtmp, vec_enc); 5663 vpsubb(xtmp, xtmp, mask, vec_enc); 5664 vpmovmskb(tmp, xtmp, vec_enc); 5665 need_clip = masklen < 16; 5666 break; 5667 case T_BYTE: 5668 vpmovmskb(tmp, mask, vec_enc); 5669 need_clip = masklen < 16; 5670 break; 5671 case T_SHORT: 5672 vpacksswb(xtmp, mask, mask, vec_enc); 5673 if (masklen >= 16) { 5674 vpermpd(xtmp, xtmp, 8, vec_enc); 5675 } 5676 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5677 need_clip = masklen < 16; 5678 break; 5679 case T_INT: 5680 case T_FLOAT: 5681 vmovmskps(tmp, mask, vec_enc); 5682 need_clip = masklen < 4; 5683 break; 5684 case T_LONG: 5685 case T_DOUBLE: 5686 vmovmskpd(tmp, mask, vec_enc); 5687 need_clip = masklen < 2; 5688 break; 5689 default: assert(false, "Unhandled type, %s", type2name(bt)); 5690 } 5691 5692 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5693 // operations needs to be clipped. 
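  // For illustration: a T_BYTE mask with masklen == 8 still yields 16 bits from
  // vpmovmskb on a 128-bit vector, so the high bits are cleared below with
  // tmp &= (1 << 8) - 1 == 0xFF before the count/scan in vector_mask_operation_helper.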
5694 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5695 // need_clip implies masklen < 32 5696 andq(tmp, (1 << masklen) - 1); 5697 } 5698 5699 vector_mask_operation_helper(opc, dst, tmp, masklen); 5700 } 5701 5702 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5703 Register rtmp2, int mask_len) { 5704 kmov(rtmp1, src); 5705 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5706 mov64(rtmp2, -1L); 5707 pextq(rtmp2, rtmp2, rtmp1); 5708 kmov(dst, rtmp2); 5709 } 5710 5711 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5712 XMMRegister mask, Register rtmp, Register rscratch, 5713 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5714 int vec_enc) { 5715 assert(type2aelembytes(bt) >= 4, ""); 5716 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5717 address compress_perm_table = nullptr; 5718 address expand_perm_table = nullptr; 5719 if (type2aelembytes(bt) == 8) { 5720 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5721 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5722 vmovmskpd(rtmp, mask, vec_enc); 5723 } else { 5724 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5725 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5726 vmovmskps(rtmp, mask, vec_enc); 5727 } 5728 shlq(rtmp, 5); // for 32 byte permute row. 5729 if (opcode == Op_CompressV) { 5730 lea(rscratch, ExternalAddress(compress_perm_table)); 5731 } else { 5732 lea(rscratch, ExternalAddress(expand_perm_table)); 5733 } 5734 addptr(rtmp, rscratch); 5735 vmovdqu(permv, Address(rtmp)); 5736 vpermps(dst, permv, src, Assembler::AVX_256bit); 5737 vpxor(xtmp, xtmp, xtmp, vec_enc); 5738 // Blend the result with zero vector using permute mask, each column entry 5739 // in a permute table row contains either a valid permute index or a -1 (default) 5740 // value, this can potentially be used as a blending mask after 5741 // compressing/expanding the source vector lanes. 
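  // For illustration (8 x 32-bit lanes): for mask 0b00000101 the compress table row is
  // conceptually {0, 2, -1, -1, -1, -1, -1, -1}; vpermps above gathers lanes 0 and 2 to
  // the front, and the -1 columns (MSB set) make the blend below zero the remaining lanes.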
5742 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5743 } 5744 5745 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5746 bool merge, BasicType bt, int vec_enc) { 5747 if (opcode == Op_CompressV) { 5748 switch(bt) { 5749 case T_BYTE: 5750 evpcompressb(dst, mask, src, merge, vec_enc); 5751 break; 5752 case T_CHAR: 5753 case T_SHORT: 5754 evpcompressw(dst, mask, src, merge, vec_enc); 5755 break; 5756 case T_INT: 5757 evpcompressd(dst, mask, src, merge, vec_enc); 5758 break; 5759 case T_FLOAT: 5760 evcompressps(dst, mask, src, merge, vec_enc); 5761 break; 5762 case T_LONG: 5763 evpcompressq(dst, mask, src, merge, vec_enc); 5764 break; 5765 case T_DOUBLE: 5766 evcompresspd(dst, mask, src, merge, vec_enc); 5767 break; 5768 default: 5769 fatal("Unsupported type %s", type2name(bt)); 5770 break; 5771 } 5772 } else { 5773 assert(opcode == Op_ExpandV, ""); 5774 switch(bt) { 5775 case T_BYTE: 5776 evpexpandb(dst, mask, src, merge, vec_enc); 5777 break; 5778 case T_CHAR: 5779 case T_SHORT: 5780 evpexpandw(dst, mask, src, merge, vec_enc); 5781 break; 5782 case T_INT: 5783 evpexpandd(dst, mask, src, merge, vec_enc); 5784 break; 5785 case T_FLOAT: 5786 evexpandps(dst, mask, src, merge, vec_enc); 5787 break; 5788 case T_LONG: 5789 evpexpandq(dst, mask, src, merge, vec_enc); 5790 break; 5791 case T_DOUBLE: 5792 evexpandpd(dst, mask, src, merge, vec_enc); 5793 break; 5794 default: 5795 fatal("Unsupported type %s", type2name(bt)); 5796 break; 5797 } 5798 } 5799 } 5800 #endif 5801 5802 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5803 KRegister ktmp1, int vec_enc) { 5804 if (opcode == Op_SignumVD) { 5805 vsubpd(dst, zero, one, vec_enc); 5806 // if src < 0 ? -1 : 1 5807 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5808 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5809 // if src == NaN, -0.0 or 0.0 return src. 5810 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5811 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5812 } else { 5813 assert(opcode == Op_SignumVF, ""); 5814 vsubps(dst, zero, one, vec_enc); 5815 // if src < 0 ? -1 : 1 5816 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5817 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5818 // if src == NaN, -0.0 or 0.0 return src. 5819 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5820 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5821 } 5822 } 5823 5824 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5825 XMMRegister xtmp1, int vec_enc) { 5826 if (opcode == Op_SignumVD) { 5827 vsubpd(dst, zero, one, vec_enc); 5828 // if src < 0 ? -1 : 1 5829 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5830 // if src == NaN, -0.0 or 0.0 return src. 5831 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5832 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5833 } else { 5834 assert(opcode == Op_SignumVF, ""); 5835 vsubps(dst, zero, one, vec_enc); 5836 // if src < 0 ? -1 : 1 5837 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5838 // if src == NaN, -0.0 or 0.0 return src. 
vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  }
}

//
// Following is a lookup table based popcount computation algorithm:-
//           Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
// a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of vector lane by 4 positions.
// c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute sum of absolute difference of bitset
//    count of all the bytes of a quadword.
// f. Perform step e. for upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.
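//
// A scalar sketch of steps a.-d. for a single byte b (illustration only; the vector
// code below performs the same lookups with vpshufb):
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   int popcount_byte = lut[b & 0x0F] + lut[b >> 4];
// Steps e.-g. then combine the per-byte counts into int/long lane totals
// (see vector_popcount_int/long below).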
5921 5922 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5923 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5924 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5925 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5926 vpsrlw(dst, src, 4, vec_enc); 5927 vpand(dst, dst, xtmp1, vec_enc); 5928 vpand(xtmp1, src, xtmp1, vec_enc); 5929 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5930 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5931 vpshufb(dst, xtmp2, dst, vec_enc); 5932 vpaddb(dst, dst, xtmp1, vec_enc); 5933 } 5934 5935 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5936 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5937 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5938 // Following code is as per steps e,f,g and h of above algorithm. 5939 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5940 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5941 vpsadbw(dst, dst, xtmp2, vec_enc); 5942 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5943 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5944 vpackuswb(dst, xtmp1, dst, vec_enc); 5945 } 5946 5947 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5948 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5949 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5950 // Add the popcount of upper and lower bytes of word. 5951 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5952 vpsrlw(dst, xtmp1, 8, vec_enc); 5953 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5954 vpaddw(dst, dst, xtmp1, vec_enc); 5955 } 5956 5957 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5958 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5959 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5960 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5961 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5962 } 5963 5964 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5965 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5966 switch(bt) { 5967 case T_LONG: 5968 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5969 break; 5970 case T_INT: 5971 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5972 break; 5973 case T_CHAR: 5974 case T_SHORT: 5975 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5976 break; 5977 case T_BYTE: 5978 case T_BOOLEAN: 5979 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5980 break; 5981 default: 5982 fatal("Unsupported type %s", type2name(bt)); 5983 break; 5984 } 5985 } 5986 5987 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5988 KRegister mask, bool merge, int vec_enc) { 5989 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5990 switch(bt) { 5991 case T_LONG: 5992 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5993 evpopcntq(dst, mask, src, merge, vec_enc); 5994 break; 5995 case T_INT: 5996 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5997 evpopcntd(dst, mask, src, merge, vec_enc); 5998 break; 5999 case T_CHAR: 6000 case T_SHORT: 6001 assert(VM_Version::supports_avx512_bitalg(), ""); 6002 evpopcntw(dst, mask, src, merge, vec_enc); 6003 break; 6004 case T_BYTE: 6005 case T_BOOLEAN: 6006 assert(VM_Version::supports_avx512_bitalg(), ""); 6007 evpopcntb(dst, mask, 
src, merge, vec_enc); 6008 break; 6009 default: 6010 fatal("Unsupported type %s", type2name(bt)); 6011 break; 6012 } 6013 } 6014 6015 #ifndef _LP64 6016 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 6017 assert(VM_Version::supports_avx512bw(), ""); 6018 kmovdl(tmp, src); 6019 kunpckdql(dst, tmp, tmp); 6020 } 6021 #endif 6022 6023 // Bit reversal algorithm first reverses the bits of each byte followed by 6024 // a byte level reversal for multi-byte primitive types (short/int/long). 6025 // Algorithm performs a lookup table access to get reverse bit sequence 6026 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6027 // is obtained by swapping the reverse bit sequences of upper and lower 6028 // nibble of a byte. 6029 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6030 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6031 if (VM_Version::supports_avx512vlbw()) { 6032 6033 // Get the reverse bit sequence of lower nibble of each byte. 6034 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6035 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6036 evpandq(dst, xtmp2, src, vec_enc); 6037 vpshufb(dst, xtmp1, dst, vec_enc); 6038 vpsllq(dst, dst, 4, vec_enc); 6039 6040 // Get the reverse bit sequence of upper nibble of each byte. 6041 vpandn(xtmp2, xtmp2, src, vec_enc); 6042 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6043 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6044 6045 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6046 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6047 evporq(xtmp2, dst, xtmp2, vec_enc); 6048 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6049 6050 } else if(vec_enc == Assembler::AVX_512bit) { 6051 // Shift based bit reversal. 6052 assert(bt == T_LONG || bt == T_INT, ""); 6053 6054 // Swap lower and upper nibble of each byte. 6055 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6056 6057 // Swap two least and most significant bits of each nibble. 6058 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6059 6060 // Swap adjacent pair of bits. 6061 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6062 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6063 6064 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6065 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6066 } else { 6067 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6068 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6069 6070 // Get the reverse bit sequence of lower nibble of each byte. 6071 vpand(dst, xtmp2, src, vec_enc); 6072 vpshufb(dst, xtmp1, dst, vec_enc); 6073 vpsllq(dst, dst, 4, vec_enc); 6074 6075 // Get the reverse bit sequence of upper nibble of each byte. 6076 vpandn(xtmp2, xtmp2, src, vec_enc); 6077 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6078 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6079 6080 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6081 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
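  // For illustration, byte 0b11010010: its low nibble 0010 reverses to 0100 and is
  // shifted left by 4 -> 0100_0000; its high nibble 1101 reverses to 1011 and lands
  // in the low nibble -> 0000_1011; OR-ing the two gives 0b01001011, the bit
  // reversal of the original byte.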
6082 vpor(xtmp2, dst, xtmp2, vec_enc); 6083 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6084 } 6085 } 6086 6087 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6088 XMMRegister xtmp, Register rscratch) { 6089 assert(VM_Version::supports_gfni(), ""); 6090 assert(rscratch != noreg || always_reachable(mask), "missing"); 6091 6092 // Galois field instruction based bit reversal based on following algorithm. 6093 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6094 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6095 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6096 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6097 } 6098 6099 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6100 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6101 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6102 evpandq(dst, xtmp1, src, vec_enc); 6103 vpsllq(dst, dst, nbits, vec_enc); 6104 vpandn(xtmp1, xtmp1, src, vec_enc); 6105 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6106 evporq(dst, dst, xtmp1, vec_enc); 6107 } 6108 6109 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6110 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6111 // Shift based bit reversal. 6112 assert(VM_Version::supports_evex(), ""); 6113 switch(bt) { 6114 case T_LONG: 6115 // Swap upper and lower double word of each quad word. 6116 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6117 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6118 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6119 break; 6120 case T_INT: 6121 // Swap upper and lower word of each double word. 6122 evprord(xtmp1, k0, src, 16, true, vec_enc); 6123 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6124 break; 6125 case T_CHAR: 6126 case T_SHORT: 6127 // Swap upper and lower byte of each word. 6128 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6129 break; 6130 case T_BYTE: 6131 evmovdquq(dst, k0, src, true, vec_enc); 6132 break; 6133 default: 6134 fatal("Unsupported type %s", type2name(bt)); 6135 break; 6136 } 6137 } 6138 6139 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6140 if (bt == T_BYTE) { 6141 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6142 evmovdquq(dst, k0, src, true, vec_enc); 6143 } else { 6144 vmovdqu(dst, src); 6145 } 6146 return; 6147 } 6148 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6149 // pre-computed shuffle indices. 
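  // For illustration: the T_INT mask conceptually holds byte indices
  // {3,2,1,0, 7,6,5,4, ...}, so reversing the bytes of the int 0x11223344 yields
  // 0x44332211; T_SHORT and T_LONG use the analogous 2- and 8-byte patterns.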
6150 switch(bt) { 6151 case T_LONG: 6152 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6153 break; 6154 case T_INT: 6155 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6156 break; 6157 case T_CHAR: 6158 case T_SHORT: 6159 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6160 break; 6161 default: 6162 fatal("Unsupported type %s", type2name(bt)); 6163 break; 6164 } 6165 vpshufb(dst, src, dst, vec_enc); 6166 } 6167 6168 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6169 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6170 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6171 assert(is_integral_type(bt), ""); 6172 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6173 assert(VM_Version::supports_avx512cd(), ""); 6174 switch(bt) { 6175 case T_LONG: 6176 evplzcntq(dst, ktmp, src, merge, vec_enc); 6177 break; 6178 case T_INT: 6179 evplzcntd(dst, ktmp, src, merge, vec_enc); 6180 break; 6181 case T_SHORT: 6182 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6183 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6184 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6185 vpunpckhwd(dst, xtmp1, src, vec_enc); 6186 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6187 vpackusdw(dst, xtmp2, dst, vec_enc); 6188 break; 6189 case T_BYTE: 6190 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6191 // accessing the lookup table. 6192 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6193 // accessing the lookup table. 6194 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6195 assert(VM_Version::supports_avx512bw(), ""); 6196 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6197 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6198 vpand(xtmp2, dst, src, vec_enc); 6199 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6200 vpsrlw(xtmp3, src, 4, vec_enc); 6201 vpand(xtmp3, dst, xtmp3, vec_enc); 6202 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6203 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6204 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6205 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6206 break; 6207 default: 6208 fatal("Unsupported type %s", type2name(bt)); 6209 break; 6210 } 6211 } 6212 6213 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6214 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6215 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6216 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6217 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6218 // accessing the lookup table. 6219 vpand(dst, xtmp2, src, vec_enc); 6220 vpshufb(dst, xtmp1, dst, vec_enc); 6221 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6222 // accessing the lookup table. 6223 vpsrlw(xtmp3, src, 4, vec_enc); 6224 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6225 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6226 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
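  // For illustration with the nibble lookup table (0 -> 4, 1 -> 3, 2..3 -> 2, 4..7 -> 1, 8..15 -> 0):
  //   byte 0b00010110: high nibble 0001 -> T2 = 3 and is non-zero, so lzcnt = 3
  //   byte 0b00000110: high nibble 0000 -> T2 = 4, low nibble 0110 -> T1 = 1, lzcnt = 4 + 1 = 5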
6227 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6228 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6229 vpaddb(dst, dst, xtmp2, vec_enc); 6230 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6231 } 6232 6233 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6234 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6235 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6236 // Add zero counts of lower byte and upper byte of a word if 6237 // upper byte holds a zero value. 6238 vpsrlw(xtmp3, src, 8, vec_enc); 6239 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6240 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6241 vpsllw(xtmp2, dst, 8, vec_enc); 6242 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6243 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6244 vpsrlw(dst, dst, 8, vec_enc); 6245 } 6246 6247 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6248 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6249 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6250 // hence biased exponent can be used to compute leading zero count as per 6251 // following formula:- 6252 // LZCNT = 31 - (biased_exp - 127) 6253 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6254 6255 // Broadcast 0xFF 6256 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6257 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6258 6259 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6260 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6261 // contributes to the leading number of zeros. 6262 vpsrld(xtmp2, src, 1, vec_enc); 6263 vpandn(xtmp3, xtmp2, src, vec_enc); 6264 6265 // Extract biased exponent. 6266 vcvtdq2ps(dst, xtmp3, vec_enc); 6267 vpsrld(dst, dst, 23, vec_enc); 6268 vpand(dst, dst, xtmp1, vec_enc); 6269 6270 // Broadcast 127. 6271 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6272 // Exponent = biased_exp - 127 6273 vpsubd(dst, dst, xtmp1, vec_enc); 6274 6275 // Exponent_plus_one = Exponent + 1 6276 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6277 vpaddd(dst, dst, xtmp3, vec_enc); 6278 6279 // Replace -ve exponent with zero, exponent is -ve when src 6280 // lane contains a zero value. 6281 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6282 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6283 6284 // Rematerialize broadcast 32. 6285 vpslld(xtmp1, xtmp3, 5, vec_enc); 6286 // Exponent is 32 if corresponding source lane contains max_int value. 6287 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6288 // LZCNT = 32 - exponent_plus_one 6289 vpsubd(dst, xtmp1, dst, vec_enc); 6290 6291 // Replace LZCNT with a value 1 if corresponding source lane 6292 // contains max_int value. 6293 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6294 6295 // Replace biased_exp with 0 if source lane value is less than zero. 6296 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6297 vblendvps(dst, dst, xtmp2, src, vec_enc); 6298 } 6299 6300 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6301 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6302 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6303 // Add zero counts of lower word and upper word of a double word if 6304 // upper word holds a zero value. 
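// Illustrative scalar model of the first combining step below (not emitted code): given
// per-half leading-zero counts packed as (clz_hi << 16) | clz_lo for a 32-bit value x,
// the blend keeps clz_hi when the upper half of x is non-zero and otherwise uses
// clz_hi + clz_lo (clz_hi is then 16):
//   int combine_clz16_to_clz32(uint32_t x, int clz_hi, int clz_lo) {
//     return (x >> 16) == 0 ? clz_hi + clz_lo : clz_hi;
//   }
// The same pattern is applied once more to widen the 32-bit counts to 64 bits.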
6305 vpsrld(xtmp3, src, 16, vec_enc); 6306 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6307 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6308 vpslld(xtmp2, dst, 16, vec_enc); 6309 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6310 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6311 vpsrld(dst, dst, 16, vec_enc); 6312 // Add zero counts of lower doubleword and upper doubleword of a 6313 // quadword if upper doubleword holds a zero value. 6314 vpsrlq(xtmp3, src, 32, vec_enc); 6315 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6316 vpsllq(xtmp2, dst, 32, vec_enc); 6317 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6318 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6319 vpsrlq(dst, dst, 32, vec_enc); 6320 } 6321 6322 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6323 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6324 Register rtmp, int vec_enc) { 6325 assert(is_integral_type(bt), "unexpected type"); 6326 assert(vec_enc < Assembler::AVX_512bit, ""); 6327 switch(bt) { 6328 case T_LONG: 6329 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6330 break; 6331 case T_INT: 6332 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6333 break; 6334 case T_SHORT: 6335 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6336 break; 6337 case T_BYTE: 6338 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6339 break; 6340 default: 6341 fatal("Unsupported type %s", type2name(bt)); 6342 break; 6343 } 6344 } 6345 6346 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6347 switch(bt) { 6348 case T_BYTE: 6349 vpsubb(dst, src1, src2, vec_enc); 6350 break; 6351 case T_SHORT: 6352 vpsubw(dst, src1, src2, vec_enc); 6353 break; 6354 case T_INT: 6355 vpsubd(dst, src1, src2, vec_enc); 6356 break; 6357 case T_LONG: 6358 vpsubq(dst, src1, src2, vec_enc); 6359 break; 6360 default: 6361 fatal("Unsupported type %s", type2name(bt)); 6362 break; 6363 } 6364 } 6365 6366 // Trailing zero count computation is based on leading zero count operation as per 6367 // following equation. All AVX3 targets support AVX512CD feature which offers 6368 // direct vector instruction to compute leading zero count. 
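// A worked scalar example of the identity stated below (32-bit lanes assumed,
// illustrative only): (x - 1) & ~x turns exactly the bits below the lowest set bit of x
// into ones, e.g. x = 0b1000 gives 0b0111 with CLZ = 29, hence CTZ = 32 - 29 = 3, while
// x = 0 gives all ones with CLZ = 0, hence CTZ = 32.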
6369 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 6370 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6371 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6372 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6373 assert(is_integral_type(bt), ""); 6374 // xtmp = -1 6375 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6376 // xtmp = xtmp + src 6377 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6378 // xtmp = xtmp & ~src 6379 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6380 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6381 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6382 vpsub(bt, dst, xtmp4, dst, vec_enc); 6383 } 6384 6385 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation 6386 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 6387 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6388 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6389 assert(is_integral_type(bt), ""); 6390 // xtmp = 0 6391 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 6392 // xtmp = 0 - src 6393 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6394 // xtmp = xtmp | src 6395 vpor(xtmp3, xtmp3, src, vec_enc); 6396 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6397 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6398 vpsub(bt, dst, xtmp1, dst, vec_enc); 6399 } 6400 6401 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6402 Label done; 6403 Label neg_divisor_fastpath; 6404 cmpl(divisor, 0); 6405 jccb(Assembler::less, neg_divisor_fastpath); 6406 xorl(rdx, rdx); 6407 divl(divisor); 6408 jmpb(done); 6409 bind(neg_divisor_fastpath); 6410 // Fastpath for divisor < 0: 6411 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6412 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6413 movl(rdx, rax); 6414 subl(rdx, divisor); 6415 if (VM_Version::supports_bmi1()) { 6416 andnl(rax, rdx, rax); 6417 } else { 6418 notl(rdx); 6419 andl(rax, rdx); 6420 } 6421 shrl(rax, 31); 6422 bind(done); 6423 } 6424 6425 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6426 Label done; 6427 Label neg_divisor_fastpath; 6428 cmpl(divisor, 0); 6429 jccb(Assembler::less, neg_divisor_fastpath); 6430 xorl(rdx, rdx); 6431 divl(divisor); 6432 jmpb(done); 6433 bind(neg_divisor_fastpath); 6434 // Fastpath when divisor < 0: 6435 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6436 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6437 movl(rdx, rax); 6438 subl(rax, divisor); 6439 if (VM_Version::supports_bmi1()) { 6440 andnl(rax, rax, rdx); 6441 } else { 6442 notl(rax); 6443 andl(rax, rdx); 6444 } 6445 sarl(rax, 31); 6446 andl(rax, divisor); 6447 subl(rdx, rax); 6448 bind(done); 6449 } 6450 6451 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6452 Label done; 6453 Label neg_divisor_fastpath; 6454 6455 cmpl(divisor, 0); 6456 jccb(Assembler::less, neg_divisor_fastpath); 6457 xorl(rdx, rdx); 6458 divl(divisor); 6459 jmpb(done); 6460 bind(neg_divisor_fastpath); 6461 // Fastpath for divisor < 0: 6462 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6463 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6464 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6465 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6466 movl(rdx, rax); 6467 subl(rax, divisor); 6468 if (VM_Version::supports_bmi1()) { 6469 andnl(rax, rax, rdx); 6470 } else { 6471 notl(rax); 6472 andl(rax, rdx); 6473 } 6474 movl(tmp, rax); 6475 shrl(rax, 31); // quotient 6476 sarl(tmp, 31); 6477 andl(tmp, divisor); 6478 subl(rdx, tmp); // remainder 6479 bind(done); 6480 } 6481 6482 #ifdef _LP64 6483 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6484 XMMRegister xtmp2, Register rtmp) { 6485 if(VM_Version::supports_gfni()) { 6486 // Galois field instruction based bit reversal based on following algorithm. 6487 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6488 mov64(rtmp, 0x8040201008040201L); 6489 movq(xtmp1, src); 6490 movq(xtmp2, rtmp); 6491 gf2p8affineqb(xtmp1, xtmp2, 0); 6492 movq(dst, xtmp1); 6493 } else { 6494 // Swap even and odd numbered bits. 6495 movl(rtmp, src); 6496 andl(rtmp, 0x55555555); 6497 shll(rtmp, 1); 6498 movl(dst, src); 6499 andl(dst, 0xAAAAAAAA); 6500 shrl(dst, 1); 6501 orl(dst, rtmp); 6502 6503 // Swap LSB and MSB 2 bits of each nibble. 6504 movl(rtmp, dst); 6505 andl(rtmp, 0x33333333); 6506 shll(rtmp, 2); 6507 andl(dst, 0xCCCCCCCC); 6508 shrl(dst, 2); 6509 orl(dst, rtmp); 6510 6511 // Swap LSB and MSB 4 bits of each byte. 6512 movl(rtmp, dst); 6513 andl(rtmp, 0x0F0F0F0F); 6514 shll(rtmp, 4); 6515 andl(dst, 0xF0F0F0F0); 6516 shrl(dst, 4); 6517 orl(dst, rtmp); 6518 } 6519 bswapl(dst); 6520 } 6521 6522 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6523 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6524 if(VM_Version::supports_gfni()) { 6525 // Galois field instruction based bit reversal based on following algorithm. 6526 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6527 mov64(rtmp1, 0x8040201008040201L); 6528 movq(xtmp1, src); 6529 movq(xtmp2, rtmp1); 6530 gf2p8affineqb(xtmp1, xtmp2, 0); 6531 movq(dst, xtmp1); 6532 } else { 6533 // Swap even and odd numbered bits. 6534 movq(rtmp1, src); 6535 mov64(rtmp2, 0x5555555555555555L); 6536 andq(rtmp1, rtmp2); 6537 shlq(rtmp1, 1); 6538 movq(dst, src); 6539 notq(rtmp2); 6540 andq(dst, rtmp2); 6541 shrq(dst, 1); 6542 orq(dst, rtmp1); 6543 6544 // Swap LSB and MSB 2 bits of each nibble. 6545 movq(rtmp1, dst); 6546 mov64(rtmp2, 0x3333333333333333L); 6547 andq(rtmp1, rtmp2); 6548 shlq(rtmp1, 2); 6549 notq(rtmp2); 6550 andq(dst, rtmp2); 6551 shrq(dst, 2); 6552 orq(dst, rtmp1); 6553 6554 // Swap LSB and MSB 4 bits of each byte. 
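// (Worked example for this step, using a hypothetical byte value 0xB4 = 0b1011'0100:
//  exchanging its nibbles yields 0x4B = 0b0100'1011. Together with the two preceding
//  swaps and the final bswapq, every bit of the 64-bit value ends up mirrored.)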
6555 movq(rtmp1, dst); 6556 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6557 andq(rtmp1, rtmp2); 6558 shlq(rtmp1, 4); 6559 notq(rtmp2); 6560 andq(dst, rtmp2); 6561 shrq(dst, 4); 6562 orq(dst, rtmp1); 6563 } 6564 bswapq(dst); 6565 } 6566 6567 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6568 Label done; 6569 Label neg_divisor_fastpath; 6570 cmpq(divisor, 0); 6571 jccb(Assembler::less, neg_divisor_fastpath); 6572 xorl(rdx, rdx); 6573 divq(divisor); 6574 jmpb(done); 6575 bind(neg_divisor_fastpath); 6576 // Fastpath for divisor < 0: 6577 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6578 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6579 movq(rdx, rax); 6580 subq(rdx, divisor); 6581 if (VM_Version::supports_bmi1()) { 6582 andnq(rax, rdx, rax); 6583 } else { 6584 notq(rdx); 6585 andq(rax, rdx); 6586 } 6587 shrq(rax, 63); 6588 bind(done); 6589 } 6590 6591 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6592 Label done; 6593 Label neg_divisor_fastpath; 6594 cmpq(divisor, 0); 6595 jccb(Assembler::less, neg_divisor_fastpath); 6596 xorq(rdx, rdx); 6597 divq(divisor); 6598 jmp(done); 6599 bind(neg_divisor_fastpath); 6600 // Fastpath when divisor < 0: 6601 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6602 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6603 movq(rdx, rax); 6604 subq(rax, divisor); 6605 if (VM_Version::supports_bmi1()) { 6606 andnq(rax, rax, rdx); 6607 } else { 6608 notq(rax); 6609 andq(rax, rdx); 6610 } 6611 sarq(rax, 63); 6612 andq(rax, divisor); 6613 subq(rdx, rax); 6614 bind(done); 6615 } 6616 6617 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6618 Label done; 6619 Label neg_divisor_fastpath; 6620 cmpq(divisor, 0); 6621 jccb(Assembler::less, neg_divisor_fastpath); 6622 xorq(rdx, rdx); 6623 divq(divisor); 6624 jmp(done); 6625 bind(neg_divisor_fastpath); 6626 // Fastpath for divisor < 0: 6627 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6628 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6629 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6630 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6631 movq(rdx, rax); 6632 subq(rax, divisor); 6633 if (VM_Version::supports_bmi1()) { 6634 andnq(rax, rax, rdx); 6635 } else { 6636 notq(rax); 6637 andq(rax, rdx); 6638 } 6639 movq(tmp, rax); 6640 shrq(rax, 63); // quotient 6641 sarq(tmp, 63); 6642 andq(tmp, divisor); 6643 subq(rdx, tmp); // remainder 6644 bind(done); 6645 } 6646 #endif 6647 6648 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6649 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6650 int vlen_enc) { 6651 assert(VM_Version::supports_avx512bw(), ""); 6652 // Byte shuffles are inlane operations and indices are determined using 6653 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6654 // normalized to index range 0-15. This makes sure that all the multiples 6655 // of an index value are placed at same relative position in 128 bit 6656 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6657 // will be 16th element in their respective 128 bit lanes. 
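// A scalar sketch of the intended result for in-range shuffle indices (0..63;
// illustrative only, not the emitted code):
//   for (int i = 0; i < 64; i++) {
//     int lane = shuffle[i] >> 4;     // selects one of the four 128-bit source lanes
//     int pos  = shuffle[i] & 0x0F;   // vpshufb only consumes the low 4 index bits
//     dst[i] = src[lane * 16 + pos];  // == src[shuffle[i]]
//   }
// Each of the four masked evpshufb passes below handles one value of 'lane'.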
6658 movl(rtmp, 16); 6659 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6660 6661 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6662 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6663 // original shuffle indices and move the shuffled lanes corresponding to true 6664 // mask to destination vector. 6665 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6666 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6667 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6668 6669 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6670 // and broadcasting second 128 bit lane. 6671 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6672 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6673 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6674 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6675 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6676 6677 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6678 // and broadcasting third 128 bit lane. 6679 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6680 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6681 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6682 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6683 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6684 6685 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6686 // and broadcasting fourth 128 bit lane. 6687 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6688 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6689 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6690 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6691 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6692 } 6693 6694 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6695 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6696 if (vlen_enc == AVX_128bit) { 6697 vpermilps(dst, src, shuffle, vlen_enc); 6698 } else if (bt == T_INT) { 6699 vpermd(dst, shuffle, src, vlen_enc); 6700 } else { 6701 assert(bt == T_FLOAT, ""); 6702 vpermps(dst, shuffle, src, vlen_enc); 6703 } 6704 } 6705 6706 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6707 switch(opcode) { 6708 case Op_AddHF: vaddsh(dst, src1, src2); break; 6709 case Op_SubHF: vsubsh(dst, src1, src2); break; 6710 case Op_MulHF: vmulsh(dst, src1, src2); break; 6711 case Op_DivHF: vdivsh(dst, src1, src2); break; 6712 case Op_MaxHF: vmaxsh(dst, src1, src2); break; 6713 case Op_MinHF: vminsh(dst, src1, src2); break; 6714 default: assert(false, "%s", NodeClassNames[opcode]); break; 6715 } 6716 } 6717 6718 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6719 switch(elem_bt) { 6720 case T_BYTE: 6721 if (ideal_opc == Op_SaturatingAddV) { 6722 vpaddsb(dst, src1, src2, vlen_enc); 6723 } else { 6724 assert(ideal_opc == Op_SaturatingSubV, ""); 6725 vpsubsb(dst, src1, src2, vlen_enc); 6726 } 6727 break; 6728 case T_SHORT: 6729 if (ideal_opc == Op_SaturatingAddV) { 6730 vpaddsw(dst, src1, src2, vlen_enc); 6731 } else { 6732 assert(ideal_opc == Op_SaturatingSubV, ""); 6733 vpsubsw(dst, src1, src2, vlen_enc); 6734 } 6735 break; 6736 default: 6737 fatal("Unsupported type %s", type2name(elem_bt)); 6738 break; 6739 } 6740 } 6741 6742 void
C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6743 switch(elem_bt) { 6744 case T_BYTE: 6745 if (ideal_opc == Op_SaturatingAddV) { 6746 vpaddusb(dst, src1, src2, vlen_enc); 6747 } else { 6748 assert(ideal_opc == Op_SaturatingSubV, ""); 6749 vpsubusb(dst, src1, src2, vlen_enc); 6750 } 6751 break; 6752 case T_SHORT: 6753 if (ideal_opc == Op_SaturatingAddV) { 6754 vpaddusw(dst, src1, src2, vlen_enc); 6755 } else { 6756 assert(ideal_opc == Op_SaturatingSubV, ""); 6757 vpsubusw(dst, src1, src2, vlen_enc); 6758 } 6759 break; 6760 default: 6761 fatal("Unsupported type %s", type2name(elem_bt)); 6762 break; 6763 } 6764 } 6765 6766 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6767 XMMRegister src2, KRegister ktmp, int vlen_enc) { 6768 // For unsigned subtraction, overflow happens when the second input is greater than the first input. 6769 // overflow_mask = Inp1 <u Inp2 6770 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc); 6771 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative) 6772 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false); 6773 } 6774 6775 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6776 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6777 // Emulate unsigned comparison using signed comparison 6778 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE 6779 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true); 6780 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc); 6781 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc); 6782 6783 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc); 6784 6785 // Res = INP1 - INP2 (non-commutative and non-associative) 6786 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6787 // Res = Mask ? Zero : Res 6788 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); 6789 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc); 6790 } 6791 6792 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6793 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) { 6794 // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists. 6795 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2) 6796 // Res = Signed Add INP1, INP2 6797 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6798 // T1 = SRC1 | SRC2 6799 vpor(xtmp1, src1, src2, vlen_enc); 6800 // Max_Unsigned = -1 6801 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6802 // Unsigned compare: Mask = Res <u T1 6803 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc); 6804 // res = Mask ? Max_Unsigned : Res 6805 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc); 6806 } 6807 6808 // 6809 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating 6810 // unsigned addition operation. 6811 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1 6812 // 6813 // We empirically determined its semantic equivalence to the following reduced expression 6814 // overflow_mask = (a + b) <u (a | b) 6815 // 6816 // and also verified it through the Alive2 solver.
6817 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6818 // 6819 6820 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6821 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6822 // Res = Signed Add INP1, INP2 6823 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6824 // Compute T1 = INP1 | INP2 6825 vpor(xtmp3, src1, src2, vlen_enc); 6826 // T1 = Minimum signed value. 6827 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6828 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6829 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6830 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6831 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6832 // Compute overflow detection mask = Res<1> <s T1 6833 if (elem_bt == T_INT) { 6834 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6835 } else { 6836 assert(elem_bt == T_LONG, ""); 6837 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6838 } 6839 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6840 } 6841 6842 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6843 int vlen_enc, bool xtmp2_hold_M1) { 6844 if (VM_Version::supports_avx512dq()) { 6845 evpmovq2m(ktmp, src, vlen_enc); 6846 } else { 6847 assert(VM_Version::supports_evex(), ""); 6848 if (!xtmp2_hold_M1) { 6849 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6850 } 6851 evpsraq(xtmp1, src, 63, vlen_enc); 6852 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6853 } 6854 } 6855 6856 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6857 int vlen_enc, bool xtmp2_hold_M1) { 6858 if (VM_Version::supports_avx512dq()) { 6859 evpmovd2m(ktmp, src, vlen_enc); 6860 } else { 6861 assert(VM_Version::supports_evex(), ""); 6862 if (!xtmp2_hold_M1) { 6863 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6864 } 6865 vpsrad(xtmp1, src, 31, vlen_enc); 6866 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6867 } 6868 } 6869 6870 6871 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6872 if (elem_bt == T_LONG) { 6873 if (VM_Version::supports_evex()) { 6874 evpsraq(dst, src, 63, vlen_enc); 6875 } else { 6876 vpsrad(dst, src, 31, vlen_enc); 6877 vpshufd(dst, dst, 0xF5, vlen_enc); 6878 } 6879 } else { 6880 assert(elem_bt == T_INT, ""); 6881 vpsrad(dst, src, 31, vlen_enc); 6882 } 6883 } 6884 6885 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6886 if (compute_allones) { 6887 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6888 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6889 } else { 6890 vpcmpeqq(allones, allones, allones, vlen_enc); 6891 } 6892 } 6893 if (elem_bt == T_LONG) { 6894 vpsrlq(dst, allones, 1, vlen_enc); 6895 } else { 6896 assert(elem_bt == T_INT, ""); 6897 vpsrld(dst, allones, 1, vlen_enc); 6898 } 6899 } 6900 6901 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6902 if (compute_allones) { 6903 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6904 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6905 } else { 6906 vpcmpeqq(allones, allones, allones, vlen_enc); 6907 } 6908 } 6909 if (elem_bt == T_LONG) { 6910 vpsllq(dst, allones, 63, vlen_enc); 6911 } else { 6912 assert(elem_bt == T_INT, ""); 6913 
vpslld(dst, allones, 31, vlen_enc); 6914 } 6915 } 6916 6917 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6918 Assembler::ComparisonPredicate cond, int vlen_enc) { 6919 switch(elem_bt) { 6920 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6921 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6922 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6923 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6924 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6925 } 6926 } 6927 6928 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6929 switch(elem_bt) { 6930 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break; 6931 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break; 6932 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break; 6933 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break; 6934 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6935 } 6936 } 6937 6938 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1, 6939 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) { 6940 if (elem_bt == T_LONG) { 6941 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6942 } else { 6943 assert(elem_bt == T_INT, ""); 6944 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6945 } 6946 } 6947 6948 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6949 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6950 KRegister ktmp1, KRegister ktmp2, int vlen_enc) { 6951 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6952 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness. 6953 // Overflow detection based on Hacker's Delight section 2-13. 6954 if (ideal_opc == Op_SaturatingAddV) { 6955 // res = src1 + src2 6956 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6957 // Overflow occurs if the result polarity does not match the common polarity of the inputs. 6958 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6959 vpxor(xtmp1, dst, src1, vlen_enc); 6960 vpxor(xtmp2, dst, src2, vlen_enc); 6961 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6962 } else { 6963 assert(ideal_opc == Op_SaturatingSubV, ""); 6964 // res = src1 - src2 6965 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6966 // Overflow occurs when the inputs have opposite polarity and 6967 // the result polarity does not match the first input's polarity. 6968 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6969 vpxor(xtmp1, src1, src2, vlen_enc); 6970 vpxor(xtmp2, dst, src1, vlen_enc); 6971 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6972 } 6973 6974 // Compute overflow detection mask. 6975 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc); 6976 // Note: xtmp1 holds -1 in all its lanes after the above call. 6977 6978 // Compute mask based on first input polarity. 6979 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true); 6980 6981 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6982 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6983 6984 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to 6985 // set bits in the first input polarity mask hold the min value.
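// (Illustration, assuming T_INT lanes: 0x7FFFFFFF + 1 overflows with src1 >= 0, so its
//  lane saturates to MAX_VALUE 0x7FFFFFFF; 0x80000000 + (-1) overflows with src1 < 0,
//  so its lane saturates to MIN_VALUE 0x80000000.)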
6986 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc); 6987 // Blend destination lanes with saturated values using overflow detection mask. 6988 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc); 6989 } 6990 6991 6992 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6993 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6994 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) { 6995 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6996 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness. 6997 // Overflow detection based on Hacker's Delight section 2-13. 6998 if (ideal_opc == Op_SaturatingAddV) { 6999 // res = src1 + src2 7000 vpadd(elem_bt, dst, src1, src2, vlen_enc); 7001 // Overflow occurs if the result polarity does not match the common polarity of the inputs. 7002 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 7003 vpxor(xtmp1, dst, src1, vlen_enc); 7004 vpxor(xtmp2, dst, src2, vlen_enc); 7005 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 7006 } else { 7007 assert(ideal_opc == Op_SaturatingSubV, ""); 7008 // res = src1 - src2 7009 vpsub(elem_bt, dst, src1, src2, vlen_enc); 7010 // Overflow occurs when the inputs have opposite polarity and 7011 // the result polarity does not match the first input's polarity. 7012 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 7013 vpxor(xtmp1, src1, src2, vlen_enc); 7014 vpxor(xtmp2, dst, src1, vlen_enc); 7015 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 7016 } 7017 7018 // Sign-extend to compute overflow detection mask. 7019 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc); 7020 7021 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc); 7022 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc); 7023 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 7024 7025 // Compose saturating min/max vector using first input polarity mask. 7026 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc); 7027 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc); 7028 7029 // Blend result with saturating vector using overflow detection mask.
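// (vpblendvb picks bytes from xtmp1 wherever the MSB of the corresponding mask byte in
//  xtmp3 is set; since xtmp3 was produced by per-lane sign extension it is all ones or
//  all zeros per lane, so whole lanes are replaced.)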
7030 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7031 } 7032 7033 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7034 switch(elem_bt) { 7035 case T_BYTE: 7036 if (ideal_opc == Op_SaturatingAddV) { 7037 vpaddsb(dst, src1, src2, vlen_enc); 7038 } else { 7039 assert(ideal_opc == Op_SaturatingSubV, ""); 7040 vpsubsb(dst, src1, src2, vlen_enc); 7041 } 7042 break; 7043 case T_SHORT: 7044 if (ideal_opc == Op_SaturatingAddV) { 7045 vpaddsw(dst, src1, src2, vlen_enc); 7046 } else { 7047 assert(ideal_opc == Op_SaturatingSubV, ""); 7048 vpsubsw(dst, src1, src2, vlen_enc); 7049 } 7050 break; 7051 default: 7052 fatal("Unsupported type %s", type2name(elem_bt)); 7053 break; 7054 } 7055 } 7056 7057 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7058 switch(elem_bt) { 7059 case T_BYTE: 7060 if (ideal_opc == Op_SaturatingAddV) { 7061 vpaddusb(dst, src1, src2, vlen_enc); 7062 } else { 7063 assert(ideal_opc == Op_SaturatingSubV, ""); 7064 vpsubusb(dst, src1, src2, vlen_enc); 7065 } 7066 break; 7067 case T_SHORT: 7068 if (ideal_opc == Op_SaturatingAddV) { 7069 vpaddusw(dst, src1, src2, vlen_enc); 7070 } else { 7071 assert(ideal_opc == Op_SaturatingSubV, ""); 7072 vpsubusw(dst, src1, src2, vlen_enc); 7073 } 7074 break; 7075 default: 7076 fatal("Unsupported type %s", type2name(elem_bt)); 7077 break; 7078 } 7079 } 7080 7081 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7082 XMMRegister src2, int vlen_enc) { 7083 switch(elem_bt) { 7084 case T_BYTE: 7085 evpermi2b(dst, src1, src2, vlen_enc); 7086 break; 7087 case T_SHORT: 7088 evpermi2w(dst, src1, src2, vlen_enc); 7089 break; 7090 case T_INT: 7091 evpermi2d(dst, src1, src2, vlen_enc); 7092 break; 7093 case T_LONG: 7094 evpermi2q(dst, src1, src2, vlen_enc); 7095 break; 7096 case T_FLOAT: 7097 evpermi2ps(dst, src1, src2, vlen_enc); 7098 break; 7099 case T_DOUBLE: 7100 evpermi2pd(dst, src1, src2, vlen_enc); 7101 break; 7102 default: 7103 fatal("Unsupported type %s", type2name(elem_bt)); 7104 break; 7105 } 7106 } 7107 7108 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7109 if (is_unsigned) { 7110 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7111 } else { 7112 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7113 } 7114 } 7115 7116 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7117 if (is_unsigned) { 7118 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7119 } else { 7120 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7121 } 7122 }
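// A minimal scalar reference for the byte saturating forms dispatched above (an
// illustrative sketch only, with clamp bounds assumed from the T_BYTE element type;
// it is not used by the assembler):
//   static inline int sat_add_s8(int a, int b) {
//     int r = a + b;
//     return r > 127 ? 127 : (r < -128 ? -128 : r);   // vpaddsb semantics
//   }
//   static inline int sat_add_u8(int a, int b) {
//     int r = a + b;
//     return r > 255 ? 255 : r;                        // vpaddusb semantics
//   }
// The T_SHORT variants clamp to [-32768, 32767] and [0, 65535] in the same way.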