1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 53 if (C->clinit_barrier_on_entry()) { 54 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 56 57 Label L_skip_barrier; 58 Register klass = rscratch1; 59 60 mov_metadata(klass, C->method()->holder()->constant_encoding()); 61 clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 62 63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 64 65 bind(L_skip_barrier); 66 } 67 68 int framesize = C->output()->frame_size_in_bytes(); 69 int bangsize = C->output()->bang_size_in_bytes(); 70 bool fp_mode_24b = false; 71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 72 73 // WARNING: Initial instruction MUST be 5 bytes or longer so that 74 // NativeJump::patch_verified_entry will be able to patch out the entry 75 // code safely. The push to verify stack depth is ok at 5 bytes, 76 // the frame allocation can be either 3 or 6 bytes. So if we don't do 77 // stack bang then we must use the 6 byte frame allocation even if 78 // we have no frame. :-( 79 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 80 81 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 82 // Remove word for return addr 83 framesize -= wordSize; 84 stack_bang_size -= wordSize; 85 86 // Calls to C2R adapters often do not accept exceptional returns. 
87 // We require that their callers must bang for them. But be careful, because 88 // some VM calls (such as call site linkage) can use several kilobytes of 89 // stack. But the stack safety zone should account for that. 90 // See bugs 4446381, 4468289, 4497237. 91 if (stack_bang_size > 0) { 92 generate_stack_overflow_check(stack_bang_size); 93 94 // We always push rbp, so that on return to interpreter rbp, will be 95 // restored correctly and we can correct the stack. 96 push(rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 mov(rbp, rsp); 100 } 101 // Remove word for ebp 102 framesize -= wordSize; 103 104 // Create frame 105 if (framesize) { 106 subptr(rsp, framesize); 107 } 108 } else { 109 // Create frame (force generation of a 4 byte immediate value) 110 subptr_imm32(rsp, framesize); 111 112 // Save RBP register now. 113 framesize -= wordSize; 114 movptr(Address(rsp, framesize), rbp); 115 // Save caller's stack pointer into RBP if the frame pointer is preserved. 116 if (PreserveFramePointer) { 117 movptr(rbp, rsp); 118 if (framesize > 0) { 119 addptr(rbp, framesize); 120 } 121 } 122 } 123 124 if (C->needs_stack_repair()) { 125 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 126 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 127 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 128 } 129 130 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 131 framesize -= wordSize; 132 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 133 } 134 135 #ifdef ASSERT 136 if (VerifyStackAtCalls) { 137 Label L; 138 push(rax); 139 mov(rax, rsp); 140 andptr(rax, StackAlignmentInBytes-1); 141 cmpptr(rax, StackAlignmentInBytes-wordSize); 142 pop(rax); 143 jcc(Assembler::equal, L); 144 STOP("Stack is not properly aligned!"); 145 bind(L); 146 } 147 #endif 148 } 149 150 void C2_MacroAssembler::entry_barrier() { 151 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 152 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 153 Label dummy_slow_path; 154 Label dummy_continuation; 155 Label* slow_path = &dummy_slow_path; 156 Label* continuation = &dummy_continuation; 157 if (!Compile::current()->output()->in_scratch_emit_size()) { 158 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 159 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 160 Compile::current()->output()->add_stub(stub); 161 slow_path = &stub->entry(); 162 continuation = &stub->continuation(); 163 } 164 bs->nmethod_entry_barrier(this, slow_path, continuation); 165 } 166 167 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 168 switch (vlen_in_bytes) { 169 case 4: // fall-through 170 case 8: // fall-through 171 case 16: return Assembler::AVX_128bit; 172 case 32: return Assembler::AVX_256bit; 173 case 64: return Assembler::AVX_512bit; 174 175 default: { 176 ShouldNotReachHere(); 177 return Assembler::AVX_NoVec; 178 } 179 } 180 } 181 182 // fast_lock and fast_unlock used by C2 183 184 // Because the transitions from emitted code to the runtime 185 // monitorenter/exit helper stubs are so slow it's critical that 186 // we inline both the stack-locking fast path and the inflated fast path. 187 // 188 // See also: cmpFastLock and cmpFastUnlock. 
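//
// At a high level the emitted fast paths behave like the following sketch
// (pseudo code only; the exact register-level sequence is in fast_lock and
// fast_unlock below, and LM_MONITOR simply forces the slow path):
//
//   fast_lock(obj, box):
//     if (mark word is inflated)   -> CAS monitor->_owner from null to this
//                                     thread's owner id
//     else                         -> CAS the on-stack BasicLock (box) address
//                                     into the mark word, or pass the
//                                     sp-proximity test for a recursive
//                                     stack-lock
//   fast_unlock(obj, box):
//     release in the reverse manner, falling back to the runtime when a
//     successor may need to be woken
//
// Either way, ZF == 1 on exit means the fast path succeeded and ZF == 0 routes
// control to the slow path.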
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
    jcc(Assembler::equal, COUNT);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
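  //
  // The acquisition attempt below is, roughly (illustrative pseudo code only,
  // not the emitted instructions):
  //
  //   owner_id = r15_thread->_monitor_owner_id
  //   if (CAS(&monitor->_owner, nullptr, owner_id) succeeds)  -> locked, ZF = 1
  //   else if (observed owner == owner_id)                    -> recursive:
  //     monitor->_recursions++, force ZF = 1
  //   else                                                    -> contended:
  //     fall through with ZF = 0 and take the slow path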
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
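//
// Illustration (Java source level, not code emitted here): for a block like
//
//   synchronized (o) { work(); }
//
// javac emits a monitorenter that is matched by a monitorexit on every path out
// of the block, including the exception handler, so the site is provably
// balanced and eligible for fast_lock/fast_unlock. Lock sites whose balance
// cannot be proven force the method to run in the interpreter, as noted above.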
394 395 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 396 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 397 assert(boxReg == rax, ""); 398 assert_different_registers(objReg, boxReg, tmpReg); 399 400 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 401 402 if (LockingMode == LM_LEGACY) { 403 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 404 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 405 } 406 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 407 if (LockingMode != LM_MONITOR) { 408 testptr(tmpReg, markWord::monitor_value); // Inflated? 409 jcc(Assembler::zero, Stacked); 410 } 411 412 // It's inflated. 413 414 // Despite our balanced locking property we still check that m->_owner == Self 415 // as java routines or native JNI code called by this thread might 416 // have released the lock. 417 // 418 // If there's no contention try a 1-0 exit. That is, exit without 419 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 420 // we detect and recover from the race that the 1-0 exit admits. 421 // 422 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 423 // before it STs null into _owner, releasing the lock. Updates 424 // to data protected by the critical section must be visible before 425 // we drop the lock (and thus before any other thread could acquire 426 // the lock and observe the fields protected by the lock). 427 // IA32's memory-model is SPO, so STs are ordered with respect to 428 // each other and there's no need for an explicit barrier (fence). 429 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 430 Label LSuccess, LNotRecursive; 431 432 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 433 jccb(Assembler::equal, LNotRecursive); 434 435 // Recursive inflated unlock 436 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 437 jmpb(LSuccess); 438 439 bind(LNotRecursive); 440 441 // Set owner to null. 442 // Release to satisfy the JMM 443 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 444 // We need a full fence after clearing owner to avoid stranding. 445 // StoreLoad achieves this. 446 membar(StoreLoad); 447 448 // Check if the entry_list is empty. 449 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD); 450 jccb(Assembler::zero, LSuccess); // If so we are done. 451 452 // Check if there is a successor. 453 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 454 jccb(Assembler::notZero, LSuccess); // If so we are done. 455 456 // Save the monitor pointer in the current thread, so we can try to 457 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 
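  // Note on the two instructions below: tmpReg still holds the mark word value,
  // which for an inflated lock is ObjectMonitor* | markWord::monitor_value (0b10),
  // so the tag bits are masked off to recover the raw ObjectMonitor* before
  // publishing it in the thread for the slow-path helper.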
458 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 459 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 460 461 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 462 jmpb (DONE_LABEL); 463 464 bind (LSuccess); 465 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 466 jmpb (DONE_LABEL); 467 468 if (LockingMode == LM_LEGACY) { 469 bind (Stacked); 470 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 471 lock(); 472 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 473 // Intentional fall-thru into DONE_LABEL 474 } 475 476 bind(DONE_LABEL); 477 478 // ZFlag == 1 count in fast path 479 // ZFlag == 0 count in slow path 480 jccb(Assembler::notZero, NO_COUNT); 481 482 bind(COUNT); 483 484 if (LockingMode == LM_LEGACY) { 485 // Count monitors in fast path 486 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 487 } 488 489 xorl(tmpReg, tmpReg); // Set ZF == 1 490 491 bind(NO_COUNT); 492 } 493 494 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 495 Register t, Register thread) { 496 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 497 assert(rax_reg == rax, "Used for CAS"); 498 assert_different_registers(obj, box, rax_reg, t, thread); 499 500 // Handle inflated monitor. 501 Label inflated; 502 // Finish fast lock successfully. ZF value is irrelevant. 503 Label locked; 504 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 505 Label slow_path; 506 507 if (UseObjectMonitorTable) { 508 // Clear cache in case fast locking succeeds or we need to take the slow-path. 509 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 510 } 511 512 if (DiagnoseSyncOnValueBasedClasses != 0) { 513 load_klass(rax_reg, obj, t); 514 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 515 jcc(Assembler::notZero, slow_path); 516 } 517 518 const Register mark = t; 519 520 { // Lightweight Lock 521 522 Label push; 523 524 const Register top = UseObjectMonitorTable ? rax_reg : box; 525 526 // Load the mark. 527 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 528 529 // Prefetch top. 530 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 531 532 // Check for monitor (0b10). 533 testptr(mark, markWord::monitor_value); 534 jcc(Assembler::notZero, inflated); 535 536 // Check if lock-stack is full. 537 cmpl(top, LockStack::end_offset() - 1); 538 jcc(Assembler::greater, slow_path); 539 540 // Check if recursive. 541 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 542 jccb(Assembler::equal, push); 543 544 // Try to lock. Transition lock bits 0b01 => 0b00 545 movptr(rax_reg, mark); 546 orptr(rax_reg, markWord::unlocked_value); 547 andptr(mark, ~(int32_t)markWord::unlocked_value); 548 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 549 jcc(Assembler::notEqual, slow_path); 550 551 if (UseObjectMonitorTable) { 552 // Need to reload top, clobbered by CAS. 553 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 554 } 555 bind(push); 556 // After successful lock, push object on lock-stack. 557 movptr(Address(thread, top), obj); 558 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 559 jmpb(locked); 560 } 561 562 { // Handle inflated monitor. 
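    // Note on the lookup that follows: with -XX:+UseObjectMonitorTable the mark
    // word does not hold the ObjectMonitor*, so the code probes the per-thread
    // om_cache instead - two unrolled compares against the leading cache entries,
    // then a loop over the remaining slots until the null sentinel, taking the
    // slow path on a miss.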
563 bind(inflated); 564 565 const Register monitor = t; 566 567 if (!UseObjectMonitorTable) { 568 assert(mark == monitor, "should be the same here"); 569 } else { 570 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 571 // Fetch ObjectMonitor* from the cache or take the slow-path. 572 Label monitor_found; 573 574 // Load cache address 575 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 576 577 const int num_unrolled = 2; 578 for (int i = 0; i < num_unrolled; i++) { 579 cmpptr(obj, Address(t)); 580 jccb(Assembler::equal, monitor_found); 581 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 582 } 583 584 Label loop; 585 586 // Search for obj in cache. 587 bind(loop); 588 589 // Check for match. 590 cmpptr(obj, Address(t)); 591 jccb(Assembler::equal, monitor_found); 592 593 // Search until null encountered, guaranteed _null_sentinel at end. 594 cmpptr(Address(t), 1); 595 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 596 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 597 jmpb(loop); 598 599 // Cache hit. 600 bind(monitor_found); 601 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 602 } 603 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 604 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 605 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 606 607 Label monitor_locked; 608 // Lock the monitor. 609 610 if (UseObjectMonitorTable) { 611 // Cache the monitor for unlock before trashing box. On failure to acquire 612 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 613 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 614 } 615 616 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 617 xorptr(rax_reg, rax_reg); 618 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 619 lock(); cmpxchgptr(box, owner_address); 620 jccb(Assembler::equal, monitor_locked); 621 622 // Check if recursive. 623 cmpptr(box, rax_reg); 624 jccb(Assembler::notEqual, slow_path); 625 626 // Recursive. 627 increment(recursions_address); 628 629 bind(monitor_locked); 630 } 631 632 bind(locked); 633 // Set ZF = 1 634 xorl(rax_reg, rax_reg); 635 636 #ifdef ASSERT 637 // Check that locked label is reached with ZF set. 638 Label zf_correct; 639 Label zf_bad_zero; 640 jcc(Assembler::zero, zf_correct); 641 jmp(zf_bad_zero); 642 #endif 643 644 bind(slow_path); 645 #ifdef ASSERT 646 // Check that slow_path label is reached with ZF not set. 647 jcc(Assembler::notZero, zf_correct); 648 stop("Fast Lock ZF != 0"); 649 bind(zf_bad_zero); 650 stop("Fast Lock ZF != 1"); 651 bind(zf_correct); 652 #endif 653 // C2 uses the value of ZF to determine the continuation. 654 } 655 656 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 657 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 658 assert(reg_rax == rax, "Used for CAS"); 659 assert_different_registers(obj, reg_rax, t); 660 661 // Handle inflated monitor. 662 Label inflated, inflated_check_lock_stack; 663 // Finish fast unlock successfully. MUST jump with ZF == 1 664 Label unlocked, slow_path; 665 666 const Register mark = t; 667 const Register monitor = t; 668 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 669 const Register box = reg_rax; 670 671 Label dummy; 672 C2FastUnlockLightweightStub* stub = nullptr; 673 674 if (!Compile::current()->output()->in_scratch_emit_size()) { 675 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 676 Compile::current()->output()->add_stub(stub); 677 } 678 679 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 680 681 { // Lightweight Unlock 682 683 // Load top. 684 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 685 686 if (!UseObjectMonitorTable) { 687 // Prefetch mark. 688 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 689 } 690 691 // Check if obj is top of lock-stack. 692 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 693 // Top of lock stack was not obj. Must be monitor. 694 jcc(Assembler::notEqual, inflated_check_lock_stack); 695 696 // Pop lock-stack. 697 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 698 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 699 700 // Check if recursive. 701 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 702 jcc(Assembler::equal, unlocked); 703 704 // We elide the monitor check, let the CAS fail instead. 705 706 if (UseObjectMonitorTable) { 707 // Load mark. 708 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 709 } 710 711 // Try to unlock. Transition lock bits 0b00 => 0b01 712 movptr(reg_rax, mark); 713 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 714 orptr(mark, markWord::unlocked_value); 715 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 716 jcc(Assembler::notEqual, push_and_slow_path); 717 jmp(unlocked); 718 } 719 720 721 { // Handle inflated monitor. 722 bind(inflated_check_lock_stack); 723 #ifdef ASSERT 724 Label check_done; 725 subl(top, oopSize); 726 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 727 jcc(Assembler::below, check_done); 728 cmpptr(obj, Address(thread, top)); 729 jccb(Assembler::notEqual, inflated_check_lock_stack); 730 stop("Fast Unlock lock on stack"); 731 bind(check_done); 732 if (UseObjectMonitorTable) { 733 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 734 } 735 testptr(mark, markWord::monitor_value); 736 jccb(Assembler::notZero, inflated); 737 stop("Fast Unlock not monitor"); 738 #endif 739 740 bind(inflated); 741 742 if (!UseObjectMonitorTable) { 743 assert(mark == monitor, "should be the same here"); 744 } else { 745 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 746 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 747 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 748 cmpptr(monitor, alignof(ObjectMonitor*)); 749 jcc(Assembler::below, slow_path); 750 } 751 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 752 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 753 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 754 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 755 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 756 757 Label recursive; 758 759 // Check if recursive. 760 cmpptr(recursions_address, 0); 761 jccb(Assembler::notZero, recursive); 762 763 // Set owner to null. 
764 // Release to satisfy the JMM 765 movptr(owner_address, NULL_WORD); 766 // We need a full fence after clearing owner to avoid stranding. 767 // StoreLoad achieves this. 768 membar(StoreLoad); 769 770 // Check if the entry_list is empty. 771 cmpptr(entry_list_address, NULL_WORD); 772 jccb(Assembler::zero, unlocked); // If so we are done. 773 774 // Check if there is a successor. 775 cmpptr(succ_address, NULL_WORD); 776 jccb(Assembler::notZero, unlocked); // If so we are done. 777 778 // Save the monitor pointer in the current thread, so we can try to 779 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 780 if (!UseObjectMonitorTable) { 781 andptr(monitor, ~(int32_t)markWord::monitor_value); 782 } 783 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 784 785 orl(t, 1); // Fast Unlock ZF = 0 786 jmpb(slow_path); 787 788 // Recursive unlock. 789 bind(recursive); 790 decrement(recursions_address); 791 } 792 793 bind(unlocked); 794 xorl(t, t); // Fast Unlock ZF = 1 795 796 #ifdef ASSERT 797 // Check that unlocked label is reached with ZF set. 798 Label zf_correct; 799 Label zf_bad_zero; 800 jcc(Assembler::zero, zf_correct); 801 jmp(zf_bad_zero); 802 #endif 803 804 bind(slow_path); 805 if (stub != nullptr) { 806 bind(stub->slow_path_continuation()); 807 } 808 #ifdef ASSERT 809 // Check that stub->continuation() label is reached with ZF not set. 810 jcc(Assembler::notZero, zf_correct); 811 stop("Fast Unlock ZF != 0"); 812 bind(zf_bad_zero); 813 stop("Fast Unlock ZF != 1"); 814 bind(zf_correct); 815 #endif 816 // C2 uses the value of ZF to determine the continuation. 817 } 818 819 //------------------------------------------------------------------------------------------- 820 // Generic instructions support for use in .ad files C2 code generation 821 822 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 823 if (dst != src) { 824 movdqu(dst, src); 825 } 826 if (opcode == Op_AbsVD) { 827 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 828 } else { 829 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 830 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 831 } 832 } 833 834 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 835 if (opcode == Op_AbsVD) { 836 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 837 } else { 838 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 839 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 840 } 841 } 842 843 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 844 if (dst != src) { 845 movdqu(dst, src); 846 } 847 if (opcode == Op_AbsVF) { 848 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 849 } else { 850 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 851 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 852 } 853 } 854 855 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 856 if (opcode == Op_AbsVF) { 857 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 858 } else { 859 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 860 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 861 } 862 } 863 864 void 
C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 865 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 866 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 867 868 if (opcode == Op_MinV) { 869 if (elem_bt == T_BYTE) { 870 pminsb(dst, src); 871 } else if (elem_bt == T_SHORT) { 872 pminsw(dst, src); 873 } else if (elem_bt == T_INT) { 874 pminsd(dst, src); 875 } else { 876 assert(elem_bt == T_LONG, "required"); 877 assert(tmp == xmm0, "required"); 878 assert_different_registers(dst, src, tmp); 879 movdqu(xmm0, dst); 880 pcmpgtq(xmm0, src); 881 blendvpd(dst, src); // xmm0 as mask 882 } 883 } else { // opcode == Op_MaxV 884 if (elem_bt == T_BYTE) { 885 pmaxsb(dst, src); 886 } else if (elem_bt == T_SHORT) { 887 pmaxsw(dst, src); 888 } else if (elem_bt == T_INT) { 889 pmaxsd(dst, src); 890 } else { 891 assert(elem_bt == T_LONG, "required"); 892 assert(tmp == xmm0, "required"); 893 assert_different_registers(dst, src, tmp); 894 movdqu(xmm0, src); 895 pcmpgtq(xmm0, dst); 896 blendvpd(dst, src); // xmm0 as mask 897 } 898 } 899 } 900 901 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 902 XMMRegister src1, Address src2, int vlen_enc) { 903 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 904 if (opcode == Op_UMinV) { 905 switch(elem_bt) { 906 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 907 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 908 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 909 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 910 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 911 } 912 } else { 913 assert(opcode == Op_UMaxV, "required"); 914 switch(elem_bt) { 915 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 916 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 917 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 918 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 919 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 920 } 921 } 922 } 923 924 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 925 // For optimality, leverage a full vector width of 512 bits 926 // for operations over smaller vector sizes on AVX512 targets. 927 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 928 if (opcode == Op_UMaxV) { 929 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 930 } else { 931 assert(opcode == Op_UMinV, "required"); 932 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 933 } 934 } else { 935 // T1 = -1 936 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 937 // T1 = -1 << 63 938 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 939 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 940 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 941 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 942 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 943 // Mask = T2 > T1 944 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 945 if (opcode == Op_UMaxV) { 946 // Res = Mask ? Src2 : Src1 947 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 948 } else { 949 // Res = Mask ? 
Src1 : Src2 950 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 951 } 952 } 953 } 954 955 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 956 XMMRegister src1, XMMRegister src2, int vlen_enc) { 957 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 958 if (opcode == Op_UMinV) { 959 switch(elem_bt) { 960 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 961 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 962 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 963 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 964 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 965 } 966 } else { 967 assert(opcode == Op_UMaxV, "required"); 968 switch(elem_bt) { 969 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 970 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 971 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 972 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 973 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 974 } 975 } 976 } 977 978 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 979 XMMRegister dst, XMMRegister src1, XMMRegister src2, 980 int vlen_enc) { 981 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 982 983 if (opcode == Op_MinV) { 984 if (elem_bt == T_BYTE) { 985 vpminsb(dst, src1, src2, vlen_enc); 986 } else if (elem_bt == T_SHORT) { 987 vpminsw(dst, src1, src2, vlen_enc); 988 } else if (elem_bt == T_INT) { 989 vpminsd(dst, src1, src2, vlen_enc); 990 } else { 991 assert(elem_bt == T_LONG, "required"); 992 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 993 vpminsq(dst, src1, src2, vlen_enc); 994 } else { 995 assert_different_registers(dst, src1, src2); 996 vpcmpgtq(dst, src1, src2, vlen_enc); 997 vblendvpd(dst, src1, src2, dst, vlen_enc); 998 } 999 } 1000 } else { // opcode == Op_MaxV 1001 if (elem_bt == T_BYTE) { 1002 vpmaxsb(dst, src1, src2, vlen_enc); 1003 } else if (elem_bt == T_SHORT) { 1004 vpmaxsw(dst, src1, src2, vlen_enc); 1005 } else if (elem_bt == T_INT) { 1006 vpmaxsd(dst, src1, src2, vlen_enc); 1007 } else { 1008 assert(elem_bt == T_LONG, "required"); 1009 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1010 vpmaxsq(dst, src1, src2, vlen_enc); 1011 } else { 1012 assert_different_registers(dst, src1, src2); 1013 vpcmpgtq(dst, src1, src2, vlen_enc); 1014 vblendvpd(dst, src2, src1, dst, vlen_enc); 1015 } 1016 } 1017 } 1018 } 1019 1020 // Float/Double min max 1021 1022 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1023 XMMRegister dst, XMMRegister a, XMMRegister b, 1024 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1025 int vlen_enc) { 1026 assert(UseAVX > 0, "required"); 1027 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1028 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1029 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1030 assert_different_registers(a, tmp, atmp, btmp); 1031 assert_different_registers(b, tmp, atmp, btmp); 1032 1033 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1034 bool is_double_word = is_double_word_type(elem_bt); 1035 1036 /* Note on 'non-obvious' assembly sequence: 1037 * 1038 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1039 * and Java on how they handle floats: 1040 * a. 
-0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1041 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1042 * 1043 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1044 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1045 * (only useful when signs differ, noop otherwise) 1046 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1047 1048 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1049 * btmp = (b < +0.0) ? a : b 1050 * atmp = (b < +0.0) ? b : a 1051 * Tmp = Max_Float(atmp , btmp) 1052 * Res = (atmp == NaN) ? atmp : Tmp 1053 */ 1054 1055 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1056 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1057 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1058 XMMRegister mask; 1059 1060 if (!is_double_word && is_min) { 1061 mask = a; 1062 vblend = &MacroAssembler::vblendvps; 1063 vmaxmin = &MacroAssembler::vminps; 1064 vcmp = &MacroAssembler::vcmpps; 1065 } else if (!is_double_word && !is_min) { 1066 mask = b; 1067 vblend = &MacroAssembler::vblendvps; 1068 vmaxmin = &MacroAssembler::vmaxps; 1069 vcmp = &MacroAssembler::vcmpps; 1070 } else if (is_double_word && is_min) { 1071 mask = a; 1072 vblend = &MacroAssembler::vblendvpd; 1073 vmaxmin = &MacroAssembler::vminpd; 1074 vcmp = &MacroAssembler::vcmppd; 1075 } else { 1076 assert(is_double_word && !is_min, "sanity"); 1077 mask = b; 1078 vblend = &MacroAssembler::vblendvpd; 1079 vmaxmin = &MacroAssembler::vmaxpd; 1080 vcmp = &MacroAssembler::vcmppd; 1081 } 1082 1083 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1084 XMMRegister maxmin, scratch; 1085 if (dst == btmp) { 1086 maxmin = btmp; 1087 scratch = tmp; 1088 } else { 1089 maxmin = tmp; 1090 scratch = btmp; 1091 } 1092 1093 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1094 if (precompute_mask && !is_double_word) { 1095 vpsrad(tmp, mask, 32, vlen_enc); 1096 mask = tmp; 1097 } else if (precompute_mask && is_double_word) { 1098 vpxor(tmp, tmp, tmp, vlen_enc); 1099 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1100 mask = tmp; 1101 } 1102 1103 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1104 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1105 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1106 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1107 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1108 } 1109 1110 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1111 XMMRegister dst, XMMRegister a, XMMRegister b, 1112 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1113 int vlen_enc) { 1114 assert(UseAVX > 2, "required"); 1115 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1116 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1117 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1118 assert_different_registers(dst, a, atmp, btmp); 1119 assert_different_registers(dst, b, atmp, btmp); 1120 1121 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1122 bool is_double_word = is_double_word_type(elem_bt); 1123 bool merge = true; 1124 1125 if (!is_double_word && is_min) { 1126 evpmovd2m(ktmp, a, vlen_enc); 
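    // ktmp now holds the sign bit of each float lane of 'a'; the two blends below
    // use it to bias -0.0/+0.0 toward the operand order that makes vminps return
    // the Java-visible minimum (same pre-blend trick as the note in vminmax_fp above).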
1127 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1128 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1129 vminps(dst, atmp, btmp, vlen_enc); 1130 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1131 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1132 } else if (!is_double_word && !is_min) { 1133 evpmovd2m(ktmp, b, vlen_enc); 1134 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1135 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1136 vmaxps(dst, atmp, btmp, vlen_enc); 1137 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1138 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1139 } else if (is_double_word && is_min) { 1140 evpmovq2m(ktmp, a, vlen_enc); 1141 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1142 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1143 vminpd(dst, atmp, btmp, vlen_enc); 1144 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1145 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1146 } else { 1147 assert(is_double_word && !is_min, "sanity"); 1148 evpmovq2m(ktmp, b, vlen_enc); 1149 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1150 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1151 vmaxpd(dst, atmp, btmp, vlen_enc); 1152 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1153 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1154 } 1155 } 1156 1157 // Float/Double signum 1158 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1159 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1160 1161 Label DONE_LABEL; 1162 1163 if (opcode == Op_SignumF) { 1164 ucomiss(dst, zero); 1165 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1166 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1167 movflt(dst, one); 1168 jcc(Assembler::above, DONE_LABEL); 1169 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1170 } else if (opcode == Op_SignumD) { 1171 ucomisd(dst, zero); 1172 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1173 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1174 movdbl(dst, one); 1175 jcc(Assembler::above, DONE_LABEL); 1176 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1177 } 1178 1179 bind(DONE_LABEL); 1180 } 1181 1182 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1183 if (sign) { 1184 pmovsxbw(dst, src); 1185 } else { 1186 pmovzxbw(dst, src); 1187 } 1188 } 1189 1190 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1191 if (sign) { 1192 vpmovsxbw(dst, src, vector_len); 1193 } else { 1194 vpmovzxbw(dst, src, vector_len); 1195 } 1196 } 1197 1198 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1199 if (sign) { 1200 vpmovsxbd(dst, src, vector_len); 1201 } else { 1202 vpmovzxbd(dst, src, vector_len); 1203 } 1204 } 1205 1206 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1207 if (sign) { 1208 vpmovsxwd(dst, src, vector_len); 1209 } else { 1210 vpmovzxwd(dst, src, vector_len); 1211 } 1212 } 1213 1214 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1215 int shift, int vector_len) { 1216 if (opcode == Op_RotateLeftV) { 1217 if (etype == T_INT) { 1218 evprold(dst, src, shift, 
vector_len); 1219 } else { 1220 assert(etype == T_LONG, "expected type T_LONG"); 1221 evprolq(dst, src, shift, vector_len); 1222 } 1223 } else { 1224 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1225 if (etype == T_INT) { 1226 evprord(dst, src, shift, vector_len); 1227 } else { 1228 assert(etype == T_LONG, "expected type T_LONG"); 1229 evprorq(dst, src, shift, vector_len); 1230 } 1231 } 1232 } 1233 1234 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1235 XMMRegister shift, int vector_len) { 1236 if (opcode == Op_RotateLeftV) { 1237 if (etype == T_INT) { 1238 evprolvd(dst, src, shift, vector_len); 1239 } else { 1240 assert(etype == T_LONG, "expected type T_LONG"); 1241 evprolvq(dst, src, shift, vector_len); 1242 } 1243 } else { 1244 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1245 if (etype == T_INT) { 1246 evprorvd(dst, src, shift, vector_len); 1247 } else { 1248 assert(etype == T_LONG, "expected type T_LONG"); 1249 evprorvq(dst, src, shift, vector_len); 1250 } 1251 } 1252 } 1253 1254 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1255 if (opcode == Op_RShiftVI) { 1256 psrad(dst, shift); 1257 } else if (opcode == Op_LShiftVI) { 1258 pslld(dst, shift); 1259 } else { 1260 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1261 psrld(dst, shift); 1262 } 1263 } 1264 1265 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1266 switch (opcode) { 1267 case Op_RShiftVI: psrad(dst, shift); break; 1268 case Op_LShiftVI: pslld(dst, shift); break; 1269 case Op_URShiftVI: psrld(dst, shift); break; 1270 1271 default: assert(false, "%s", NodeClassNames[opcode]); 1272 } 1273 } 1274 1275 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1276 if (opcode == Op_RShiftVI) { 1277 vpsrad(dst, nds, shift, vector_len); 1278 } else if (opcode == Op_LShiftVI) { 1279 vpslld(dst, nds, shift, vector_len); 1280 } else { 1281 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1282 vpsrld(dst, nds, shift, vector_len); 1283 } 1284 } 1285 1286 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1287 switch (opcode) { 1288 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1289 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1290 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1291 1292 default: assert(false, "%s", NodeClassNames[opcode]); 1293 } 1294 } 1295 1296 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1297 switch (opcode) { 1298 case Op_RShiftVB: // fall-through 1299 case Op_RShiftVS: psraw(dst, shift); break; 1300 1301 case Op_LShiftVB: // fall-through 1302 case Op_LShiftVS: psllw(dst, shift); break; 1303 1304 case Op_URShiftVS: // fall-through 1305 case Op_URShiftVB: psrlw(dst, shift); break; 1306 1307 default: assert(false, "%s", NodeClassNames[opcode]); 1308 } 1309 } 1310 1311 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1312 switch (opcode) { 1313 case Op_RShiftVB: // fall-through 1314 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1315 1316 case Op_LShiftVB: // fall-through 1317 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1318 1319 case Op_URShiftVS: // fall-through 1320 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 
1321 1322 default: assert(false, "%s", NodeClassNames[opcode]); 1323 } 1324 } 1325 1326 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1327 switch (opcode) { 1328 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1329 case Op_LShiftVL: psllq(dst, shift); break; 1330 case Op_URShiftVL: psrlq(dst, shift); break; 1331 1332 default: assert(false, "%s", NodeClassNames[opcode]); 1333 } 1334 } 1335 1336 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1337 if (opcode == Op_RShiftVL) { 1338 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1339 } else if (opcode == Op_LShiftVL) { 1340 psllq(dst, shift); 1341 } else { 1342 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1343 psrlq(dst, shift); 1344 } 1345 } 1346 1347 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1348 switch (opcode) { 1349 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1350 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1351 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1352 1353 default: assert(false, "%s", NodeClassNames[opcode]); 1354 } 1355 } 1356 1357 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1358 if (opcode == Op_RShiftVL) { 1359 evpsraq(dst, nds, shift, vector_len); 1360 } else if (opcode == Op_LShiftVL) { 1361 vpsllq(dst, nds, shift, vector_len); 1362 } else { 1363 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1364 vpsrlq(dst, nds, shift, vector_len); 1365 } 1366 } 1367 1368 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1369 switch (opcode) { 1370 case Op_RShiftVB: // fall-through 1371 case Op_RShiftVS: // fall-through 1372 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1373 1374 case Op_LShiftVB: // fall-through 1375 case Op_LShiftVS: // fall-through 1376 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1377 1378 case Op_URShiftVB: // fall-through 1379 case Op_URShiftVS: // fall-through 1380 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1381 1382 default: assert(false, "%s", NodeClassNames[opcode]); 1383 } 1384 } 1385 1386 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1387 switch (opcode) { 1388 case Op_RShiftVB: // fall-through 1389 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1390 1391 case Op_LShiftVB: // fall-through 1392 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1393 1394 case Op_URShiftVB: // fall-through 1395 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1396 1397 default: assert(false, "%s", NodeClassNames[opcode]); 1398 } 1399 } 1400 1401 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1402 assert(UseAVX >= 2, "required"); 1403 switch (opcode) { 1404 case Op_RShiftVL: { 1405 if (UseAVX > 2) { 1406 assert(tmp == xnoreg, "not used"); 1407 if (!VM_Version::supports_avx512vl()) { 1408 vlen_enc = Assembler::AVX_512bit; 1409 } 1410 evpsravq(dst, src, shift, vlen_enc); 1411 } else { 1412 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1413 vpsrlvq(dst, src, shift, vlen_enc); 1414 vpsrlvq(tmp, tmp, shift, vlen_enc); 1415 vpxor(dst, dst, tmp, 
vlen_enc); 1416 vpsubq(dst, dst, tmp, vlen_enc); 1417 } 1418 break; 1419 } 1420 case Op_LShiftVL: { 1421 assert(tmp == xnoreg, "not used"); 1422 vpsllvq(dst, src, shift, vlen_enc); 1423 break; 1424 } 1425 case Op_URShiftVL: { 1426 assert(tmp == xnoreg, "not used"); 1427 vpsrlvq(dst, src, shift, vlen_enc); 1428 break; 1429 } 1430 default: assert(false, "%s", NodeClassNames[opcode]); 1431 } 1432 } 1433 1434 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1435 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1436 assert(opcode == Op_LShiftVB || 1437 opcode == Op_RShiftVB || 1438 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1439 bool sign = (opcode != Op_URShiftVB); 1440 assert(vector_len == 0, "required"); 1441 vextendbd(sign, dst, src, 1); 1442 vpmovzxbd(vtmp, shift, 1); 1443 varshiftd(opcode, dst, dst, vtmp, 1); 1444 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1445 vextracti128_high(vtmp, dst); 1446 vpackusdw(dst, dst, vtmp, 0); 1447 } 1448 1449 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1450 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1451 assert(opcode == Op_LShiftVB || 1452 opcode == Op_RShiftVB || 1453 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1454 bool sign = (opcode != Op_URShiftVB); 1455 int ext_vector_len = vector_len + 1; 1456 vextendbw(sign, dst, src, ext_vector_len); 1457 vpmovzxbw(vtmp, shift, ext_vector_len); 1458 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1459 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1460 if (vector_len == 0) { 1461 vextracti128_high(vtmp, dst); 1462 vpackuswb(dst, dst, vtmp, vector_len); 1463 } else { 1464 vextracti64x4_high(vtmp, dst); 1465 vpackuswb(dst, dst, vtmp, vector_len); 1466 vpermq(dst, dst, 0xD8, vector_len); 1467 } 1468 } 1469 1470 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1471 switch(typ) { 1472 case T_BYTE: 1473 pinsrb(dst, val, idx); 1474 break; 1475 case T_SHORT: 1476 pinsrw(dst, val, idx); 1477 break; 1478 case T_INT: 1479 pinsrd(dst, val, idx); 1480 break; 1481 case T_LONG: 1482 pinsrq(dst, val, idx); 1483 break; 1484 default: 1485 assert(false,"Should not reach here."); 1486 break; 1487 } 1488 } 1489 1490 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1491 switch(typ) { 1492 case T_BYTE: 1493 vpinsrb(dst, src, val, idx); 1494 break; 1495 case T_SHORT: 1496 vpinsrw(dst, src, val, idx); 1497 break; 1498 case T_INT: 1499 vpinsrd(dst, src, val, idx); 1500 break; 1501 case T_LONG: 1502 vpinsrq(dst, src, val, idx); 1503 break; 1504 default: 1505 assert(false,"Should not reach here."); 1506 break; 1507 } 1508 } 1509 1510 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1511 XMMRegister dst, Register base, 1512 Register idx_base, 1513 Register offset, Register mask, 1514 Register mask_idx, Register rtmp, 1515 int vlen_enc) { 1516 vpxor(dst, dst, dst, vlen_enc); 1517 if (elem_bt == T_SHORT) { 1518 for (int i = 0; i < 4; i++) { 1519 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1520 Label skip_load; 1521 btq(mask, mask_idx); 1522 jccb(Assembler::carryClear, skip_load); 1523 movl(rtmp, Address(idx_base, i * 4)); 1524 if (offset != noreg) { 1525 addl(rtmp, offset); 1526 } 1527 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1528 bind(skip_load); 1529 incq(mask_idx); 1530 } 1531 } else { 1532 assert(elem_bt == T_BYTE, ""); 1533 for (int i = 0; i < 8; i++) { 1534 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1535 Label skip_load; 1536 btq(mask, mask_idx); 1537 jccb(Assembler::carryClear, skip_load); 1538 movl(rtmp, Address(idx_base, i * 4)); 1539 if (offset != noreg) { 1540 addl(rtmp, offset); 1541 } 1542 pinsrb(dst, Address(base, rtmp), i); 1543 bind(skip_load); 1544 incq(mask_idx); 1545 } 1546 } 1547 } 1548 1549 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1550 Register base, Register idx_base, 1551 Register offset, Register rtmp, 1552 int vlen_enc) { 1553 vpxor(dst, dst, dst, vlen_enc); 1554 if (elem_bt == T_SHORT) { 1555 for (int i = 0; i < 4; i++) { 1556 // dst[i] = src[offset + idx_base[i]] 1557 movl(rtmp, Address(idx_base, i * 4)); 1558 if (offset != noreg) { 1559 addl(rtmp, offset); 1560 } 1561 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1562 } 1563 } else { 1564 assert(elem_bt == T_BYTE, ""); 1565 for (int i = 0; i < 8; i++) { 1566 // dst[i] = src[offset + idx_base[i]] 1567 movl(rtmp, Address(idx_base, i * 4)); 1568 if (offset != noreg) { 1569 addl(rtmp, offset); 1570 } 1571 pinsrb(dst, Address(base, rtmp), i); 1572 } 1573 } 1574 } 1575 1576 /* 1577 * Gather using hybrid algorithm, first partially unroll scalar loop 1578 * to accumulate values from gather indices into a quad-word(64bit) slice. 1579 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1580 * permutation to place the slice into appropriate vector lane 1581 * locations in destination vector. Following pseudo code describes the 1582 * algorithm in detail: 1583 * 1584 * DST_VEC = ZERO_VEC 1585 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1586 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1587 * FOREACH_ITER: 1588 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1589 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1590 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1591 * PERM_INDEX = PERM_INDEX - TWO_VEC 1592 * 1593 * With each iteration, doubleword permute indices (0,1) corresponding 1594 * to gathered quadword gets right shifted by two lane positions. 
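 *
 * Illustrative example (a sketch only, assuming a 256-bit destination and
 * byte elements, i.e. 32 bytes gathered as four 8-byte slices): the permute
 * uses only the low bits of each index, so the negative entries select
 * zeroed lanes of the slice and contribute nothing to the OR.
 *   iter 0: PERM_INDEX = { 0, 1, 2, 3, 4, 5, 6, 7}  -> slice lands in dwords 0-1
 *   iter 1: PERM_INDEX = {-2,-1, 0, 1, 2, 3, 4, 5}  -> slice lands in dwords 2-3
 *   iter 2: PERM_INDEX = {-4,-3,-2,-1, 0, 1, 2, 3}  -> slice lands in dwords 4-5
 *   iter 3: PERM_INDEX = {-6,-5,-4,-3,-2,-1, 0, 1}  -> slice lands in dwords 6-7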
1595 * 1596 */ 1597 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1598 Register base, Register idx_base, 1599 Register offset, Register mask, 1600 XMMRegister xtmp1, XMMRegister xtmp2, 1601 XMMRegister temp_dst, Register rtmp, 1602 Register mask_idx, Register length, 1603 int vector_len, int vlen_enc) { 1604 Label GATHER8_LOOP; 1605 assert(is_subword_type(elem_ty), ""); 1606 movl(length, vector_len); 1607 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1608 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1609 vallones(xtmp2, vlen_enc); 1610 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1611 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1612 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1613 1614 bind(GATHER8_LOOP); 1615 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1616 if (mask == noreg) { 1617 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1618 } else { 1619 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1620 } 1621 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1622 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1623 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1624 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1625 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1626 vpor(dst, dst, temp_dst, vlen_enc); 1627 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1628 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1629 jcc(Assembler::notEqual, GATHER8_LOOP); 1630 } 1631 1632 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1633 switch(typ) { 1634 case T_INT: 1635 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1636 break; 1637 case T_FLOAT: 1638 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1639 break; 1640 case T_LONG: 1641 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1642 break; 1643 case T_DOUBLE: 1644 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1645 break; 1646 default: 1647 assert(false,"Should not reach here."); 1648 break; 1649 } 1650 } 1651 1652 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1653 switch(typ) { 1654 case T_INT: 1655 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1656 break; 1657 case T_FLOAT: 1658 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1659 break; 1660 case T_LONG: 1661 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1662 break; 1663 case T_DOUBLE: 1664 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1665 break; 1666 default: 1667 assert(false,"Should not reach here."); 1668 break; 1669 } 1670 } 1671 1672 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1673 switch(typ) { 1674 case T_INT: 1675 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1676 break; 1677 case T_FLOAT: 1678 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1679 break; 1680 case T_LONG: 1681 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1682 break; 1683 case T_DOUBLE: 1684 
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1685 break; 1686 default: 1687 assert(false,"Should not reach here."); 1688 break; 1689 } 1690 } 1691 1692 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1693 if (vlen_in_bytes <= 16) { 1694 pxor (dst, dst); 1695 psubb(dst, src); 1696 switch (elem_bt) { 1697 case T_BYTE: /* nothing to do */ break; 1698 case T_SHORT: pmovsxbw(dst, dst); break; 1699 case T_INT: pmovsxbd(dst, dst); break; 1700 case T_FLOAT: pmovsxbd(dst, dst); break; 1701 case T_LONG: pmovsxbq(dst, dst); break; 1702 case T_DOUBLE: pmovsxbq(dst, dst); break; 1703 1704 default: assert(false, "%s", type2name(elem_bt)); 1705 } 1706 } else { 1707 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1708 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1709 1710 vpxor (dst, dst, dst, vlen_enc); 1711 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1712 1713 switch (elem_bt) { 1714 case T_BYTE: /* nothing to do */ break; 1715 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1716 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1717 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1718 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1719 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1720 1721 default: assert(false, "%s", type2name(elem_bt)); 1722 } 1723 } 1724 } 1725 1726 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1727 if (novlbwdq) { 1728 vpmovsxbd(xtmp, src, vlen_enc); 1729 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1730 Assembler::eq, true, vlen_enc, noreg); 1731 } else { 1732 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1733 vpsubb(xtmp, xtmp, src, vlen_enc); 1734 evpmovb2m(dst, xtmp, vlen_enc); 1735 } 1736 } 1737 1738 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1739 if (is_integral_type(bt)) { 1740 switch (vlen_in_bytes) { 1741 case 4: movdl(dst, src); break; 1742 case 8: movq(dst, src); break; 1743 case 16: movdqu(dst, src); break; 1744 case 32: vmovdqu(dst, src); break; 1745 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1746 default: ShouldNotReachHere(); 1747 } 1748 } else { 1749 switch (vlen_in_bytes) { 1750 case 4: movflt(dst, src); break; 1751 case 8: movdbl(dst, src); break; 1752 case 16: movups(dst, src); break; 1753 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1754 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1755 default: ShouldNotReachHere(); 1756 } 1757 } 1758 } 1759 1760 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1761 assert(rscratch != noreg || always_reachable(src), "missing"); 1762 1763 if (reachable(src)) { 1764 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1765 } else { 1766 lea(rscratch, src); 1767 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1768 } 1769 } 1770 1771 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1772 int vlen_enc = vector_length_encoding(vlen); 1773 if (VM_Version::supports_avx()) { 1774 if (bt == T_LONG) { 1775 if (VM_Version::supports_avx2()) { 1776 vpbroadcastq(dst, src, vlen_enc); 1777 } else { 1778 vmovddup(dst, src, vlen_enc); 1779 } 1780 } else if (bt == T_DOUBLE) { 1781 if (vlen_enc != 
Assembler::AVX_128bit) { 1782 vbroadcastsd(dst, src, vlen_enc, noreg); 1783 } else { 1784 vmovddup(dst, src, vlen_enc); 1785 } 1786 } else { 1787 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1788 vpbroadcastd(dst, src, vlen_enc); 1789 } else { 1790 vbroadcastss(dst, src, vlen_enc); 1791 } 1792 } 1793 } else if (VM_Version::supports_sse3()) { 1794 movddup(dst, src); 1795 } else { 1796 load_vector(bt, dst, src, vlen); 1797 } 1798 } 1799 1800 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1801 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1802 int offset = exact_log2(type2aelembytes(bt)) << 6; 1803 if (is_floating_point_type(bt)) { 1804 offset += 128; 1805 } 1806 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1807 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1808 } 1809 1810 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1811 1812 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1813 int vector_len = Assembler::AVX_128bit; 1814 1815 switch (opcode) { 1816 case Op_AndReductionV: pand(dst, src); break; 1817 case Op_OrReductionV: por (dst, src); break; 1818 case Op_XorReductionV: pxor(dst, src); break; 1819 case Op_MinReductionV: 1820 switch (typ) { 1821 case T_BYTE: pminsb(dst, src); break; 1822 case T_SHORT: pminsw(dst, src); break; 1823 case T_INT: pminsd(dst, src); break; 1824 case T_LONG: assert(UseAVX > 2, "required"); 1825 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1826 default: assert(false, "wrong type"); 1827 } 1828 break; 1829 case Op_MaxReductionV: 1830 switch (typ) { 1831 case T_BYTE: pmaxsb(dst, src); break; 1832 case T_SHORT: pmaxsw(dst, src); break; 1833 case T_INT: pmaxsd(dst, src); break; 1834 case T_LONG: assert(UseAVX > 2, "required"); 1835 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1836 default: assert(false, "wrong type"); 1837 } 1838 break; 1839 case Op_AddReductionVF: addss(dst, src); break; 1840 case Op_AddReductionVD: addsd(dst, src); break; 1841 case Op_AddReductionVI: 1842 switch (typ) { 1843 case T_BYTE: paddb(dst, src); break; 1844 case T_SHORT: paddw(dst, src); break; 1845 case T_INT: paddd(dst, src); break; 1846 default: assert(false, "wrong type"); 1847 } 1848 break; 1849 case Op_AddReductionVL: paddq(dst, src); break; 1850 case Op_MulReductionVF: mulss(dst, src); break; 1851 case Op_MulReductionVD: mulsd(dst, src); break; 1852 case Op_MulReductionVI: 1853 switch (typ) { 1854 case T_SHORT: pmullw(dst, src); break; 1855 case T_INT: pmulld(dst, src); break; 1856 default: assert(false, "wrong type"); 1857 } 1858 break; 1859 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1860 evpmullq(dst, dst, src, vector_len); break; 1861 default: assert(false, "wrong opcode"); 1862 } 1863 } 1864 1865 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1866 switch (opcode) { 1867 case Op_AddReductionVF: addps(dst, src); break; 1868 case Op_AddReductionVD: addpd(dst, src); break; 1869 case Op_MulReductionVF: mulps(dst, src); break; 1870 case Op_MulReductionVD: mulpd(dst, src); break; 1871 default: assert(false, "%s", NodeClassNames[opcode]); 1872 } 1873 } 1874 1875 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1876 int vector_len = Assembler::AVX_256bit; 1877 1878 switch (opcode) { 1879 
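// Lane-wise combine of two 256-bit inputs into dst; folding the result across
// lanes down to a scalar is driven by the reduce*B/S/I/L helpers further below.
// Note that the 64-bit min/max and multiply cases emit AVX-512 instructions
// (vpminsq/vpmaxsq/evpmullq).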
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1880 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1881 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1882 case Op_MinReductionV: 1883 switch (typ) { 1884 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1885 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1886 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1887 case T_LONG: assert(UseAVX > 2, "required"); 1888 vpminsq(dst, src1, src2, vector_len); break; 1889 default: assert(false, "wrong type"); 1890 } 1891 break; 1892 case Op_MaxReductionV: 1893 switch (typ) { 1894 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1895 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1896 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1897 case T_LONG: assert(UseAVX > 2, "required"); 1898 vpmaxsq(dst, src1, src2, vector_len); break; 1899 default: assert(false, "wrong type"); 1900 } 1901 break; 1902 case Op_AddReductionVI: 1903 switch (typ) { 1904 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1905 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1906 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1907 default: assert(false, "wrong type"); 1908 } 1909 break; 1910 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1911 case Op_MulReductionVI: 1912 switch (typ) { 1913 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1914 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1915 default: assert(false, "wrong type"); 1916 } 1917 break; 1918 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1919 default: assert(false, "wrong opcode"); 1920 } 1921 } 1922 1923 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1924 int vector_len = Assembler::AVX_256bit; 1925 1926 switch (opcode) { 1927 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1928 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1929 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1930 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1931 default: assert(false, "%s", NodeClassNames[opcode]); 1932 } 1933 } 1934 1935 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1936 XMMRegister dst, XMMRegister src, 1937 XMMRegister vtmp1, XMMRegister vtmp2) { 1938 switch (opcode) { 1939 case Op_AddReductionVF: 1940 case Op_MulReductionVF: 1941 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1942 break; 1943 1944 case Op_AddReductionVD: 1945 case Op_MulReductionVD: 1946 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1947 break; 1948 1949 default: assert(false, "wrong opcode"); 1950 } 1951 } 1952 1953 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1954 XMMRegister dst, XMMRegister src, 1955 XMMRegister vtmp1, XMMRegister vtmp2) { 1956 switch (opcode) { 1957 case Op_AddReductionVF: 1958 case Op_MulReductionVF: 1959 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1960 break; 1961 1962 case Op_AddReductionVD: 1963 case Op_MulReductionVD: 1964 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1965 break; 1966 1967 default: assert(false, "%s", NodeClassNames[opcode]); 1968 } 1969 } 1970 1971 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1972 Register dst, Register src1, XMMRegister src2, 1973 XMMRegister vtmp1, XMMRegister vtmp2) { 1974 switch (vlen) { 1975 case 8: reduce8B 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1976 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1977 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1978 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1979 1980 default: assert(false, "wrong vector length"); 1981 } 1982 } 1983 1984 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1985 Register dst, Register src1, XMMRegister src2, 1986 XMMRegister vtmp1, XMMRegister vtmp2) { 1987 switch (vlen) { 1988 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1989 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1990 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1991 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1992 1993 default: assert(false, "wrong vector length"); 1994 } 1995 } 1996 1997 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1998 Register dst, Register src1, XMMRegister src2, 1999 XMMRegister vtmp1, XMMRegister vtmp2) { 2000 switch (vlen) { 2001 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2002 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2003 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2004 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2005 2006 default: assert(false, "wrong vector length"); 2007 } 2008 } 2009 2010 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2011 Register dst, Register src1, XMMRegister src2, 2012 XMMRegister vtmp1, XMMRegister vtmp2) { 2013 switch (vlen) { 2014 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2015 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2016 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2017 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2018 2019 default: assert(false, "wrong vector length"); 2020 } 2021 } 2022 2023 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2024 Register dst, Register src1, XMMRegister src2, 2025 XMMRegister vtmp1, XMMRegister vtmp2) { 2026 switch (vlen) { 2027 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2028 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2029 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2030 2031 default: assert(false, "wrong vector length"); 2032 } 2033 } 2034 2035 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2036 switch (vlen) { 2037 case 2: 2038 assert(vtmp2 == xnoreg, ""); 2039 reduce2F(opcode, dst, src, vtmp1); 2040 break; 2041 case 4: 2042 assert(vtmp2 == xnoreg, ""); 2043 reduce4F(opcode, dst, src, vtmp1); 2044 break; 2045 case 8: 2046 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2047 break; 2048 case 16: 2049 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2050 break; 2051 default: assert(false, "wrong vector length"); 2052 } 2053 } 2054 2055 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2056 switch (vlen) { 2057 case 2: 2058 assert(vtmp2 == xnoreg, ""); 2059 reduce2D(opcode, dst, src, vtmp1); 2060 break; 2061 case 4: 2062 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2063 break; 2064 case 8: 2065 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2066 break; 2067 default: assert(false, "wrong vector length"); 2068 } 2069 } 2070 2071 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2072 switch (vlen) { 2073 case 2: 2074 assert(vtmp1 == xnoreg, ""); 2075 assert(vtmp2 == xnoreg, ""); 2076 unorderedReduce2F(opcode, dst, src); 2077 break; 2078 case 4: 2079 assert(vtmp2 == xnoreg, ""); 2080 unorderedReduce4F(opcode, dst, src, vtmp1); 2081 break; 2082 case 8: 2083 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2084 break; 2085 case 16: 2086 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2087 break; 2088 default: assert(false, "wrong vector length"); 2089 } 2090 } 2091 2092 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2093 switch (vlen) { 2094 case 2: 2095 assert(vtmp1 == xnoreg, ""); 2096 assert(vtmp2 == xnoreg, ""); 2097 unorderedReduce2D(opcode, dst, src); 2098 break; 2099 case 4: 2100 assert(vtmp2 == xnoreg, ""); 2101 unorderedReduce4D(opcode, dst, src, vtmp1); 2102 break; 2103 case 8: 2104 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2105 break; 2106 default: assert(false, "wrong vector length"); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 if (opcode == Op_AddReductionVI) { 2112 if (vtmp1 != src2) { 2113 movdqu(vtmp1, src2); 2114 } 2115 phaddd(vtmp1, vtmp1); 2116 } else { 2117 pshufd(vtmp1, src2, 0x1); 2118 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2119 } 2120 movdl(vtmp2, src1); 2121 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2122 movdl(dst, vtmp1); 2123 } 2124 2125 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2126 if (opcode == Op_AddReductionVI) { 2127 if (vtmp1 != src2) { 2128 movdqu(vtmp1, src2); 2129 } 2130 phaddd(vtmp1, src2); 2131 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2132 } else { 2133 pshufd(vtmp2, src2, 0xE); 2134 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2135 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2136 } 2137 } 2138 2139 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 if (opcode == Op_AddReductionVI) { 2141 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2142 vextracti128_high(vtmp2, vtmp1); 2143 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2144 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2145 } else { 2146 vextracti128_high(vtmp1, src2); 2147 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2148 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2149 } 2150 } 2151 2152 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2153 vextracti64x4_high(vtmp2, src2); 2154 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2155 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2156 } 2157 2158 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2159 pshufd(vtmp2, src2, 0x1); 2160 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2161 movdqu(vtmp1, vtmp2); 2162 psrldq(vtmp1, 2); 2163 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2164 movdqu(vtmp2, vtmp1); 2165 psrldq(vtmp2, 1); 2166 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2167 movdl(vtmp2, src1); 2168 pmovsxbd(vtmp1, vtmp1); 2169 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2170 pextrb(dst, vtmp1, 0x0); 2171 movsbl(dst, dst); 2172 
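// vtmp1 now holds, in its low byte, the fold of the eight low bytes of src2
// (each step above halves the number of live bytes: 8 -> 4 -> 2 -> 1); the
// byte result is widened, combined with the scalar accumulator src1 as an int,
// and pextrb/movsbl leave the sign-extended value in dst.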
} 2173 2174 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2175 pshufd(vtmp1, src2, 0xE); 2176 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2177 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2178 } 2179 2180 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2181 vextracti128_high(vtmp2, src2); 2182 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2183 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2184 } 2185 2186 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2187 vextracti64x4_high(vtmp1, src2); 2188 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2189 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2190 } 2191 2192 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2193 pmovsxbw(vtmp2, src2); 2194 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2195 } 2196 2197 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2198 if (UseAVX > 1) { 2199 int vector_len = Assembler::AVX_256bit; 2200 vpmovsxbw(vtmp1, src2, vector_len); 2201 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2202 } else { 2203 pmovsxbw(vtmp2, src2); 2204 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2205 pshufd(vtmp2, src2, 0x1); 2206 pmovsxbw(vtmp2, src2); 2207 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2208 } 2209 } 2210 2211 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2212 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2213 int vector_len = Assembler::AVX_512bit; 2214 vpmovsxbw(vtmp1, src2, vector_len); 2215 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2216 } else { 2217 assert(UseAVX >= 2,"Should not reach here."); 2218 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2219 vextracti128_high(vtmp2, src2); 2220 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2221 } 2222 } 2223 2224 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2225 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2226 vextracti64x4_high(vtmp2, src2); 2227 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2228 } 2229 2230 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2231 if (opcode == Op_AddReductionVI) { 2232 if (vtmp1 != src2) { 2233 movdqu(vtmp1, src2); 2234 } 2235 phaddw(vtmp1, vtmp1); 2236 phaddw(vtmp1, vtmp1); 2237 } else { 2238 pshufd(vtmp2, src2, 0x1); 2239 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2240 movdqu(vtmp1, vtmp2); 2241 psrldq(vtmp1, 2); 2242 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2243 } 2244 movdl(vtmp2, src1); 2245 pmovsxwd(vtmp1, vtmp1); 2246 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2247 pextrw(dst, vtmp1, 0x0); 2248 movswl(dst, dst); 2249 } 2250 2251 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 if (opcode == Op_AddReductionVI) { 2253 if (vtmp1 != src2) { 2254 movdqu(vtmp1, src2); 2255 } 2256 phaddw(vtmp1, src2); 2257 } else { 2258 pshufd(vtmp1, src2, 0xE); 2259 
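// 0xE swaps the 64-bit halves: the upper four shorts of src2 are brought into
// the low half of vtmp1 so that the lane-wise op below folds them onto the
// lower four.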
reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2260 } 2261 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2262 } 2263 2264 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2265 if (opcode == Op_AddReductionVI) { 2266 int vector_len = Assembler::AVX_256bit; 2267 vphaddw(vtmp2, src2, src2, vector_len); 2268 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2269 } else { 2270 vextracti128_high(vtmp2, src2); 2271 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2272 } 2273 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2274 } 2275 2276 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2277 int vector_len = Assembler::AVX_256bit; 2278 vextracti64x4_high(vtmp1, src2); 2279 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2280 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2281 } 2282 2283 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2284 pshufd(vtmp2, src2, 0xE); 2285 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2286 movdq(vtmp1, src1); 2287 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2288 movdq(dst, vtmp1); 2289 } 2290 2291 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2292 vextracti128_high(vtmp1, src2); 2293 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2294 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2295 } 2296 2297 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2298 vextracti64x4_high(vtmp2, src2); 2299 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2300 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2301 } 2302 2303 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2304 mov64(temp, -1L); 2305 bzhiq(temp, temp, len); 2306 kmovql(dst, temp); 2307 } 2308 2309 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2310 reduce_operation_128(T_FLOAT, opcode, dst, src); 2311 pshufd(vtmp, src, 0x1); 2312 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2313 } 2314 2315 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2316 reduce2F(opcode, dst, src, vtmp); 2317 pshufd(vtmp, src, 0x2); 2318 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2319 pshufd(vtmp, src, 0x3); 2320 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2321 } 2322 2323 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2324 reduce4F(opcode, dst, src, vtmp2); 2325 vextractf128_high(vtmp2, src); 2326 reduce4F(opcode, dst, vtmp2, vtmp1); 2327 } 2328 2329 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2330 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2331 vextracti64x4_high(vtmp1, src); 2332 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2333 } 2334 2335 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2336 pshufd(dst, src, 0x1); 2337 reduce_operation_128(T_FLOAT, opcode, dst, src); 2338 } 2339 2340 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2341 pshufd(vtmp, src, 0xE); 2342 
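// Tree-order ("unordered") reduction: 0xE brings floats {2,3} of src down so
// the packed op below combines them with floats {0,1}; unorderedReduce2F then
// folds the remaining pair. The association order is a balanced tree rather
// than strictly left-to-right, which is presumably why these helpers are kept
// separate from the ordered reduce*F variants.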
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2343 unorderedReduce2F(opcode, dst, vtmp); 2344 } 2345 2346 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2347 vextractf128_high(vtmp1, src); 2348 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2349 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2350 } 2351 2352 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2353 vextractf64x4_high(vtmp2, src); 2354 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2355 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2356 } 2357 2358 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2359 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2360 pshufd(vtmp, src, 0xE); 2361 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2362 } 2363 2364 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2365 reduce2D(opcode, dst, src, vtmp2); 2366 vextractf128_high(vtmp2, src); 2367 reduce2D(opcode, dst, vtmp2, vtmp1); 2368 } 2369 2370 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2371 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2372 vextracti64x4_high(vtmp1, src); 2373 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2374 } 2375 2376 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2377 pshufd(dst, src, 0xE); 2378 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2379 } 2380 2381 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2382 vextractf128_high(vtmp, src); 2383 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2384 unorderedReduce2D(opcode, dst, vtmp); 2385 } 2386 2387 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2388 vextractf64x4_high(vtmp2, src); 2389 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2390 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2391 } 2392 2393 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2394 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2395 } 2396 2397 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2398 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2399 } 2400 2401 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2402 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2403 } 2404 2405 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2406 int vec_enc) { 2407 switch(elem_bt) { 2408 case T_INT: 2409 case T_FLOAT: 2410 vmaskmovps(dst, src, mask, vec_enc); 2411 break; 2412 case T_LONG: 2413 case T_DOUBLE: 2414 vmaskmovpd(dst, src, mask, vec_enc); 2415 break; 2416 default: 2417 fatal("Unsupported type %s", type2name(elem_bt)); 2418 break; 2419 } 2420 } 2421 2422 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2423 int vec_enc) { 2424 switch(elem_bt) { 2425 case T_INT: 2426 case T_FLOAT: 2427 
vmaskmovps(dst, src, mask, vec_enc); 2428 break; 2429 case T_LONG: 2430 case T_DOUBLE: 2431 vmaskmovpd(dst, src, mask, vec_enc); 2432 break; 2433 default: 2434 fatal("Unsupported type %s", type2name(elem_bt)); 2435 break; 2436 } 2437 } 2438 2439 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2440 XMMRegister dst, XMMRegister src, 2441 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2442 XMMRegister xmm_0, XMMRegister xmm_1) { 2443 const int permconst[] = {1, 14}; 2444 XMMRegister wsrc = src; 2445 XMMRegister wdst = xmm_0; 2446 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2447 2448 int vlen_enc = Assembler::AVX_128bit; 2449 if (vlen == 16) { 2450 vlen_enc = Assembler::AVX_256bit; 2451 } 2452 2453 for (int i = log2(vlen) - 1; i >=0; i--) { 2454 if (i == 0 && !is_dst_valid) { 2455 wdst = dst; 2456 } 2457 if (i == 3) { 2458 vextracti64x4_high(wtmp, wsrc); 2459 } else if (i == 2) { 2460 vextracti128_high(wtmp, wsrc); 2461 } else { // i = [0,1] 2462 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2463 } 2464 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2465 wsrc = wdst; 2466 vlen_enc = Assembler::AVX_128bit; 2467 } 2468 if (is_dst_valid) { 2469 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2470 } 2471 } 2472 2473 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2474 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2475 XMMRegister xmm_0, XMMRegister xmm_1) { 2476 XMMRegister wsrc = src; 2477 XMMRegister wdst = xmm_0; 2478 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2479 int vlen_enc = Assembler::AVX_128bit; 2480 if (vlen == 8) { 2481 vlen_enc = Assembler::AVX_256bit; 2482 } 2483 for (int i = log2(vlen) - 1; i >=0; i--) { 2484 if (i == 0 && !is_dst_valid) { 2485 wdst = dst; 2486 } 2487 if (i == 1) { 2488 vextracti128_high(wtmp, wsrc); 2489 } else if (i == 2) { 2490 vextracti64x4_high(wtmp, wsrc); 2491 } else { 2492 assert(i == 0, "%d", i); 2493 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2494 } 2495 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2496 wsrc = wdst; 2497 vlen_enc = Assembler::AVX_128bit; 2498 } 2499 if (is_dst_valid) { 2500 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2501 } 2502 } 2503 2504 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2505 switch (bt) { 2506 case T_BYTE: pextrb(dst, src, idx); break; 2507 case T_SHORT: pextrw(dst, src, idx); break; 2508 case T_INT: pextrd(dst, src, idx); break; 2509 case T_LONG: pextrq(dst, src, idx); break; 2510 2511 default: 2512 assert(false,"Should not reach here."); 2513 break; 2514 } 2515 } 2516 2517 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2518 int esize = type2aelembytes(typ); 2519 int elem_per_lane = 16/esize; 2520 int lane = elemindex / elem_per_lane; 2521 int eindex = elemindex % elem_per_lane; 2522 2523 if (lane >= 2) { 2524 assert(UseAVX > 2, "required"); 2525 vextractf32x4(dst, src, lane & 3); 2526 return dst; 2527 } else if (lane > 0) { 2528 assert(UseAVX > 0, "required"); 2529 vextractf128(dst, src, lane); 2530 return dst; 2531 } else { 2532 return src; 2533 } 2534 } 2535 2536 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2537 if (typ == T_BYTE) { 2538 movsbl(dst, dst); 2539 } else if (typ == T_SHORT) { 2540 movswl(dst, dst); 2541 } 2542 } 2543 2544 void 
C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2545 int esize = type2aelembytes(typ); 2546 int elem_per_lane = 16/esize; 2547 int eindex = elemindex % elem_per_lane; 2548 assert(is_integral_type(typ),"required"); 2549 2550 if (eindex == 0) { 2551 if (typ == T_LONG) { 2552 movq(dst, src); 2553 } else { 2554 movdl(dst, src); 2555 movsxl(typ, dst); 2556 } 2557 } else { 2558 extract(typ, dst, src, eindex); 2559 movsxl(typ, dst); 2560 } 2561 } 2562 2563 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2564 int esize = type2aelembytes(typ); 2565 int elem_per_lane = 16/esize; 2566 int eindex = elemindex % elem_per_lane; 2567 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2568 2569 if (eindex == 0) { 2570 movq(dst, src); 2571 } else { 2572 if (typ == T_FLOAT) { 2573 if (UseAVX == 0) { 2574 movdqu(dst, src); 2575 shufps(dst, dst, eindex); 2576 } else { 2577 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2578 } 2579 } else { 2580 if (UseAVX == 0) { 2581 movdqu(dst, src); 2582 psrldq(dst, eindex*esize); 2583 } else { 2584 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2585 } 2586 movq(dst, dst); 2587 } 2588 } 2589 // Zero upper bits 2590 if (typ == T_FLOAT) { 2591 if (UseAVX == 0) { 2592 assert(vtmp != xnoreg, "required."); 2593 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2594 pand(dst, vtmp); 2595 } else { 2596 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2597 } 2598 } 2599 } 2600 2601 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2602 switch(typ) { 2603 case T_BYTE: 2604 case T_BOOLEAN: 2605 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2606 break; 2607 case T_SHORT: 2608 case T_CHAR: 2609 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2610 break; 2611 case T_INT: 2612 case T_FLOAT: 2613 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2614 break; 2615 case T_LONG: 2616 case T_DOUBLE: 2617 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2618 break; 2619 default: 2620 assert(false,"Should not reach here."); 2621 break; 2622 } 2623 } 2624 2625 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2626 assert(rscratch != noreg || always_reachable(src2), "missing"); 2627 2628 switch(typ) { 2629 case T_BOOLEAN: 2630 case T_BYTE: 2631 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2632 break; 2633 case T_CHAR: 2634 case T_SHORT: 2635 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2636 break; 2637 case T_INT: 2638 case T_FLOAT: 2639 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2640 break; 2641 case T_LONG: 2642 case T_DOUBLE: 2643 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2644 break; 2645 default: 2646 assert(false,"Should not reach here."); 2647 break; 2648 } 2649 } 2650 2651 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2652 switch(typ) { 2653 case T_BYTE: 2654 
evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2655 break; 2656 case T_SHORT: 2657 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2658 break; 2659 case T_INT: 2660 case T_FLOAT: 2661 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2662 break; 2663 case T_LONG: 2664 case T_DOUBLE: 2665 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2666 break; 2667 default: 2668 assert(false,"Should not reach here."); 2669 break; 2670 } 2671 } 2672 2673 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2674 assert(vlen_in_bytes <= 32, ""); 2675 int esize = type2aelembytes(bt); 2676 if (vlen_in_bytes == 32) { 2677 assert(vtmp == xnoreg, "required."); 2678 if (esize >= 4) { 2679 vtestps(src1, src2, AVX_256bit); 2680 } else { 2681 vptest(src1, src2, AVX_256bit); 2682 } 2683 return; 2684 } 2685 if (vlen_in_bytes < 16) { 2686 // Duplicate the lower part to fill the whole register, 2687 // Don't need to do so for src2 2688 assert(vtmp != xnoreg, "required"); 2689 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2690 pshufd(vtmp, src1, shuffle_imm); 2691 } else { 2692 assert(vtmp == xnoreg, "required"); 2693 vtmp = src1; 2694 } 2695 if (esize >= 4 && VM_Version::supports_avx()) { 2696 vtestps(vtmp, src2, AVX_128bit); 2697 } else { 2698 ptest(vtmp, src2); 2699 } 2700 } 2701 2702 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2703 #ifdef ASSERT 2704 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2705 bool is_bw_supported = VM_Version::supports_avx512bw(); 2706 if (is_bw && !is_bw_supported) { 2707 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2708 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2709 "XMM register should be 0-15"); 2710 } 2711 #endif // ASSERT 2712 switch (elem_bt) { 2713 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2714 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2715 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2716 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2717 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2718 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2719 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2720 } 2721 } 2722 2723 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2724 assert(UseAVX >= 2, "required"); 2725 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2726 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2727 if ((UseAVX > 2) && 2728 (!is_bw || VM_Version::supports_avx512bw()) && 2729 (!is_vl || VM_Version::supports_avx512vl())) { 2730 switch (elem_bt) { 2731 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2732 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2733 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2734 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2735 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2736 } 2737 } else { 2738 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2739 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2740 switch (elem_bt) { 2741 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2742 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2743 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2744 
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2745 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2746 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2747 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2748 } 2749 } 2750 } 2751 2752 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2753 switch (to_elem_bt) { 2754 case T_SHORT: 2755 vpmovsxbw(dst, src, vlen_enc); 2756 break; 2757 case T_INT: 2758 vpmovsxbd(dst, src, vlen_enc); 2759 break; 2760 case T_FLOAT: 2761 vpmovsxbd(dst, src, vlen_enc); 2762 vcvtdq2ps(dst, dst, vlen_enc); 2763 break; 2764 case T_LONG: 2765 vpmovsxbq(dst, src, vlen_enc); 2766 break; 2767 case T_DOUBLE: { 2768 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2769 vpmovsxbd(dst, src, mid_vlen_enc); 2770 vcvtdq2pd(dst, dst, vlen_enc); 2771 break; 2772 } 2773 default: 2774 fatal("Unsupported type %s", type2name(to_elem_bt)); 2775 break; 2776 } 2777 } 2778 2779 //------------------------------------------------------------------------------------------- 2780 2781 // IndexOf for constant substrings with size >= 8 chars 2782 // which don't need to be loaded through stack. 2783 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2784 Register cnt1, Register cnt2, 2785 int int_cnt2, Register result, 2786 XMMRegister vec, Register tmp, 2787 int ae) { 2788 ShortBranchVerifier sbv(this); 2789 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2790 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2791 2792 // This method uses the pcmpestri instruction with bound registers 2793 // inputs: 2794 // xmm - substring 2795 // rax - substring length (elements count) 2796 // mem - scanned string 2797 // rdx - string length (elements count) 2798 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2799 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2800 // outputs: 2801 // rcx - matched index in string 2802 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2803 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2804 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2805 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2806 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2807 2808 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2809 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2810 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2811 2812 // Note, inline_string_indexOf() generates checks: 2813 // if (substr.count > string.count) return -1; 2814 // if (substr.count == 0) return 0; 2815 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2816 2817 // Load substring. 
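// For UL the Latin-1 substring bytes are zero-extended to UTF-16 chars so they
// can be compared directly against the two-byte string elements; for the other
// encodings the first 16 bytes of the substring are loaded as-is.
// Roughly, pcmpestri in "equal ordered" mode (0x0c/0x0d) reports in rcx the
// lowest position within the 16-byte fragment at which the substring (or the
// prefix of it that still fits in the fragment) matches, sets CF if any
// position matches, and sets OF if position 0 matches.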
2818 if (ae == StrIntrinsicNode::UL) { 2819 pmovzxbw(vec, Address(str2, 0)); 2820 } else { 2821 movdqu(vec, Address(str2, 0)); 2822 } 2823 movl(cnt2, int_cnt2); 2824 movptr(result, str1); // string addr 2825 2826 if (int_cnt2 > stride) { 2827 jmpb(SCAN_TO_SUBSTR); 2828 2829 // Reload substr for rescan, this code 2830 // is executed only for large substrings (> 8 chars) 2831 bind(RELOAD_SUBSTR); 2832 if (ae == StrIntrinsicNode::UL) { 2833 pmovzxbw(vec, Address(str2, 0)); 2834 } else { 2835 movdqu(vec, Address(str2, 0)); 2836 } 2837 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2838 2839 bind(RELOAD_STR); 2840 // We came here after the beginning of the substring was 2841 // matched but the rest of it was not so we need to search 2842 // again. Start from the next element after the previous match. 2843 2844 // cnt2 is number of substring reminding elements and 2845 // cnt1 is number of string reminding elements when cmp failed. 2846 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2847 subl(cnt1, cnt2); 2848 addl(cnt1, int_cnt2); 2849 movl(cnt2, int_cnt2); // Now restore cnt2 2850 2851 decrementl(cnt1); // Shift to next element 2852 cmpl(cnt1, cnt2); 2853 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2854 2855 addptr(result, (1<<scale1)); 2856 2857 } // (int_cnt2 > 8) 2858 2859 // Scan string for start of substr in 16-byte vectors 2860 bind(SCAN_TO_SUBSTR); 2861 pcmpestri(vec, Address(result, 0), mode); 2862 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2863 subl(cnt1, stride); 2864 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2865 cmpl(cnt1, cnt2); 2866 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2867 addptr(result, 16); 2868 jmpb(SCAN_TO_SUBSTR); 2869 2870 // Found a potential substr 2871 bind(FOUND_CANDIDATE); 2872 // Matched whole vector if first element matched (tmp(rcx) == 0). 2873 if (int_cnt2 == stride) { 2874 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2875 } else { // int_cnt2 > 8 2876 jccb(Assembler::overflow, FOUND_SUBSTR); 2877 } 2878 // After pcmpestri tmp(rcx) contains matched element index 2879 // Compute start addr of substr 2880 lea(result, Address(result, tmp, scale1)); 2881 2882 // Make sure string is still long enough 2883 subl(cnt1, tmp); 2884 cmpl(cnt1, cnt2); 2885 if (int_cnt2 == stride) { 2886 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2887 } else { // int_cnt2 > 8 2888 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2889 } 2890 // Left less then substring. 2891 2892 bind(RET_NOT_FOUND); 2893 movl(result, -1); 2894 jmp(EXIT); 2895 2896 if (int_cnt2 > stride) { 2897 // This code is optimized for the case when whole substring 2898 // is matched if its head is matched. 2899 bind(MATCH_SUBSTR_HEAD); 2900 pcmpestri(vec, Address(result, 0), mode); 2901 // Reload only string if does not match 2902 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2903 2904 Label CONT_SCAN_SUBSTR; 2905 // Compare the rest of substring (> 8 chars). 2906 bind(FOUND_SUBSTR); 2907 // First 8 chars are already matched. 
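// Switch to a negative index: cnt2 becomes (stride - int_cnt2), i.e. minus the
// number of chars still to be verified, and is stepped toward zero by one
// stride per pass, so the (str2, cnt2, scale2, tail_off2) addressing below
// walks the not-yet-matched tail of the substring one fragment at a time.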
2908 negptr(cnt2); 2909 addptr(cnt2, stride); 2910 2911 bind(SCAN_SUBSTR); 2912 subl(cnt1, stride); 2913 cmpl(cnt2, -stride); // Do not read beyond substring 2914 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2915 // Back-up strings to avoid reading beyond substring: 2916 // cnt1 = cnt1 - cnt2 + 8 2917 addl(cnt1, cnt2); // cnt2 is negative 2918 addl(cnt1, stride); 2919 movl(cnt2, stride); negptr(cnt2); 2920 bind(CONT_SCAN_SUBSTR); 2921 if (int_cnt2 < (int)G) { 2922 int tail_off1 = int_cnt2<<scale1; 2923 int tail_off2 = int_cnt2<<scale2; 2924 if (ae == StrIntrinsicNode::UL) { 2925 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2926 } else { 2927 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2928 } 2929 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2930 } else { 2931 // calculate index in register to avoid integer overflow (int_cnt2*2) 2932 movl(tmp, int_cnt2); 2933 addptr(tmp, cnt2); 2934 if (ae == StrIntrinsicNode::UL) { 2935 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2936 } else { 2937 movdqu(vec, Address(str2, tmp, scale2, 0)); 2938 } 2939 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2940 } 2941 // Need to reload strings pointers if not matched whole vector 2942 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2943 addptr(cnt2, stride); 2944 jcc(Assembler::negative, SCAN_SUBSTR); 2945 // Fall through if found full substring 2946 2947 } // (int_cnt2 > 8) 2948 2949 bind(RET_FOUND); 2950 // Found result if we matched full small substring. 2951 // Compute substr offset 2952 subptr(result, str1); 2953 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2954 shrl(result, 1); // index 2955 } 2956 bind(EXIT); 2957 2958 } // string_indexofC8 2959 2960 // Small strings are loaded through stack if they cross page boundary. 2961 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2962 Register cnt1, Register cnt2, 2963 int int_cnt2, Register result, 2964 XMMRegister vec, Register tmp, 2965 int ae) { 2966 ShortBranchVerifier sbv(this); 2967 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2968 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2969 2970 // 2971 // int_cnt2 is length of small (< 8 chars) constant substring 2972 // or (-1) for non constant substring in which case its length 2973 // is in cnt2 register. 2974 // 2975 // Note, inline_string_indexOf() generates checks: 2976 // if (substr.count > string.count) return -1; 2977 // if (substr.count == 0) return 0; 2978 // 2979 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2980 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2981 // This method uses the pcmpestri instruction with bound registers 2982 // inputs: 2983 // xmm - substring 2984 // rax - substring length (elements count) 2985 // mem - scanned string 2986 // rdx - string length (elements count) 2987 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2988 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2989 // outputs: 2990 // rcx - matched index in string 2991 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2992 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2993 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2994 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2995 2996 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2997 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2998 FOUND_CANDIDATE; 2999 3000 { //======================================================== 3001 // We don't know where these strings are located 3002 // and we can't read beyond them. Load them through stack. 3003 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3004 3005 movptr(tmp, rsp); // save old SP 3006 3007 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3008 if (int_cnt2 == (1>>scale2)) { // One byte 3009 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3010 load_unsigned_byte(result, Address(str2, 0)); 3011 movdl(vec, result); // move 32 bits 3012 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3013 // Not enough header space in 32-bit VM: 12+3 = 15. 3014 movl(result, Address(str2, -1)); 3015 shrl(result, 8); 3016 movdl(vec, result); // move 32 bits 3017 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3018 load_unsigned_short(result, Address(str2, 0)); 3019 movdl(vec, result); // move 32 bits 3020 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3021 movdl(vec, Address(str2, 0)); // move 32 bits 3022 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3023 movq(vec, Address(str2, 0)); // move 64 bits 3024 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3025 // Array header size is 12 bytes in 32-bit VM 3026 // + 6 bytes for 3 chars == 18 bytes, 3027 // enough space to load vec and shift. 3028 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3029 if (ae == StrIntrinsicNode::UL) { 3030 int tail_off = int_cnt2-8; 3031 pmovzxbw(vec, Address(str2, tail_off)); 3032 psrldq(vec, -2*tail_off); 3033 } 3034 else { 3035 int tail_off = int_cnt2*(1<<scale2); 3036 movdqu(vec, Address(str2, tail_off-16)); 3037 psrldq(vec, 16-tail_off); 3038 } 3039 } 3040 } else { // not constant substring 3041 cmpl(cnt2, stride); 3042 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3043 3044 // We can read beyond string if srt+16 does not cross page boundary 3045 // since heaps are aligned and mapped by pages. 3046 assert(os::vm_page_size() < (int)G, "default page should be small"); 3047 movl(result, str2); // We need only low 32 bits 3048 andl(result, ((int)os::vm_page_size()-1)); 3049 cmpl(result, ((int)os::vm_page_size()-16)); 3050 jccb(Assembler::belowEqual, CHECK_STR); 3051 3052 // Move small strings to stack to allow load 16 bytes into vec. 3053 subptr(rsp, 16); 3054 int stk_offset = wordSize-(1<<scale2); 3055 push(cnt2); 3056 3057 bind(COPY_SUBSTR); 3058 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3059 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3060 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3061 } else if (ae == StrIntrinsicNode::UU) { 3062 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3063 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3064 } 3065 decrement(cnt2); 3066 jccb(Assembler::notZero, COPY_SUBSTR); 3067 3068 pop(cnt2); 3069 movptr(str2, rsp); // New substring address 3070 } // non constant 3071 3072 bind(CHECK_STR); 3073 cmpl(cnt1, stride); 3074 jccb(Assembler::aboveEqual, BIG_STRINGS); 3075 3076 // Check cross page boundary. 
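// Same page test as for str2 above: with page size P, an address whose offset
// within the page satisfies (addr & (P-1)) <= P-16 can be read with a 16-byte
// load without touching the next page (e.g. for 4K pages the low 12 bits must
// be at most 0xff0).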
3077 movl(result, str1); // We need only low 32 bits 3078 andl(result, ((int)os::vm_page_size()-1)); 3079 cmpl(result, ((int)os::vm_page_size()-16)); 3080 jccb(Assembler::belowEqual, BIG_STRINGS); 3081 3082 subptr(rsp, 16); 3083 int stk_offset = -(1<<scale1); 3084 if (int_cnt2 < 0) { // not constant 3085 push(cnt2); 3086 stk_offset += wordSize; 3087 } 3088 movl(cnt2, cnt1); 3089 3090 bind(COPY_STR); 3091 if (ae == StrIntrinsicNode::LL) { 3092 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3093 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3094 } else { 3095 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3096 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3097 } 3098 decrement(cnt2); 3099 jccb(Assembler::notZero, COPY_STR); 3100 3101 if (int_cnt2 < 0) { // not constant 3102 pop(cnt2); 3103 } 3104 movptr(str1, rsp); // New string address 3105 3106 bind(BIG_STRINGS); 3107 // Load substring. 3108 if (int_cnt2 < 0) { // -1 3109 if (ae == StrIntrinsicNode::UL) { 3110 pmovzxbw(vec, Address(str2, 0)); 3111 } else { 3112 movdqu(vec, Address(str2, 0)); 3113 } 3114 push(cnt2); // substr count 3115 push(str2); // substr addr 3116 push(str1); // string addr 3117 } else { 3118 // Small (< 8 chars) constant substrings are loaded already. 3119 movl(cnt2, int_cnt2); 3120 } 3121 push(tmp); // original SP 3122 3123 } // Finished loading 3124 3125 //======================================================== 3126 // Start search 3127 // 3128 3129 movptr(result, str1); // string addr 3130 3131 if (int_cnt2 < 0) { // Only for non constant substring 3132 jmpb(SCAN_TO_SUBSTR); 3133 3134 // SP saved at sp+0 3135 // String saved at sp+1*wordSize 3136 // Substr saved at sp+2*wordSize 3137 // Substr count saved at sp+3*wordSize 3138 3139 // Reload substr for rescan, this code 3140 // is executed only for large substrings (> 8 chars) 3141 bind(RELOAD_SUBSTR); 3142 movptr(str2, Address(rsp, 2*wordSize)); 3143 movl(cnt2, Address(rsp, 3*wordSize)); 3144 if (ae == StrIntrinsicNode::UL) { 3145 pmovzxbw(vec, Address(str2, 0)); 3146 } else { 3147 movdqu(vec, Address(str2, 0)); 3148 } 3149 // We came here after the beginning of the substring was 3150 // matched but the rest of it was not so we need to search 3151 // again. Start from the next element after the previous match. 3152 subptr(str1, result); // Restore counter 3153 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3154 shrl(str1, 1); 3155 } 3156 addl(cnt1, str1); 3157 decrementl(cnt1); // Shift to next element 3158 cmpl(cnt1, cnt2); 3159 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3160 3161 addptr(result, (1<<scale1)); 3162 } // non constant 3163 3164 // Scan string for start of substr in 16-byte vectors 3165 bind(SCAN_TO_SUBSTR); 3166 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3167 pcmpestri(vec, Address(result, 0), mode); 3168 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3169 subl(cnt1, stride); 3170 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3171 cmpl(cnt1, cnt2); 3172 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3173 addptr(result, 16); 3174 3175 bind(ADJUST_STR); 3176 cmpl(cnt1, stride); // Do not read beyond string 3177 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3178 // Back-up string to avoid reading beyond string. 
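// (Equivalent to: result += cnt1 * elem_size - 16; cnt1 = stride; so the
//  final, partial chunk is re-scanned flush against the end of the string.)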
    lea(result, Address(result, cnt1, scale1, -16));
    movl(cnt1, stride);
    jmpb(SCAN_TO_SUBSTR);

    // Found a potential substr
    bind(FOUND_CANDIDATE);
    // After pcmpestri tmp(rcx) contains matched element index

    // Make sure string is still long enough
    subl(cnt1, tmp);
    cmpl(cnt1, cnt2);
    jccb(Assembler::greaterEqual, FOUND_SUBSTR);
    // Left less than substring.

    bind(RET_NOT_FOUND);
    movl(result, -1);
    jmp(CLEANUP);

    bind(FOUND_SUBSTR);
    // Compute start addr of substr
    lea(result, Address(result, tmp, scale1));
    if (int_cnt2 > 0) { // Constant substring
      // Repeat search for small substring (< 8 chars)
      // from new point without reloading substring.
      // Have to check that we don't read beyond string.
      cmpl(tmp, stride-int_cnt2);
      jccb(Assembler::greater, ADJUST_STR);
      // Fall through if matched whole substring.
    } else { // non constant
      assert(int_cnt2 == -1, "should be != 0");

      addl(tmp, cnt2);
      // Found result if we matched whole substring.
      cmpl(tmp, stride);
      jcc(Assembler::lessEqual, RET_FOUND);

      // Repeat search for small substring (<= 8 chars)
      // from new point 'str1' without reloading substring.
      cmpl(cnt2, stride);
      // Have to check that we don't read beyond string.
      jccb(Assembler::lessEqual, ADJUST_STR);

      Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
      // Compare the rest of substring (> 8 chars).
      movptr(str1, result);

      cmpl(tmp, cnt2);
      // First 8 chars are already matched.
      jccb(Assembler::equal, CHECK_NEXT);

      bind(SCAN_SUBSTR);
      pcmpestri(vec, Address(str1, 0), mode);
      // Need to reload string pointers if not matched whole vector
      jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

      bind(CHECK_NEXT);
      subl(cnt2, stride);
      jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
      addptr(str1, 16);
      if (ae == StrIntrinsicNode::UL) {
        addptr(str2, 8);
      } else {
        addptr(str2, 16);
      }
      subl(cnt1, stride);
      cmpl(cnt2, stride); // Do not read beyond substring
      jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
      // Back-up strings to avoid reading beyond substring.
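      // (Reposition str1/str2 so that the last stride-sized chunk of the
      //  substring is re-read, then restore cnt1/cnt2 below to match the
      //  adjusted pointers.)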
3247 3248 if (ae == StrIntrinsicNode::UL) { 3249 lea(str2, Address(str2, cnt2, scale2, -8)); 3250 lea(str1, Address(str1, cnt2, scale1, -16)); 3251 } else { 3252 lea(str2, Address(str2, cnt2, scale2, -16)); 3253 lea(str1, Address(str1, cnt2, scale1, -16)); 3254 } 3255 subl(cnt1, cnt2); 3256 movl(cnt2, stride); 3257 addl(cnt1, stride); 3258 bind(CONT_SCAN_SUBSTR); 3259 if (ae == StrIntrinsicNode::UL) { 3260 pmovzxbw(vec, Address(str2, 0)); 3261 } else { 3262 movdqu(vec, Address(str2, 0)); 3263 } 3264 jmp(SCAN_SUBSTR); 3265 3266 bind(RET_FOUND_LONG); 3267 movptr(str1, Address(rsp, wordSize)); 3268 } // non constant 3269 3270 bind(RET_FOUND); 3271 // Compute substr offset 3272 subptr(result, str1); 3273 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3274 shrl(result, 1); // index 3275 } 3276 bind(CLEANUP); 3277 pop(rsp); // restore SP 3278 3279 } // string_indexof 3280 3281 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3282 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3283 ShortBranchVerifier sbv(this); 3284 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3285 3286 int stride = 8; 3287 3288 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3289 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3290 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3291 FOUND_SEQ_CHAR, DONE_LABEL; 3292 3293 movptr(result, str1); 3294 if (UseAVX >= 2) { 3295 cmpl(cnt1, stride); 3296 jcc(Assembler::less, SCAN_TO_CHAR); 3297 cmpl(cnt1, 2*stride); 3298 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3299 movdl(vec1, ch); 3300 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3301 vpxor(vec2, vec2); 3302 movl(tmp, cnt1); 3303 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3304 andl(cnt1,0x0000000F); //tail count (in chars) 3305 3306 bind(SCAN_TO_16_CHAR_LOOP); 3307 vmovdqu(vec3, Address(result, 0)); 3308 vpcmpeqw(vec3, vec3, vec1, 1); 3309 vptest(vec2, vec3); 3310 jcc(Assembler::carryClear, FOUND_CHAR); 3311 addptr(result, 32); 3312 subl(tmp, 2*stride); 3313 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3314 jmp(SCAN_TO_8_CHAR); 3315 bind(SCAN_TO_8_CHAR_INIT); 3316 movdl(vec1, ch); 3317 pshuflw(vec1, vec1, 0x00); 3318 pshufd(vec1, vec1, 0); 3319 pxor(vec2, vec2); 3320 } 3321 bind(SCAN_TO_8_CHAR); 3322 cmpl(cnt1, stride); 3323 jcc(Assembler::less, SCAN_TO_CHAR); 3324 if (UseAVX < 2) { 3325 movdl(vec1, ch); 3326 pshuflw(vec1, vec1, 0x00); 3327 pshufd(vec1, vec1, 0); 3328 pxor(vec2, vec2); 3329 } 3330 movl(tmp, cnt1); 3331 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3332 andl(cnt1,0x00000007); //tail count (in chars) 3333 3334 bind(SCAN_TO_8_CHAR_LOOP); 3335 movdqu(vec3, Address(result, 0)); 3336 pcmpeqw(vec3, vec1); 3337 ptest(vec2, vec3); 3338 jcc(Assembler::carryClear, FOUND_CHAR); 3339 addptr(result, 16); 3340 subl(tmp, stride); 3341 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3342 bind(SCAN_TO_CHAR); 3343 testl(cnt1, cnt1); 3344 jcc(Assembler::zero, RET_NOT_FOUND); 3345 bind(SCAN_TO_CHAR_LOOP); 3346 load_unsigned_short(tmp, Address(result, 0)); 3347 cmpl(ch, tmp); 3348 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3349 addptr(result, 2); 3350 subl(cnt1, 1); 3351 jccb(Assembler::zero, RET_NOT_FOUND); 3352 jmp(SCAN_TO_CHAR_LOOP); 3353 3354 bind(RET_NOT_FOUND); 3355 movl(result, -1); 3356 jmpb(DONE_LABEL); 3357 3358 bind(FOUND_CHAR); 3359 if (UseAVX >= 2) { 3360 vpmovmskb(tmp, vec3); 3361 } else { 3362 pmovmskb(tmp, vec3); 3363 } 3364 bsfl(ch, tmp); 3365 addptr(result, ch); 3366 3367 bind(FOUND_SEQ_CHAR); 3368 
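  // result currently holds the address of the matching char; convert it into
  // a char index relative to str1 (byte offset divided by 2 for UTF-16 data).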
subptr(result, str1); 3369 shrl(result, 1); 3370 3371 bind(DONE_LABEL); 3372 } // string_indexof_char 3373 3374 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3375 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3376 ShortBranchVerifier sbv(this); 3377 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3378 3379 int stride = 16; 3380 3381 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3382 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3383 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3384 FOUND_SEQ_CHAR, DONE_LABEL; 3385 3386 movptr(result, str1); 3387 if (UseAVX >= 2) { 3388 cmpl(cnt1, stride); 3389 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3390 cmpl(cnt1, stride*2); 3391 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3392 movdl(vec1, ch); 3393 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3394 vpxor(vec2, vec2); 3395 movl(tmp, cnt1); 3396 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3397 andl(cnt1,0x0000001F); //tail count (in chars) 3398 3399 bind(SCAN_TO_32_CHAR_LOOP); 3400 vmovdqu(vec3, Address(result, 0)); 3401 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3402 vptest(vec2, vec3); 3403 jcc(Assembler::carryClear, FOUND_CHAR); 3404 addptr(result, 32); 3405 subl(tmp, stride*2); 3406 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3407 jmp(SCAN_TO_16_CHAR); 3408 3409 bind(SCAN_TO_16_CHAR_INIT); 3410 movdl(vec1, ch); 3411 pxor(vec2, vec2); 3412 pshufb(vec1, vec2); 3413 } 3414 3415 bind(SCAN_TO_16_CHAR); 3416 cmpl(cnt1, stride); 3417 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3418 if (UseAVX < 2) { 3419 movdl(vec1, ch); 3420 pxor(vec2, vec2); 3421 pshufb(vec1, vec2); 3422 } 3423 movl(tmp, cnt1); 3424 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3425 andl(cnt1,0x0000000F); //tail count (in bytes) 3426 3427 bind(SCAN_TO_16_CHAR_LOOP); 3428 movdqu(vec3, Address(result, 0)); 3429 pcmpeqb(vec3, vec1); 3430 ptest(vec2, vec3); 3431 jcc(Assembler::carryClear, FOUND_CHAR); 3432 addptr(result, 16); 3433 subl(tmp, stride); 3434 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
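  // Scalar tail: fewer than 16 bytes are left, so compare them one at a time.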
3435 3436 bind(SCAN_TO_CHAR_INIT); 3437 testl(cnt1, cnt1); 3438 jcc(Assembler::zero, RET_NOT_FOUND); 3439 bind(SCAN_TO_CHAR_LOOP); 3440 load_unsigned_byte(tmp, Address(result, 0)); 3441 cmpl(ch, tmp); 3442 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3443 addptr(result, 1); 3444 subl(cnt1, 1); 3445 jccb(Assembler::zero, RET_NOT_FOUND); 3446 jmp(SCAN_TO_CHAR_LOOP); 3447 3448 bind(RET_NOT_FOUND); 3449 movl(result, -1); 3450 jmpb(DONE_LABEL); 3451 3452 bind(FOUND_CHAR); 3453 if (UseAVX >= 2) { 3454 vpmovmskb(tmp, vec3); 3455 } else { 3456 pmovmskb(tmp, vec3); 3457 } 3458 bsfl(ch, tmp); 3459 addptr(result, ch); 3460 3461 bind(FOUND_SEQ_CHAR); 3462 subptr(result, str1); 3463 3464 bind(DONE_LABEL); 3465 } // stringL_indexof_char 3466 3467 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3468 switch (eltype) { 3469 case T_BOOLEAN: return sizeof(jboolean); 3470 case T_BYTE: return sizeof(jbyte); 3471 case T_SHORT: return sizeof(jshort); 3472 case T_CHAR: return sizeof(jchar); 3473 case T_INT: return sizeof(jint); 3474 default: 3475 ShouldNotReachHere(); 3476 return -1; 3477 } 3478 } 3479 3480 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3481 switch (eltype) { 3482 // T_BOOLEAN used as surrogate for unsigned byte 3483 case T_BOOLEAN: movzbl(dst, src); break; 3484 case T_BYTE: movsbl(dst, src); break; 3485 case T_SHORT: movswl(dst, src); break; 3486 case T_CHAR: movzwl(dst, src); break; 3487 case T_INT: movl(dst, src); break; 3488 default: 3489 ShouldNotReachHere(); 3490 } 3491 } 3492 3493 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3494 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3495 } 3496 3497 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3498 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3499 } 3500 3501 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3502 const int vlen = Assembler::AVX_256bit; 3503 switch (eltype) { 3504 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3505 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3506 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3507 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3508 case T_INT: 3509 // do nothing 3510 break; 3511 default: 3512 ShouldNotReachHere(); 3513 } 3514 } 3515 3516 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3517 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3518 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3519 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3520 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3521 BasicType eltype) { 3522 ShortBranchVerifier sbv(this); 3523 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3524 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3525 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3526 3527 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3528 SHORT_UNROLLED_LOOP_EXIT, 3529 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3530 UNROLLED_VECTOR_LOOP_BEGIN, 3531 END; 3532 switch (eltype) { 3533 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3534 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3535 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3536 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3537 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3538 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3539 } 3540 3541 // For "renaming" for readibility of the code 3542 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3543 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3544 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3545 3546 const int elsize = arrays_hashcode_elsize(eltype); 3547 3548 /* 3549 if (cnt1 >= 2) { 3550 if (cnt1 >= 32) { 3551 UNROLLED VECTOR LOOP 3552 } 3553 UNROLLED SCALAR LOOP 3554 } 3555 SINGLE SCALAR 3556 */ 3557 3558 cmpl(cnt1, 32); 3559 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3560 3561 // cnt1 >= 32 && generate_vectorized_loop 3562 xorl(index, index); 3563 3564 // vresult = IntVector.zero(I256); 3565 for (int idx = 0; idx < 4; idx++) { 3566 vpxor(vresult[idx], vresult[idx]); 3567 } 3568 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3569 Register bound = tmp2; 3570 Register next = tmp3; 3571 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3572 movl(next, Address(tmp2, 0)); 3573 movdl(vnext, next); 3574 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3575 3576 // index = 0; 3577 // bound = cnt1 & ~(32 - 1); 3578 movl(bound, cnt1); 3579 andl(bound, ~(32 - 1)); 3580 // for (; index < bound; index += 32) { 3581 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3582 // result *= next; 3583 imull(result, next); 3584 // loop fission to upfront the cost of fetching from memory, OOO execution 3585 // can then hopefully do a better job of prefetching 3586 for (int idx = 0; idx < 4; idx++) { 3587 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3588 } 3589 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3590 for (int idx = 0; idx < 4; idx++) { 3591 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3592 arrays_hashcode_elvcast(vtmp[idx], eltype); 3593 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3594 } 3595 // index += 32; 3596 addl(index, 32); 3597 // index < bound; 3598 cmpl(index, bound); 3599 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3600 // } 3601 3602 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3603 subl(cnt1, bound); 3604 // release bound 3605 3606 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3607 for (int idx = 0; idx < 4; idx++) { 3608 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3609 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3610 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3611 } 3612 // result += vresult.reduceLanes(ADD); 3613 for (int idx = 0; idx < 4; idx++) { 3614 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3615 } 3616 3617 // } else if (cnt1 < 32) { 3618 3619 bind(SHORT_UNROLLED_BEGIN); 3620 // int i = 1; 3621 movl(index, 1); 3622 cmpl(index, cnt1); 3623 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3624 3625 // for (; i < cnt1 ; i += 2) { 3626 bind(SHORT_UNROLLED_LOOP_BEGIN); 3627 movl(tmp3, 961); 3628 imull(result, tmp3); 3629 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3630 movl(tmp3, tmp2); 3631 shll(tmp3, 5); 3632 subl(tmp3, tmp2); 3633 addl(result, tmp3); 3634 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3635 addl(result, tmp3); 3636 addl(index, 2); 3637 cmpl(index, cnt1); 3638 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3639 3640 // } 3641 // if (i >= cnt1) { 3642 bind(SHORT_UNROLLED_LOOP_EXIT); 3643 jccb(Assembler::greater, END); 3644 movl(tmp2, result); 3645 shll(result, 5); 3646 subl(result, tmp2); 3647 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3648 addl(result, tmp3); 3649 // } 3650 bind(END); 3651 3652 BLOCK_COMMENT("} // arrays_hashcode"); 3653 3654 } // arrays_hashcode 3655 3656 // helper function for string_compare 3657 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3658 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3659 Address::ScaleFactor scale2, Register index, int ae) { 3660 if (ae == StrIntrinsicNode::LL) { 3661 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3662 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3663 } else if (ae == StrIntrinsicNode::UU) { 3664 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3665 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3666 } else { 3667 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3668 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3669 } 3670 } 3671 3672 // Compare strings, used for char[] and byte[]. 3673 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3674 Register cnt1, Register cnt2, Register result, 3675 XMMRegister vec1, int ae, KRegister mask) { 3676 ShortBranchVerifier sbv(this); 3677 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3678 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3679 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3680 int stride2x2 = 0x40; 3681 Address::ScaleFactor scale = Address::no_scale; 3682 Address::ScaleFactor scale1 = Address::no_scale; 3683 Address::ScaleFactor scale2 = Address::no_scale; 3684 3685 if (ae != StrIntrinsicNode::LL) { 3686 stride2x2 = 0x20; 3687 } 3688 3689 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3690 shrl(cnt2, 1); 3691 } 3692 // Compute the minimum of the string lengths and the 3693 // difference of the string lengths (stack). 3694 // Do the conditional move stuff 3695 movl(result, cnt1); 3696 subl(cnt1, cnt2); 3697 push(cnt1); 3698 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3699 3700 // Is the minimum length zero? 
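  // (If so, the result is just the length difference that was pushed above.)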
3701 testl(cnt2, cnt2); 3702 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3703 if (ae == StrIntrinsicNode::LL) { 3704 // Load first bytes 3705 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3706 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3707 } else if (ae == StrIntrinsicNode::UU) { 3708 // Load first characters 3709 load_unsigned_short(result, Address(str1, 0)); 3710 load_unsigned_short(cnt1, Address(str2, 0)); 3711 } else { 3712 load_unsigned_byte(result, Address(str1, 0)); 3713 load_unsigned_short(cnt1, Address(str2, 0)); 3714 } 3715 subl(result, cnt1); 3716 jcc(Assembler::notZero, POP_LABEL); 3717 3718 if (ae == StrIntrinsicNode::UU) { 3719 // Divide length by 2 to get number of chars 3720 shrl(cnt2, 1); 3721 } 3722 cmpl(cnt2, 1); 3723 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3724 3725 // Check if the strings start at the same location and setup scale and stride 3726 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3727 cmpptr(str1, str2); 3728 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3729 if (ae == StrIntrinsicNode::LL) { 3730 scale = Address::times_1; 3731 stride = 16; 3732 } else { 3733 scale = Address::times_2; 3734 stride = 8; 3735 } 3736 } else { 3737 scale1 = Address::times_1; 3738 scale2 = Address::times_2; 3739 // scale not used 3740 stride = 8; 3741 } 3742 3743 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3744 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3745 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3746 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3747 Label COMPARE_TAIL_LONG; 3748 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3749 3750 int pcmpmask = 0x19; 3751 if (ae == StrIntrinsicNode::LL) { 3752 pcmpmask &= ~0x01; 3753 } 3754 3755 // Setup to compare 16-chars (32-bytes) vectors, 3756 // start from first character again because it has aligned address. 3757 if (ae == StrIntrinsicNode::LL) { 3758 stride2 = 32; 3759 } else { 3760 stride2 = 16; 3761 } 3762 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3763 adr_stride = stride << scale; 3764 } else { 3765 adr_stride1 = 8; //stride << scale1; 3766 adr_stride2 = 16; //stride << scale2; 3767 } 3768 3769 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3770 // rax and rdx are used by pcmpestri as elements counters 3771 movl(result, cnt2); 3772 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3773 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3774 3775 // fast path : compare first 2 8-char vectors. 
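    // Sketch: two pcmpestri probes cover the first 16 chars; on a mismatch
    // rcx holds the element index within the probed 8-char block, and
    // COMPARE_INDEX_CHAR below turns that index into the signed compare result.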
3776 bind(COMPARE_16_CHARS); 3777 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3778 movdqu(vec1, Address(str1, 0)); 3779 } else { 3780 pmovzxbw(vec1, Address(str1, 0)); 3781 } 3782 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3783 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3784 3785 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3786 movdqu(vec1, Address(str1, adr_stride)); 3787 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3788 } else { 3789 pmovzxbw(vec1, Address(str1, adr_stride1)); 3790 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3791 } 3792 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3793 addl(cnt1, stride); 3794 3795 // Compare the characters at index in cnt1 3796 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3797 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3798 subl(result, cnt2); 3799 jmp(POP_LABEL); 3800 3801 // Setup the registers to start vector comparison loop 3802 bind(COMPARE_WIDE_VECTORS); 3803 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3804 lea(str1, Address(str1, result, scale)); 3805 lea(str2, Address(str2, result, scale)); 3806 } else { 3807 lea(str1, Address(str1, result, scale1)); 3808 lea(str2, Address(str2, result, scale2)); 3809 } 3810 subl(result, stride2); 3811 subl(cnt2, stride2); 3812 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3813 negptr(result); 3814 3815 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3816 bind(COMPARE_WIDE_VECTORS_LOOP); 3817 3818 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3819 cmpl(cnt2, stride2x2); 3820 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3821 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3822 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3823 3824 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3825 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3826 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3827 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3828 } else { 3829 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3830 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3831 } 3832 kortestql(mask, mask); 3833 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3834 addptr(result, stride2x2); // update since we already compared at this addr 3835 subl(cnt2, stride2x2); // and sub the size too 3836 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3837 3838 vpxor(vec1, vec1); 3839 jmpb(COMPARE_WIDE_TAIL); 3840 }//if (VM_Version::supports_avx512vlbw()) 3841 3842 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3843 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3844 vmovdqu(vec1, Address(str1, result, scale)); 3845 vpxor(vec1, Address(str2, result, scale)); 3846 } else { 3847 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3848 vpxor(vec1, Address(str2, result, scale2)); 3849 } 3850 vptest(vec1, vec1); 3851 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3852 addptr(result, stride2); 3853 subl(cnt2, stride2); 3854 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3855 // clean upper bits of YMM registers 3856 vpxor(vec1, vec1); 3857 3858 // compare 
wide vectors tail 3859 bind(COMPARE_WIDE_TAIL); 3860 testptr(result, result); 3861 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3862 3863 movl(result, stride2); 3864 movl(cnt2, result); 3865 negptr(result); 3866 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3867 3868 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3869 bind(VECTOR_NOT_EQUAL); 3870 // clean upper bits of YMM registers 3871 vpxor(vec1, vec1); 3872 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3873 lea(str1, Address(str1, result, scale)); 3874 lea(str2, Address(str2, result, scale)); 3875 } else { 3876 lea(str1, Address(str1, result, scale1)); 3877 lea(str2, Address(str2, result, scale2)); 3878 } 3879 jmp(COMPARE_16_CHARS); 3880 3881 // Compare tail chars, length between 1 to 15 chars 3882 bind(COMPARE_TAIL_LONG); 3883 movl(cnt2, result); 3884 cmpl(cnt2, stride); 3885 jcc(Assembler::less, COMPARE_SMALL_STR); 3886 3887 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3888 movdqu(vec1, Address(str1, 0)); 3889 } else { 3890 pmovzxbw(vec1, Address(str1, 0)); 3891 } 3892 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3893 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3894 subptr(cnt2, stride); 3895 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3896 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3897 lea(str1, Address(str1, result, scale)); 3898 lea(str2, Address(str2, result, scale)); 3899 } else { 3900 lea(str1, Address(str1, result, scale1)); 3901 lea(str2, Address(str2, result, scale2)); 3902 } 3903 negptr(cnt2); 3904 jmpb(WHILE_HEAD_LABEL); 3905 3906 bind(COMPARE_SMALL_STR); 3907 } else if (UseSSE42Intrinsics) { 3908 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3909 int pcmpmask = 0x19; 3910 // Setup to compare 8-char (16-byte) vectors, 3911 // start from first character again because it has aligned address. 
3912 movl(result, cnt2); 3913 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3914 if (ae == StrIntrinsicNode::LL) { 3915 pcmpmask &= ~0x01; 3916 } 3917 jcc(Assembler::zero, COMPARE_TAIL); 3918 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3919 lea(str1, Address(str1, result, scale)); 3920 lea(str2, Address(str2, result, scale)); 3921 } else { 3922 lea(str1, Address(str1, result, scale1)); 3923 lea(str2, Address(str2, result, scale2)); 3924 } 3925 negptr(result); 3926 3927 // pcmpestri 3928 // inputs: 3929 // vec1- substring 3930 // rax - negative string length (elements count) 3931 // mem - scanned string 3932 // rdx - string length (elements count) 3933 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3934 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3935 // outputs: 3936 // rcx - first mismatched element index 3937 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3938 3939 bind(COMPARE_WIDE_VECTORS); 3940 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3941 movdqu(vec1, Address(str1, result, scale)); 3942 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3943 } else { 3944 pmovzxbw(vec1, Address(str1, result, scale1)); 3945 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3946 } 3947 // After pcmpestri cnt1(rcx) contains mismatched element index 3948 3949 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3950 addptr(result, stride); 3951 subptr(cnt2, stride); 3952 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3953 3954 // compare wide vectors tail 3955 testptr(result, result); 3956 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3957 3958 movl(cnt2, stride); 3959 movl(result, stride); 3960 negptr(result); 3961 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3962 movdqu(vec1, Address(str1, result, scale)); 3963 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3964 } else { 3965 pmovzxbw(vec1, Address(str1, result, scale1)); 3966 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3967 } 3968 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3969 3970 // Mismatched characters in the vectors 3971 bind(VECTOR_NOT_EQUAL); 3972 addptr(cnt1, result); 3973 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3974 subl(result, cnt2); 3975 jmpb(POP_LABEL); 3976 3977 bind(COMPARE_TAIL); // limit is zero 3978 movl(cnt2, result); 3979 // Fallthru to tail compare 3980 } 3981 // Shift str2 and str1 to the end of the arrays, negate min 3982 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3983 lea(str1, Address(str1, cnt2, scale)); 3984 lea(str2, Address(str2, cnt2, scale)); 3985 } else { 3986 lea(str1, Address(str1, cnt2, scale1)); 3987 lea(str2, Address(str2, cnt2, scale2)); 3988 } 3989 decrementl(cnt2); // first character was compared already 3990 negptr(cnt2); 3991 3992 // Compare the rest of the elements 3993 bind(WHILE_HEAD_LABEL); 3994 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3995 subl(result, cnt1); 3996 jccb(Assembler::notZero, POP_LABEL); 3997 increment(cnt2); 3998 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3999 4000 // Strings are equal up to min length. Return the length difference. 
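  // Roughly, in Java terms (cf. String.compareTo):
  //   if (mismatch at index k) return c1[k] - c2[k];
  //   else                     return len1 - len2;
  // (for UL the sign of the result is flipped at DONE_LABEL below).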
4001 bind(LENGTH_DIFF_LABEL); 4002 pop(result); 4003 if (ae == StrIntrinsicNode::UU) { 4004 // Divide diff by 2 to get number of chars 4005 sarl(result, 1); 4006 } 4007 jmpb(DONE_LABEL); 4008 4009 if (VM_Version::supports_avx512vlbw()) { 4010 4011 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4012 4013 kmovql(cnt1, mask); 4014 notq(cnt1); 4015 bsfq(cnt2, cnt1); 4016 if (ae != StrIntrinsicNode::LL) { 4017 // Divide diff by 2 to get number of chars 4018 sarl(cnt2, 1); 4019 } 4020 addq(result, cnt2); 4021 if (ae == StrIntrinsicNode::LL) { 4022 load_unsigned_byte(cnt1, Address(str2, result)); 4023 load_unsigned_byte(result, Address(str1, result)); 4024 } else if (ae == StrIntrinsicNode::UU) { 4025 load_unsigned_short(cnt1, Address(str2, result, scale)); 4026 load_unsigned_short(result, Address(str1, result, scale)); 4027 } else { 4028 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4029 load_unsigned_byte(result, Address(str1, result, scale1)); 4030 } 4031 subl(result, cnt1); 4032 jmpb(POP_LABEL); 4033 }//if (VM_Version::supports_avx512vlbw()) 4034 4035 // Discard the stored length difference 4036 bind(POP_LABEL); 4037 pop(cnt1); 4038 4039 // That's it 4040 bind(DONE_LABEL); 4041 if(ae == StrIntrinsicNode::UL) { 4042 negl(result); 4043 } 4044 4045 } 4046 4047 // Search for Non-ASCII character (Negative byte value) in a byte array, 4048 // return the index of the first such character, otherwise the length 4049 // of the array segment searched. 4050 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4051 // @IntrinsicCandidate 4052 // public static int countPositives(byte[] ba, int off, int len) { 4053 // for (int i = off; i < off + len; i++) { 4054 // if (ba[i] < 0) { 4055 // return i - off; 4056 // } 4057 // } 4058 // return len; 4059 // } 4060 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4061 Register result, Register tmp1, 4062 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4063 // rsi: byte array 4064 // rcx: len 4065 // rax: result 4066 ShortBranchVerifier sbv(this); 4067 assert_different_registers(ary1, len, result, tmp1); 4068 assert_different_registers(vec1, vec2); 4069 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4070 4071 movl(result, len); // copy 4072 // len == 0 4073 testl(len, len); 4074 jcc(Assembler::zero, DONE); 4075 4076 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4077 VM_Version::supports_avx512vlbw() && 4078 VM_Version::supports_bmi2()) { 4079 4080 Label test_64_loop, test_tail, BREAK_LOOP; 4081 movl(tmp1, len); 4082 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4083 4084 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4085 andl(len, 0xffffffc0); // vector count (in chars) 4086 jccb(Assembler::zero, test_tail); 4087 4088 lea(ary1, Address(ary1, len, Address::times_1)); 4089 negptr(len); 4090 4091 bind(test_64_loop); 4092 // Check whether our 64 elements of size byte contain negatives 4093 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4094 kortestql(mask1, mask1); 4095 jcc(Assembler::notZero, BREAK_LOOP); 4096 4097 addptr(len, 64); 4098 jccb(Assembler::notZero, test_64_loop); 4099 4100 bind(test_tail); 4101 // bail out when there is nothing to be done 4102 testl(tmp1, -1); 4103 jcc(Assembler::zero, DONE); 4104 4105 4106 // check the tail for absense of negatives 4107 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4108 { 4109 Register tmp3_aliased = len; 4110 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4111 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4112 notq(tmp3_aliased); 4113 kmovql(mask2, tmp3_aliased); 4114 } 4115 4116 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4117 ktestq(mask1, mask2); 4118 jcc(Assembler::zero, DONE); 4119 4120 // do a full check for negative registers in the tail 4121 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4122 // ary1 already pointing to the right place 4123 jmpb(TAIL_START); 4124 4125 bind(BREAK_LOOP); 4126 // At least one byte in the last 64 byte block was negative. 4127 // Set up to look at the last 64 bytes as if they were a tail 4128 lea(ary1, Address(ary1, len, Address::times_1)); 4129 addptr(result, len); 4130 // Ignore the very last byte: if all others are positive, 4131 // it must be negative, so we can skip right to the 2+1 byte 4132 // end comparison at this point 4133 orl(result, 63); 4134 movl(len, 63); 4135 // Fallthru to tail compare 4136 } else { 4137 4138 if (UseAVX >= 2) { 4139 // With AVX2, use 32-byte vector compare 4140 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4141 4142 // Compare 32-byte vectors 4143 testl(len, 0xffffffe0); // vector count (in bytes) 4144 jccb(Assembler::zero, TAIL_START); 4145 4146 andl(len, 0xffffffe0); 4147 lea(ary1, Address(ary1, len, Address::times_1)); 4148 negptr(len); 4149 4150 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4151 movdl(vec2, tmp1); 4152 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4153 4154 bind(COMPARE_WIDE_VECTORS); 4155 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4156 vptest(vec1, vec2); 4157 jccb(Assembler::notZero, BREAK_LOOP); 4158 addptr(len, 32); 4159 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4160 4161 testl(result, 0x0000001f); // any bytes remaining? 4162 jcc(Assembler::zero, DONE); 4163 4164 // Quick test using the already prepared vector mask 4165 movl(len, result); 4166 andl(len, 0x0000001f); 4167 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4168 vptest(vec1, vec2); 4169 jcc(Assembler::zero, DONE); 4170 // There are zeros, jump to the tail to determine exactly where 4171 jmpb(TAIL_START); 4172 4173 bind(BREAK_LOOP); 4174 // At least one byte in the last 32-byte vector is negative. 4175 // Set up to look at the last 32 bytes as if they were a tail 4176 lea(ary1, Address(ary1, len, Address::times_1)); 4177 addptr(result, len); 4178 // Ignore the very last byte: if all others are positive, 4179 // it must be negative, so we can skip right to the 2+1 byte 4180 // end comparison at this point 4181 orl(result, 31); 4182 movl(len, 31); 4183 // Fallthru to tail compare 4184 } else if (UseSSE42Intrinsics) { 4185 // With SSE4.2, use double quad vector compare 4186 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4187 4188 // Compare 16-byte vectors 4189 testl(len, 0xfffffff0); // vector count (in bytes) 4190 jcc(Assembler::zero, TAIL_START); 4191 4192 andl(len, 0xfffffff0); 4193 lea(ary1, Address(ary1, len, Address::times_1)); 4194 negptr(len); 4195 4196 movl(tmp1, 0x80808080); 4197 movdl(vec2, tmp1); 4198 pshufd(vec2, vec2, 0); 4199 4200 bind(COMPARE_WIDE_VECTORS); 4201 movdqu(vec1, Address(ary1, len, Address::times_1)); 4202 ptest(vec1, vec2); 4203 jccb(Assembler::notZero, BREAK_LOOP); 4204 addptr(len, 16); 4205 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4206 4207 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4208 jcc(Assembler::zero, DONE); 4209 4210 // Quick test using the already prepared vector mask 4211 movl(len, result); 4212 andl(len, 0x0000000f); // tail count (in bytes) 4213 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4214 ptest(vec1, vec2); 4215 jcc(Assembler::zero, DONE); 4216 jmpb(TAIL_START); 4217 4218 bind(BREAK_LOOP); 4219 // At least one byte in the last 16-byte vector is negative. 4220 // Set up and look at the last 16 bytes as if they were a tail 4221 lea(ary1, Address(ary1, len, Address::times_1)); 4222 addptr(result, len); 4223 // Ignore the very last byte: if all others are positive, 4224 // it must be negative, so we can skip right to the 2+1 byte 4225 // end comparison at this point 4226 orl(result, 15); 4227 movl(len, 15); 4228 // Fallthru to tail compare 4229 } 4230 } 4231 4232 bind(TAIL_START); 4233 // Compare 4-byte vectors 4234 andl(len, 0xfffffffc); // vector count (in bytes) 4235 jccb(Assembler::zero, COMPARE_CHAR); 4236 4237 lea(ary1, Address(ary1, len, Address::times_1)); 4238 negptr(len); 4239 4240 bind(COMPARE_VECTORS); 4241 movl(tmp1, Address(ary1, len, Address::times_1)); 4242 andl(tmp1, 0x80808080); 4243 jccb(Assembler::notZero, TAIL_ADJUST); 4244 addptr(len, 4); 4245 jccb(Assembler::notZero, COMPARE_VECTORS); 4246 4247 // Compare trailing char (final 2-3 bytes), if any 4248 bind(COMPARE_CHAR); 4249 4250 testl(result, 0x2); // tail char 4251 jccb(Assembler::zero, COMPARE_BYTE); 4252 load_unsigned_short(tmp1, Address(ary1, 0)); 4253 andl(tmp1, 0x00008080); 4254 jccb(Assembler::notZero, CHAR_ADJUST); 4255 lea(ary1, Address(ary1, 2)); 4256 4257 bind(COMPARE_BYTE); 4258 testl(result, 0x1); // tail byte 4259 jccb(Assembler::zero, DONE); 4260 load_unsigned_byte(tmp1, Address(ary1, 0)); 4261 testl(tmp1, 0x00000080); 4262 jccb(Assembler::zero, DONE); 4263 subptr(result, 1); 4264 jmpb(DONE); 4265 4266 bind(TAIL_ADJUST); 4267 // there are negative bits in the last 4 byte block. 4268 // Adjust result and check the next three bytes 4269 addptr(result, len); 4270 orl(result, 3); 4271 lea(ary1, Address(ary1, len, Address::times_1)); 4272 jmpb(COMPARE_CHAR); 4273 4274 bind(CHAR_ADJUST); 4275 // We are looking at a char + optional byte tail, and found that one 4276 // of the bytes in the char is negative. Adjust the result, check the 4277 // first byte and readjust if needed. 4278 andl(result, 0xfffffffc); 4279 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4280 jccb(Assembler::notZero, DONE); 4281 addptr(result, 1); 4282 4283 // That's it 4284 bind(DONE); 4285 if (UseAVX >= 2) { 4286 // clean upper bits of YMM registers 4287 vpxor(vec1, vec1); 4288 vpxor(vec2, vec2); 4289 } 4290 } 4291 4292 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4293 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4294 Register limit, Register result, Register chr, 4295 XMMRegister vec1, XMMRegister vec2, bool is_char, 4296 KRegister mask, bool expand_ary2) { 4297 // for expand_ary2, limit is the (smaller) size of the second array. 4298 ShortBranchVerifier sbv(this); 4299 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4300 4301 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4302 "Expansion only implemented for AVX2"); 4303 4304 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4305 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4306 4307 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4308 int scaleIncr = expand_ary2 ? 8 : 16; 4309 4310 if (is_array_equ) { 4311 // Check the input args 4312 cmpoop(ary1, ary2); 4313 jcc(Assembler::equal, TRUE_LABEL); 4314 4315 // Need additional checks for arrays_equals. 4316 testptr(ary1, ary1); 4317 jcc(Assembler::zero, FALSE_LABEL); 4318 testptr(ary2, ary2); 4319 jcc(Assembler::zero, FALSE_LABEL); 4320 4321 // Check the lengths 4322 movl(limit, Address(ary1, length_offset)); 4323 cmpl(limit, Address(ary2, length_offset)); 4324 jcc(Assembler::notEqual, FALSE_LABEL); 4325 } 4326 4327 // count == 0 4328 testl(limit, limit); 4329 jcc(Assembler::zero, TRUE_LABEL); 4330 4331 if (is_array_equ) { 4332 // Load array address 4333 lea(ary1, Address(ary1, base_offset)); 4334 lea(ary2, Address(ary2, base_offset)); 4335 } 4336 4337 if (is_array_equ && is_char) { 4338 // arrays_equals when used for char[]. 4339 shll(limit, 1); // byte count != 0 4340 } 4341 movl(result, limit); // copy 4342 4343 if (UseAVX >= 2) { 4344 // With AVX2, use 32-byte vector compare 4345 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4346 4347 // Compare 32-byte vectors 4348 if (expand_ary2) { 4349 andl(result, 0x0000000f); // tail count (in bytes) 4350 andl(limit, 0xfffffff0); // vector count (in bytes) 4351 jcc(Assembler::zero, COMPARE_TAIL); 4352 } else { 4353 andl(result, 0x0000001f); // tail count (in bytes) 4354 andl(limit, 0xffffffe0); // vector count (in bytes) 4355 jcc(Assembler::zero, COMPARE_TAIL_16); 4356 } 4357 4358 lea(ary1, Address(ary1, limit, scaleFactor)); 4359 lea(ary2, Address(ary2, limit, Address::times_1)); 4360 negptr(limit); 4361 4362 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4363 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4364 4365 cmpl(limit, -64); 4366 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4367 4368 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4369 4370 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4371 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4372 kortestql(mask, mask); 4373 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4374 addptr(limit, 64); // update since we already compared at this addr 4375 cmpl(limit, -64); 4376 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4377 4378 // At this point we may still need to compare -limit+result bytes. 4379 // We could execute the next two instruction and just continue via non-wide path: 4380 // cmpl(limit, 0); 4381 // jcc(Assembler::equal, COMPARE_TAIL); // true 4382 // But since we stopped at the points ary{1,2}+limit which are 4383 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4384 // (|limit| <= 32 and result < 32), 4385 // we may just compare the last 64 bytes. 
4386 // 4387 addptr(result, -64); // it is safe, bc we just came from this area 4388 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4389 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4390 kortestql(mask, mask); 4391 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4392 4393 jmp(TRUE_LABEL); 4394 4395 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4396 4397 }//if (VM_Version::supports_avx512vlbw()) 4398 4399 bind(COMPARE_WIDE_VECTORS); 4400 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4401 if (expand_ary2) { 4402 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4403 } else { 4404 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4405 } 4406 vpxor(vec1, vec2); 4407 4408 vptest(vec1, vec1); 4409 jcc(Assembler::notZero, FALSE_LABEL); 4410 addptr(limit, scaleIncr * 2); 4411 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4412 4413 testl(result, result); 4414 jcc(Assembler::zero, TRUE_LABEL); 4415 4416 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4417 if (expand_ary2) { 4418 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4419 } else { 4420 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4421 } 4422 vpxor(vec1, vec2); 4423 4424 vptest(vec1, vec1); 4425 jcc(Assembler::notZero, FALSE_LABEL); 4426 jmp(TRUE_LABEL); 4427 4428 bind(COMPARE_TAIL_16); // limit is zero 4429 movl(limit, result); 4430 4431 // Compare 16-byte chunks 4432 andl(result, 0x0000000f); // tail count (in bytes) 4433 andl(limit, 0xfffffff0); // vector count (in bytes) 4434 jcc(Assembler::zero, COMPARE_TAIL); 4435 4436 lea(ary1, Address(ary1, limit, scaleFactor)); 4437 lea(ary2, Address(ary2, limit, Address::times_1)); 4438 negptr(limit); 4439 4440 bind(COMPARE_WIDE_VECTORS_16); 4441 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4442 if (expand_ary2) { 4443 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4444 } else { 4445 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4446 } 4447 pxor(vec1, vec2); 4448 4449 ptest(vec1, vec1); 4450 jcc(Assembler::notZero, FALSE_LABEL); 4451 addptr(limit, scaleIncr); 4452 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4453 4454 bind(COMPARE_TAIL); // limit is zero 4455 movl(limit, result); 4456 // Fallthru to tail compare 4457 } else if (UseSSE42Intrinsics) { 4458 // With SSE4.2, use double quad vector compare 4459 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4460 4461 // Compare 16-byte vectors 4462 andl(result, 0x0000000f); // tail count (in bytes) 4463 andl(limit, 0xfffffff0); // vector count (in bytes) 4464 jcc(Assembler::zero, COMPARE_TAIL); 4465 4466 lea(ary1, Address(ary1, limit, Address::times_1)); 4467 lea(ary2, Address(ary2, limit, Address::times_1)); 4468 negptr(limit); 4469 4470 bind(COMPARE_WIDE_VECTORS); 4471 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4472 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4473 pxor(vec1, vec2); 4474 4475 ptest(vec1, vec1); 4476 jcc(Assembler::notZero, FALSE_LABEL); 4477 addptr(limit, 16); 4478 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4479 4480 testl(result, result); 4481 jcc(Assembler::zero, TRUE_LABEL); 4482 4483 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4484 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4485 pxor(vec1, vec2); 4486 4487 ptest(vec1, vec1); 4488 jccb(Assembler::notZero, FALSE_LABEL); 4489 jmpb(TRUE_LABEL); 4490 4491 bind(COMPARE_TAIL); // limit is zero 4492 movl(limit, 
result); 4493 // Fallthru to tail compare 4494 } 4495 4496 // Compare 4-byte vectors 4497 if (expand_ary2) { 4498 testl(result, result); 4499 jccb(Assembler::zero, TRUE_LABEL); 4500 } else { 4501 andl(limit, 0xfffffffc); // vector count (in bytes) 4502 jccb(Assembler::zero, COMPARE_CHAR); 4503 } 4504 4505 lea(ary1, Address(ary1, limit, scaleFactor)); 4506 lea(ary2, Address(ary2, limit, Address::times_1)); 4507 negptr(limit); 4508 4509 bind(COMPARE_VECTORS); 4510 if (expand_ary2) { 4511 // There are no "vector" operations for bytes to shorts 4512 movzbl(chr, Address(ary2, limit, Address::times_1)); 4513 cmpw(Address(ary1, limit, Address::times_2), chr); 4514 jccb(Assembler::notEqual, FALSE_LABEL); 4515 addptr(limit, 1); 4516 jcc(Assembler::notZero, COMPARE_VECTORS); 4517 jmp(TRUE_LABEL); 4518 } else { 4519 movl(chr, Address(ary1, limit, Address::times_1)); 4520 cmpl(chr, Address(ary2, limit, Address::times_1)); 4521 jccb(Assembler::notEqual, FALSE_LABEL); 4522 addptr(limit, 4); 4523 jcc(Assembler::notZero, COMPARE_VECTORS); 4524 } 4525 4526 // Compare trailing char (final 2 bytes), if any 4527 bind(COMPARE_CHAR); 4528 testl(result, 0x2); // tail char 4529 jccb(Assembler::zero, COMPARE_BYTE); 4530 load_unsigned_short(chr, Address(ary1, 0)); 4531 load_unsigned_short(limit, Address(ary2, 0)); 4532 cmpl(chr, limit); 4533 jccb(Assembler::notEqual, FALSE_LABEL); 4534 4535 if (is_array_equ && is_char) { 4536 bind(COMPARE_BYTE); 4537 } else { 4538 lea(ary1, Address(ary1, 2)); 4539 lea(ary2, Address(ary2, 2)); 4540 4541 bind(COMPARE_BYTE); 4542 testl(result, 0x1); // tail byte 4543 jccb(Assembler::zero, TRUE_LABEL); 4544 load_unsigned_byte(chr, Address(ary1, 0)); 4545 load_unsigned_byte(limit, Address(ary2, 0)); 4546 cmpl(chr, limit); 4547 jccb(Assembler::notEqual, FALSE_LABEL); 4548 } 4549 bind(TRUE_LABEL); 4550 movl(result, 1); // return true 4551 jmpb(DONE); 4552 4553 bind(FALSE_LABEL); 4554 xorl(result, result); // return false 4555 4556 // That's it 4557 bind(DONE); 4558 if (UseAVX >= 2) { 4559 // clean upper bits of YMM registers 4560 vpxor(vec1, vec1); 4561 vpxor(vec2, vec2); 4562 } 4563 } 4564 4565 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4566 #define __ masm. 
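// Out-of-line slow path for convertF2I below: the source value is spilled to
// the stack and the matching StubRoutines fixup (f2i/f2l/d2i/d2l) is called;
// the corrected result is then popped from that same stack slot into dst.
// (Description inferred from the code that follows; the fixup stubs' exact
// calling convention lives in the stub generator.)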
4567 Register dst = stub.data<0>(); 4568 XMMRegister src = stub.data<1>(); 4569 address target = stub.data<2>(); 4570 __ bind(stub.entry()); 4571 __ subptr(rsp, 8); 4572 __ movdbl(Address(rsp), src); 4573 __ call(RuntimeAddress(target)); 4574 __ pop(dst); 4575 __ jmp(stub.continuation()); 4576 #undef __ 4577 } 4578 4579 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4580 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4581 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4582 4583 address slowpath_target; 4584 if (dst_bt == T_INT) { 4585 if (src_bt == T_FLOAT) { 4586 cvttss2sil(dst, src); 4587 cmpl(dst, 0x80000000); 4588 slowpath_target = StubRoutines::x86::f2i_fixup(); 4589 } else { 4590 cvttsd2sil(dst, src); 4591 cmpl(dst, 0x80000000); 4592 slowpath_target = StubRoutines::x86::d2i_fixup(); 4593 } 4594 } else { 4595 if (src_bt == T_FLOAT) { 4596 cvttss2siq(dst, src); 4597 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4598 slowpath_target = StubRoutines::x86::f2l_fixup(); 4599 } else { 4600 cvttsd2siq(dst, src); 4601 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4602 slowpath_target = StubRoutines::x86::d2l_fixup(); 4603 } 4604 } 4605 4606 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4607 jcc(Assembler::equal, stub->entry()); 4608 bind(stub->continuation()); 4609 } 4610 4611 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4612 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4613 switch(ideal_opc) { 4614 case Op_LShiftVS: 4615 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4616 case Op_LShiftVI: 4617 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4618 case Op_LShiftVL: 4619 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4620 case Op_RShiftVS: 4621 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4622 case Op_RShiftVI: 4623 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4624 case Op_RShiftVL: 4625 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4626 case Op_URShiftVS: 4627 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4628 case Op_URShiftVI: 4629 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4630 case Op_URShiftVL: 4631 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4632 case Op_RotateRightV: 4633 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4634 case Op_RotateLeftV: 4635 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4636 default: 4637 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4638 break; 4639 } 4640 } 4641 4642 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4643 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4644 if (is_unsigned) { 4645 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4646 } else { 4647 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4648 } 4649 } 4650 4651 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4652 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4653 switch (elem_bt) { 4654 case T_BYTE: 4655 if (ideal_opc == Op_SaturatingAddV) { 
4656 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4657 } else { 4658 assert(ideal_opc == Op_SaturatingSubV, ""); 4659 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4660 } 4661 break; 4662 case T_SHORT: 4663 if (ideal_opc == Op_SaturatingAddV) { 4664 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4665 } else { 4666 assert(ideal_opc == Op_SaturatingSubV, ""); 4667 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4668 } 4669 break; 4670 default: 4671 fatal("Unsupported type %s", type2name(elem_bt)); 4672 break; 4673 } 4674 } 4675 4676 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4677 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4678 switch (elem_bt) { 4679 case T_BYTE: 4680 if (ideal_opc == Op_SaturatingAddV) { 4681 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4682 } else { 4683 assert(ideal_opc == Op_SaturatingSubV, ""); 4684 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4685 } 4686 break; 4687 case T_SHORT: 4688 if (ideal_opc == Op_SaturatingAddV) { 4689 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4690 } else { 4691 assert(ideal_opc == Op_SaturatingSubV, ""); 4692 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4693 } 4694 break; 4695 default: 4696 fatal("Unsupported type %s", type2name(elem_bt)); 4697 break; 4698 } 4699 } 4700 4701 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4702 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4703 if (is_unsigned) { 4704 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4705 } else { 4706 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4707 } 4708 } 4709 4710 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4711 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4712 switch (elem_bt) { 4713 case T_BYTE: 4714 if (ideal_opc == Op_SaturatingAddV) { 4715 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4716 } else { 4717 assert(ideal_opc == Op_SaturatingSubV, ""); 4718 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4719 } 4720 break; 4721 case T_SHORT: 4722 if (ideal_opc == Op_SaturatingAddV) { 4723 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4724 } else { 4725 assert(ideal_opc == Op_SaturatingSubV, ""); 4726 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4727 } 4728 break; 4729 default: 4730 fatal("Unsupported type %s", type2name(elem_bt)); 4731 break; 4732 } 4733 } 4734 4735 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4736 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4737 switch (elem_bt) { 4738 case T_BYTE: 4739 if (ideal_opc == Op_SaturatingAddV) { 4740 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4741 } else { 4742 assert(ideal_opc == Op_SaturatingSubV, ""); 4743 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4744 } 4745 break; 4746 case T_SHORT: 4747 if (ideal_opc == Op_SaturatingAddV) { 4748 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4749 } else { 4750 assert(ideal_opc == Op_SaturatingSubV, ""); 4751 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4752 } 4753 break; 4754 default: 4755 fatal("Unsupported type %s", type2name(elem_bt)); 4756 break; 4757 } 4758 } 4759 4760 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, 
KRegister mask, XMMRegister dst, 4761 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4762 bool is_varshift) { 4763 switch (ideal_opc) { 4764 case Op_AddVB: 4765 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4766 case Op_AddVS: 4767 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4768 case Op_AddVI: 4769 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_AddVL: 4771 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_AddVF: 4773 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_AddVD: 4775 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4776 case Op_SubVB: 4777 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4778 case Op_SubVS: 4779 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4780 case Op_SubVI: 4781 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4782 case Op_SubVL: 4783 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4784 case Op_SubVF: 4785 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4786 case Op_SubVD: 4787 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4788 case Op_MulVS: 4789 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4790 case Op_MulVI: 4791 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4792 case Op_MulVL: 4793 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4794 case Op_MulVF: 4795 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4796 case Op_MulVD: 4797 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4798 case Op_DivVF: 4799 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4800 case Op_DivVD: 4801 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4802 case Op_SqrtVF: 4803 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4804 case Op_SqrtVD: 4805 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4806 case Op_AbsVB: 4807 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4808 case Op_AbsVS: 4809 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4810 case Op_AbsVI: 4811 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4812 case Op_AbsVL: 4813 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4814 case Op_FmaVF: 4815 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4816 case Op_FmaVD: 4817 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4818 case Op_VectorRearrange: 4819 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4820 case Op_LShiftVS: 4821 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4822 case Op_LShiftVI: 4823 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4824 case Op_LShiftVL: 4825 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4826 case Op_RShiftVS: 4827 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4828 case Op_RShiftVI: 4829 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4830 case Op_RShiftVL: 4831 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4832 case Op_URShiftVS: 4833 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4834 case Op_URShiftVI: 4835 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4836 case Op_URShiftVL: 4837 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4838 case Op_RotateLeftV: 4839 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_RotateRightV: 4841 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_MaxV: 4843 evpmaxs(eType, dst, mask, src1, src2, 
merge, vlen_enc); break; 4844 case Op_MinV: 4845 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_UMinV: 4847 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_UMaxV: 4849 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_XorV: 4851 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_OrV: 4853 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_AndV: 4855 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4856 default: 4857 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4858 break; 4859 } 4860 } 4861 4862 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4863 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4864 switch (ideal_opc) { 4865 case Op_AddVB: 4866 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4867 case Op_AddVS: 4868 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4869 case Op_AddVI: 4870 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4871 case Op_AddVL: 4872 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4873 case Op_AddVF: 4874 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4875 case Op_AddVD: 4876 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4877 case Op_SubVB: 4878 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_SubVS: 4880 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4881 case Op_SubVI: 4882 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_SubVL: 4884 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4885 case Op_SubVF: 4886 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4887 case Op_SubVD: 4888 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4889 case Op_MulVS: 4890 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4891 case Op_MulVI: 4892 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4893 case Op_MulVL: 4894 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4895 case Op_MulVF: 4896 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4897 case Op_MulVD: 4898 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4899 case Op_DivVF: 4900 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4901 case Op_DivVD: 4902 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4903 case Op_FmaVF: 4904 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4905 case Op_FmaVD: 4906 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4907 case Op_MaxV: 4908 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4909 case Op_MinV: 4910 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4911 case Op_UMaxV: 4912 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4913 case Op_UMinV: 4914 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4915 case Op_XorV: 4916 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4917 case Op_OrV: 4918 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4919 case Op_AndV: 4920 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4921 default: 4922 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4923 break; 4924 } 4925 } 4926 4927 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4928 KRegister src1, KRegister src2) { 4929 BasicType etype = T_ILLEGAL; 4930 switch(mask_len) { 4931 case 2: 4932 case 4: 4933 case 8: etype = T_BYTE; break; 4934 case 16: etype = T_SHORT; 
break; 4935 case 32: etype = T_INT; break; 4936 case 64: etype = T_LONG; break; 4937 default: fatal("Unsupported type"); break; 4938 } 4939 assert(etype != T_ILLEGAL, ""); 4940 switch(ideal_opc) { 4941 case Op_AndVMask: 4942 kand(etype, dst, src1, src2); break; 4943 case Op_OrVMask: 4944 kor(etype, dst, src1, src2); break; 4945 case Op_XorVMask: 4946 kxor(etype, dst, src1, src2); break; 4947 default: 4948 fatal("Unsupported masked operation"); break; 4949 } 4950 } 4951 4952 /* 4953 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4954 * If src is NaN, the result is 0. 4955 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4956 * the result is equal to the value of Integer.MIN_VALUE. 4957 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4958 * the result is equal to the value of Integer.MAX_VALUE. 4959 */ 4960 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4961 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4962 Register rscratch, AddressLiteral float_sign_flip, 4963 int vec_enc) { 4964 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4965 Label done; 4966 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4967 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4968 vptest(xtmp2, xtmp2, vec_enc); 4969 jccb(Assembler::equal, done); 4970 4971 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4972 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4973 4974 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4975 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4976 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4977 4978 // Recompute the mask for remaining special value. 4979 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4980 // Extract SRC values corresponding to TRUE mask lanes. 4981 vpand(xtmp4, xtmp2, src, vec_enc); 4982 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4983 // values are set. 
4984 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4985 4986 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4987 bind(done); 4988 } 4989 4990 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4991 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4992 Register rscratch, AddressLiteral float_sign_flip, 4993 int vec_enc) { 4994 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4995 Label done; 4996 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4997 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4998 kortestwl(ktmp1, ktmp1); 4999 jccb(Assembler::equal, done); 5000 5001 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5002 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5003 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5004 5005 kxorwl(ktmp1, ktmp1, ktmp2); 5006 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5007 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5008 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5009 bind(done); 5010 } 5011 5012 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5013 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5014 Register rscratch, AddressLiteral double_sign_flip, 5015 int vec_enc) { 5016 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5017 5018 Label done; 5019 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5020 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5021 kortestwl(ktmp1, ktmp1); 5022 jccb(Assembler::equal, done); 5023 5024 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5025 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5026 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5027 5028 kxorwl(ktmp1, ktmp1, ktmp2); 5029 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5030 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5031 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5032 bind(done); 5033 } 5034 5035 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5036 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5037 Register rscratch, AddressLiteral float_sign_flip, 5038 int vec_enc) { 5039 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5040 Label done; 5041 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5042 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5043 kortestwl(ktmp1, ktmp1); 5044 jccb(Assembler::equal, done); 5045 5046 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5047 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5048 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5049 5050 kxorwl(ktmp1, ktmp1, ktmp2); 5051 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5052 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5053 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5054 bind(done); 5055 } 5056 5057 /* 5058 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5059 * If src is NaN, the result is 0. 5060 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5061 * the result is equal to the value of Long.MIN_VALUE. 5062 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5063 * the result is equal to the value of Long.MAX_VALUE. 
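 *
 * For example, matching Java's primitive double-to-long conversion semantics:
 *   (long) Double.NaN               == 0L
 *   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
 *   (long) Double.POSITIVE_INFINITY == Long.MAX_VALUE
 *   (long) 1.0e19                   == Long.MAX_VALUE   (above the long range)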
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with the float_sign_flip
  // value to get a mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get the max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract the mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5135 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5136 bind(done); 5137 } 5138 5139 5140 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5141 XMMRegister xtmp, Register rscratch, int vec_enc) { 5142 switch(to_elem_bt) { 5143 case T_SHORT: 5144 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5145 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5146 vpackusdw(dst, dst, zero, vec_enc); 5147 if (vec_enc == Assembler::AVX_256bit) { 5148 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5149 } 5150 break; 5151 case T_BYTE: 5152 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5153 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5154 vpackusdw(dst, dst, zero, vec_enc); 5155 if (vec_enc == Assembler::AVX_256bit) { 5156 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5157 } 5158 vpackuswb(dst, dst, zero, vec_enc); 5159 break; 5160 default: assert(false, "%s", type2name(to_elem_bt)); 5161 } 5162 } 5163 5164 /* 5165 * Algorithm for vector D2L and F2I conversions:- 5166 * a) Perform vector D2L/F2I cast. 5167 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5168 * It signifies that source value could be any of the special floating point 5169 * values(NaN,-Inf,Inf,Max,-Min). 5170 * c) Set destination to zero if source is NaN value. 5171 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5172 */ 5173 5174 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5175 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5176 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5177 int to_elem_sz = type2aelembytes(to_elem_bt); 5178 assert(to_elem_sz <= 4, ""); 5179 vcvttps2dq(dst, src, vec_enc); 5180 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5181 if (to_elem_sz < 4) { 5182 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5183 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5184 } 5185 } 5186 5187 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5188 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5189 Register rscratch, int vec_enc) { 5190 int to_elem_sz = type2aelembytes(to_elem_bt); 5191 assert(to_elem_sz <= 4, ""); 5192 vcvttps2dq(dst, src, vec_enc); 5193 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5194 switch(to_elem_bt) { 5195 case T_INT: 5196 break; 5197 case T_SHORT: 5198 evpmovdw(dst, dst, vec_enc); 5199 break; 5200 case T_BYTE: 5201 evpmovdb(dst, dst, vec_enc); 5202 break; 5203 default: assert(false, "%s", type2name(to_elem_bt)); 5204 } 5205 } 5206 5207 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5208 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5209 Register rscratch, int vec_enc) { 5210 evcvttps2qq(dst, src, vec_enc); 5211 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5212 } 5213 5214 // Handling for downcasting from double to integer or sub-word types on AVX2. 5215 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5216 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5217 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5218 int to_elem_sz = type2aelembytes(to_elem_bt); 5219 assert(to_elem_sz < 8, ""); 5220 vcvttpd2dq(dst, src, vec_enc); 5221 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5222 float_sign_flip, vec_enc); 5223 if (to_elem_sz < 4) { 5224 // xtmp4 holds all zero lanes. 5225 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5226 } 5227 } 5228 5229 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5230 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5231 KRegister ktmp2, AddressLiteral sign_flip, 5232 Register rscratch, int vec_enc) { 5233 if (VM_Version::supports_avx512dq()) { 5234 evcvttpd2qq(dst, src, vec_enc); 5235 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5236 switch(to_elem_bt) { 5237 case T_LONG: 5238 break; 5239 case T_INT: 5240 evpmovsqd(dst, dst, vec_enc); 5241 break; 5242 case T_SHORT: 5243 evpmovsqd(dst, dst, vec_enc); 5244 evpmovdw(dst, dst, vec_enc); 5245 break; 5246 case T_BYTE: 5247 evpmovsqd(dst, dst, vec_enc); 5248 evpmovdb(dst, dst, vec_enc); 5249 break; 5250 default: assert(false, "%s", type2name(to_elem_bt)); 5251 } 5252 } else { 5253 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5254 vcvttpd2dq(dst, src, vec_enc); 5255 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5256 switch(to_elem_bt) { 5257 case T_INT: 5258 break; 5259 case T_SHORT: 5260 evpmovdw(dst, dst, vec_enc); 5261 break; 5262 case T_BYTE: 5263 evpmovdb(dst, dst, vec_enc); 5264 break; 5265 default: assert(false, "%s", type2name(to_elem_bt)); 5266 } 5267 } 5268 } 5269 5270 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5271 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5272 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5273 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5274 // and re-instantiate original MXCSR.RC mode after that. 5275 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5276 5277 mov64(tmp, julong_cast(0.5L)); 5278 evpbroadcastq(xtmp1, tmp, vec_enc); 5279 vaddpd(xtmp1, src , xtmp1, vec_enc); 5280 evcvtpd2qq(dst, xtmp1, vec_enc); 5281 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5282 double_sign_flip, vec_enc);; 5283 5284 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5285 } 5286 5287 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5288 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5289 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5290 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5291 // and re-instantiate original MXCSR.RC mode after that. 
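  // Scalar sketch of the rounding step (illustrative only, not emitted code):
  //   // With MXCSR.RC forced to round-toward-negative-infinity, converting
  //   // (x + 0.5f) to int computes floor(x + 0.5f), i.e. round-half-up,
  //   // which is what Math.round(float) specifies for in-range inputs.
  //   int round_half_up(float x) { return (int) floorf(x + 0.5f); }
  // NaN and out-of-range lanes are fixed up afterwards by
  // vector_cast_float_to_int_special_cases_evex().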
5292 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5293 5294 movl(tmp, jint_cast(0.5)); 5295 movq(xtmp1, tmp); 5296 vbroadcastss(xtmp1, xtmp1, vec_enc); 5297 vaddps(xtmp1, src , xtmp1, vec_enc); 5298 vcvtps2dq(dst, xtmp1, vec_enc); 5299 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5300 float_sign_flip, vec_enc); 5301 5302 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5303 } 5304 5305 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5306 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5307 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5308 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5309 // and re-instantiate original MXCSR.RC mode after that. 5310 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5311 5312 movl(tmp, jint_cast(0.5)); 5313 movq(xtmp1, tmp); 5314 vbroadcastss(xtmp1, xtmp1, vec_enc); 5315 vaddps(xtmp1, src , xtmp1, vec_enc); 5316 vcvtps2dq(dst, xtmp1, vec_enc); 5317 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5318 5319 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5320 } 5321 5322 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5323 BasicType from_elem_bt, BasicType to_elem_bt) { 5324 switch (from_elem_bt) { 5325 case T_BYTE: 5326 switch (to_elem_bt) { 5327 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5328 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5329 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5330 default: ShouldNotReachHere(); 5331 } 5332 break; 5333 case T_SHORT: 5334 switch (to_elem_bt) { 5335 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5336 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5337 default: ShouldNotReachHere(); 5338 } 5339 break; 5340 case T_INT: 5341 assert(to_elem_bt == T_LONG, ""); 5342 vpmovzxdq(dst, src, vlen_enc); 5343 break; 5344 default: 5345 ShouldNotReachHere(); 5346 } 5347 } 5348 5349 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5350 BasicType from_elem_bt, BasicType to_elem_bt) { 5351 switch (from_elem_bt) { 5352 case T_BYTE: 5353 switch (to_elem_bt) { 5354 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5355 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5356 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5357 default: ShouldNotReachHere(); 5358 } 5359 break; 5360 case T_SHORT: 5361 switch (to_elem_bt) { 5362 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5363 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5364 default: ShouldNotReachHere(); 5365 } 5366 break; 5367 case T_INT: 5368 assert(to_elem_bt == T_LONG, ""); 5369 vpmovsxdq(dst, src, vlen_enc); 5370 break; 5371 default: 5372 ShouldNotReachHere(); 5373 } 5374 } 5375 5376 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5377 BasicType dst_bt, BasicType src_bt, int vlen) { 5378 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5379 assert(vlen_enc != AVX_512bit, ""); 5380 5381 int dst_bt_size = type2aelembytes(dst_bt); 5382 int src_bt_size = type2aelembytes(src_bt); 5383 if (dst_bt_size > src_bt_size) { 5384 switch (dst_bt_size / src_bt_size) { 5385 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5386 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5387 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5388 default: ShouldNotReachHere(); 5389 } 5390 } else { 5391 assert(dst_bt_size < src_bt_size, ""); 5392 switch (src_bt_size / dst_bt_size) { 5393 case 2: { 5394 if (vlen_enc == AVX_128bit) { 5395 vpacksswb(dst, src, src, vlen_enc); 5396 } else { 5397 vpacksswb(dst, src, src, vlen_enc); 5398 vpermq(dst, dst, 0x08, vlen_enc); 5399 } 5400 break; 5401 } 5402 case 4: { 5403 if (vlen_enc == AVX_128bit) { 5404 vpackssdw(dst, src, src, vlen_enc); 5405 vpacksswb(dst, dst, dst, vlen_enc); 5406 } else { 5407 vpackssdw(dst, src, src, vlen_enc); 5408 vpermq(dst, dst, 0x08, vlen_enc); 5409 vpacksswb(dst, dst, dst, AVX_128bit); 5410 } 5411 break; 5412 } 5413 case 8: { 5414 if (vlen_enc == AVX_128bit) { 5415 vpshufd(dst, src, 0x08, vlen_enc); 5416 vpackssdw(dst, dst, dst, vlen_enc); 5417 vpacksswb(dst, dst, dst, vlen_enc); 5418 } else { 5419 vpshufd(dst, src, 0x08, vlen_enc); 5420 vpermq(dst, dst, 0x08, vlen_enc); 5421 vpackssdw(dst, dst, dst, AVX_128bit); 5422 vpacksswb(dst, dst, dst, AVX_128bit); 5423 } 5424 break; 5425 } 5426 default: ShouldNotReachHere(); 5427 } 5428 } 5429 } 5430 5431 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5432 bool merge, BasicType bt, int vlen_enc) { 5433 if (bt == T_INT) { 5434 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5435 } else { 5436 assert(bt == T_LONG, ""); 5437 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5438 } 5439 } 5440 5441 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5442 bool merge, BasicType bt, int vlen_enc) { 5443 if (bt == T_INT) { 5444 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5445 } else { 5446 assert(bt == T_LONG, ""); 5447 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5448 } 5449 } 5450 5451 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5452 Register rtmp2, XMMRegister xtmp, int mask_len, 5453 int vec_enc) { 5454 int index = 0; 5455 int vindex = 0; 5456 mov64(rtmp1, 0x0101010101010101L); 5457 pdepq(rtmp1, src, rtmp1); 5458 if (mask_len > 8) { 5459 movq(rtmp2, src); 5460 vpxor(xtmp, xtmp, xtmp, vec_enc); 5461 movq(xtmp, rtmp1); 5462 } 5463 movq(dst, rtmp1); 5464 5465 mask_len -= 8; 5466 while (mask_len > 0) { 5467 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5468 index++; 5469 if ((index % 2) == 0) { 5470 pxor(xtmp, xtmp); 5471 } 5472 mov64(rtmp1, 0x0101010101010101L); 5473 shrq(rtmp2, 8); 5474 pdepq(rtmp1, rtmp2, rtmp1); 5475 pinsrq(xtmp, rtmp1, index % 2); 5476 vindex = index / 2; 5477 if (vindex) { 5478 // Write entire 16 byte vector when both 64 bit 5479 // lanes are update to save redundant instructions. 
5480 if (index % 2) { 5481 vinsertf128(dst, dst, xtmp, vindex); 5482 } 5483 } else { 5484 vmovdqu(dst, xtmp); 5485 } 5486 mask_len -= 8; 5487 } 5488 } 5489 5490 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5491 switch(opc) { 5492 case Op_VectorMaskTrueCount: 5493 popcntq(dst, tmp); 5494 break; 5495 case Op_VectorMaskLastTrue: 5496 if (VM_Version::supports_lzcnt()) { 5497 lzcntq(tmp, tmp); 5498 movl(dst, 63); 5499 subl(dst, tmp); 5500 } else { 5501 movl(dst, -1); 5502 bsrq(tmp, tmp); 5503 cmov32(Assembler::notZero, dst, tmp); 5504 } 5505 break; 5506 case Op_VectorMaskFirstTrue: 5507 if (VM_Version::supports_bmi1()) { 5508 if (masklen < 32) { 5509 orl(tmp, 1 << masklen); 5510 tzcntl(dst, tmp); 5511 } else if (masklen == 32) { 5512 tzcntl(dst, tmp); 5513 } else { 5514 assert(masklen == 64, ""); 5515 tzcntq(dst, tmp); 5516 } 5517 } else { 5518 if (masklen < 32) { 5519 orl(tmp, 1 << masklen); 5520 bsfl(dst, tmp); 5521 } else { 5522 assert(masklen == 32 || masklen == 64, ""); 5523 movl(dst, masklen); 5524 if (masklen == 32) { 5525 bsfl(tmp, tmp); 5526 } else { 5527 bsfq(tmp, tmp); 5528 } 5529 cmov32(Assembler::notZero, dst, tmp); 5530 } 5531 } 5532 break; 5533 case Op_VectorMaskToLong: 5534 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5535 break; 5536 default: assert(false, "Unhandled mask operation"); 5537 } 5538 } 5539 5540 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5541 int masklen, int masksize, int vec_enc) { 5542 assert(VM_Version::supports_popcnt(), ""); 5543 5544 if(VM_Version::supports_avx512bw()) { 5545 kmovql(tmp, mask); 5546 } else { 5547 assert(masklen <= 16, ""); 5548 kmovwl(tmp, mask); 5549 } 5550 5551 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5552 // operations needs to be clipped. 5553 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5554 andq(tmp, (1 << masklen) - 1); 5555 } 5556 5557 vector_mask_operation_helper(opc, dst, tmp, masklen); 5558 } 5559 5560 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5561 Register tmp, int masklen, BasicType bt, int vec_enc) { 5562 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5563 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5564 assert(VM_Version::supports_popcnt(), ""); 5565 5566 bool need_clip = false; 5567 switch(bt) { 5568 case T_BOOLEAN: 5569 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5570 vpxor(xtmp, xtmp, xtmp, vec_enc); 5571 vpsubb(xtmp, xtmp, mask, vec_enc); 5572 vpmovmskb(tmp, xtmp, vec_enc); 5573 need_clip = masklen < 16; 5574 break; 5575 case T_BYTE: 5576 vpmovmskb(tmp, mask, vec_enc); 5577 need_clip = masklen < 16; 5578 break; 5579 case T_SHORT: 5580 vpacksswb(xtmp, mask, mask, vec_enc); 5581 if (masklen >= 16) { 5582 vpermpd(xtmp, xtmp, 8, vec_enc); 5583 } 5584 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5585 need_clip = masklen < 16; 5586 break; 5587 case T_INT: 5588 case T_FLOAT: 5589 vmovmskps(tmp, mask, vec_enc); 5590 need_clip = masklen < 4; 5591 break; 5592 case T_LONG: 5593 case T_DOUBLE: 5594 vmovmskpd(tmp, mask, vec_enc); 5595 need_clip = masklen < 2; 5596 break; 5597 default: assert(false, "Unhandled type, %s", type2name(bt)); 5598 } 5599 5600 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5601 // operations needs to be clipped. 
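  // For example, with masklen == 3 the clip constant is (1 << 3) - 1 == 0b111,
  // so any stale bits above lane 2 in tmp cannot affect the count below.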
5602 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5603 // need_clip implies masklen < 32 5604 andq(tmp, (1 << masklen) - 1); 5605 } 5606 5607 vector_mask_operation_helper(opc, dst, tmp, masklen); 5608 } 5609 5610 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5611 Register rtmp2, int mask_len) { 5612 kmov(rtmp1, src); 5613 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5614 mov64(rtmp2, -1L); 5615 pextq(rtmp2, rtmp2, rtmp1); 5616 kmov(dst, rtmp2); 5617 } 5618 5619 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5620 XMMRegister mask, Register rtmp, Register rscratch, 5621 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5622 int vec_enc) { 5623 assert(type2aelembytes(bt) >= 4, ""); 5624 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5625 address compress_perm_table = nullptr; 5626 address expand_perm_table = nullptr; 5627 if (type2aelembytes(bt) == 8) { 5628 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5629 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5630 vmovmskpd(rtmp, mask, vec_enc); 5631 } else { 5632 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5633 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5634 vmovmskps(rtmp, mask, vec_enc); 5635 } 5636 shlq(rtmp, 5); // for 32 byte permute row. 5637 if (opcode == Op_CompressV) { 5638 lea(rscratch, ExternalAddress(compress_perm_table)); 5639 } else { 5640 lea(rscratch, ExternalAddress(expand_perm_table)); 5641 } 5642 addptr(rtmp, rscratch); 5643 vmovdqu(permv, Address(rtmp)); 5644 vpermps(dst, permv, src, Assembler::AVX_256bit); 5645 vpxor(xtmp, xtmp, xtmp, vec_enc); 5646 // Blend the result with zero vector using permute mask, each column entry 5647 // in a permute table row contains either a valid permute index or a -1 (default) 5648 // value, this can potentially be used as a blending mask after 5649 // compressing/expanding the source vector lanes. 
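  // Illustrative example (assumed table layout): for an 8 x 32-bit CompressV with
  // only mask lanes 0 and 2 set, the selected 32-byte row would hold the permute
  // indices {0, 2, -1, -1, -1, -1, -1, -1}; lanes 0 and 2 are packed to the front
  // and the remaining positions, whose index dwords have the sign bit set, are
  // zeroed by the variable blend below.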
5650 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5651 } 5652 5653 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5654 bool merge, BasicType bt, int vec_enc) { 5655 if (opcode == Op_CompressV) { 5656 switch(bt) { 5657 case T_BYTE: 5658 evpcompressb(dst, mask, src, merge, vec_enc); 5659 break; 5660 case T_CHAR: 5661 case T_SHORT: 5662 evpcompressw(dst, mask, src, merge, vec_enc); 5663 break; 5664 case T_INT: 5665 evpcompressd(dst, mask, src, merge, vec_enc); 5666 break; 5667 case T_FLOAT: 5668 evcompressps(dst, mask, src, merge, vec_enc); 5669 break; 5670 case T_LONG: 5671 evpcompressq(dst, mask, src, merge, vec_enc); 5672 break; 5673 case T_DOUBLE: 5674 evcompresspd(dst, mask, src, merge, vec_enc); 5675 break; 5676 default: 5677 fatal("Unsupported type %s", type2name(bt)); 5678 break; 5679 } 5680 } else { 5681 assert(opcode == Op_ExpandV, ""); 5682 switch(bt) { 5683 case T_BYTE: 5684 evpexpandb(dst, mask, src, merge, vec_enc); 5685 break; 5686 case T_CHAR: 5687 case T_SHORT: 5688 evpexpandw(dst, mask, src, merge, vec_enc); 5689 break; 5690 case T_INT: 5691 evpexpandd(dst, mask, src, merge, vec_enc); 5692 break; 5693 case T_FLOAT: 5694 evexpandps(dst, mask, src, merge, vec_enc); 5695 break; 5696 case T_LONG: 5697 evpexpandq(dst, mask, src, merge, vec_enc); 5698 break; 5699 case T_DOUBLE: 5700 evexpandpd(dst, mask, src, merge, vec_enc); 5701 break; 5702 default: 5703 fatal("Unsupported type %s", type2name(bt)); 5704 break; 5705 } 5706 } 5707 } 5708 5709 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5710 KRegister ktmp1, int vec_enc) { 5711 if (opcode == Op_SignumVD) { 5712 vsubpd(dst, zero, one, vec_enc); 5713 // if src < 0 ? -1 : 1 5714 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5715 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5716 // if src == NaN, -0.0 or 0.0 return src. 5717 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5718 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5719 } else { 5720 assert(opcode == Op_SignumVF, ""); 5721 vsubps(dst, zero, one, vec_enc); 5722 // if src < 0 ? -1 : 1 5723 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5724 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5725 // if src == NaN, -0.0 or 0.0 return src. 5726 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5727 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5728 } 5729 } 5730 5731 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5732 XMMRegister xtmp1, int vec_enc) { 5733 if (opcode == Op_SignumVD) { 5734 vsubpd(dst, zero, one, vec_enc); 5735 // if src < 0 ? -1 : 1 5736 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5737 // if src == NaN, -0.0 or 0.0 return src. 5738 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5739 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5740 } else { 5741 assert(opcode == Op_SignumVF, ""); 5742 vsubps(dst, zero, one, vec_enc); 5743 // if src < 0 ? -1 : 1 5744 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5745 // if src == NaN, -0.0 or 0.0 return src. 
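    // Overall this matches Math.signum(float) semantics, e.g. signum(-3.5f) == -1.0f,
    // signum(0.0f) == 0.0f, signum(-0.0f) == -0.0f, and signum(Float.NaN) is NaN.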
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
      (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  } else {
    movptr(rtmp, imm32);
    movq(dst, rtmp);
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  }
}

//
// The following is a lookup-table-based popcount computation algorithm:
//          Index   Bit set count
//     [ 0000 -> 0,
//       0001 -> 1,
//       0010 -> 1,
//       0011 -> 2,
//       0100 -> 1,
//       0101 -> 2,
//       0110 -> 2,
//       0111 -> 3,
//       1000 -> 1,
//       1001 -> 2,
//       1010 -> 2,
//       1011 -> 3,
//       1100 -> 2,
//       1101 -> 3,
//       1110 -> 3,
//       1111 -> 4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bit-set counts of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bit-set counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128-bit vector lane.
// g. Pack the bit-set counts of the quadwords back to double words.
// h. Unpacking and packing operations are not needed for 64-bit vector lanes.
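//
// A minimal scalar sketch of steps a-d for a single byte (illustrative only,
// using the same 16-entry table listed above):
//
//   static const uint8_t POPCNT_LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   static uint8_t popcount_byte(uint8_t b) {
//     return POPCNT_LUT[b & 0x0F] + POPCNT_LUT[(b >> 4) & 0x0F];
//   }
//
// The vector code below performs the same two table lookups with vpshufb and the
// final addition with vpaddb, for every byte lane at once.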
5826 5827 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5828 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5829 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5830 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5831 vpsrlw(dst, src, 4, vec_enc); 5832 vpand(dst, dst, xtmp1, vec_enc); 5833 vpand(xtmp1, src, xtmp1, vec_enc); 5834 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5835 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5836 vpshufb(dst, xtmp2, dst, vec_enc); 5837 vpaddb(dst, dst, xtmp1, vec_enc); 5838 } 5839 5840 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5841 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5842 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5843 // Following code is as per steps e,f,g and h of above algorithm. 5844 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5845 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5846 vpsadbw(dst, dst, xtmp2, vec_enc); 5847 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5848 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5849 vpackuswb(dst, xtmp1, dst, vec_enc); 5850 } 5851 5852 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5853 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5854 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5855 // Add the popcount of upper and lower bytes of word. 5856 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5857 vpsrlw(dst, xtmp1, 8, vec_enc); 5858 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5859 vpaddw(dst, dst, xtmp1, vec_enc); 5860 } 5861 5862 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5863 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5864 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5865 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5866 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5867 } 5868 5869 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5870 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5871 switch(bt) { 5872 case T_LONG: 5873 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5874 break; 5875 case T_INT: 5876 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5877 break; 5878 case T_CHAR: 5879 case T_SHORT: 5880 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5881 break; 5882 case T_BYTE: 5883 case T_BOOLEAN: 5884 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5885 break; 5886 default: 5887 fatal("Unsupported type %s", type2name(bt)); 5888 break; 5889 } 5890 } 5891 5892 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5893 KRegister mask, bool merge, int vec_enc) { 5894 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5895 switch(bt) { 5896 case T_LONG: 5897 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5898 evpopcntq(dst, mask, src, merge, vec_enc); 5899 break; 5900 case T_INT: 5901 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5902 evpopcntd(dst, mask, src, merge, vec_enc); 5903 break; 5904 case T_CHAR: 5905 case T_SHORT: 5906 assert(VM_Version::supports_avx512_bitalg(), ""); 5907 evpopcntw(dst, mask, src, merge, vec_enc); 5908 break; 5909 case T_BYTE: 5910 case T_BOOLEAN: 5911 assert(VM_Version::supports_avx512_bitalg(), ""); 5912 evpopcntb(dst, mask, 
src, merge, vec_enc); 5913 break; 5914 default: 5915 fatal("Unsupported type %s", type2name(bt)); 5916 break; 5917 } 5918 } 5919 5920 // Bit reversal algorithm first reverses the bits of each byte followed by 5921 // a byte level reversal for multi-byte primitive types (short/int/long). 5922 // Algorithm performs a lookup table access to get reverse bit sequence 5923 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5924 // is obtained by swapping the reverse bit sequences of upper and lower 5925 // nibble of a byte. 5926 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5927 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5928 if (VM_Version::supports_avx512vlbw()) { 5929 5930 // Get the reverse bit sequence of lower nibble of each byte. 5931 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5932 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5933 evpandq(dst, xtmp2, src, vec_enc); 5934 vpshufb(dst, xtmp1, dst, vec_enc); 5935 vpsllq(dst, dst, 4, vec_enc); 5936 5937 // Get the reverse bit sequence of upper nibble of each byte. 5938 vpandn(xtmp2, xtmp2, src, vec_enc); 5939 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5940 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5941 5942 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5943 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5944 evporq(xtmp2, dst, xtmp2, vec_enc); 5945 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5946 5947 } else if(vec_enc == Assembler::AVX_512bit) { 5948 // Shift based bit reversal. 5949 assert(bt == T_LONG || bt == T_INT, ""); 5950 5951 // Swap lower and upper nibble of each byte. 5952 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5953 5954 // Swap two least and most significant bits of each nibble. 5955 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5956 5957 // Swap adjacent pair of bits. 5958 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5959 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5960 5961 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5962 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5963 } else { 5964 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5965 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5966 5967 // Get the reverse bit sequence of lower nibble of each byte. 5968 vpand(dst, xtmp2, src, vec_enc); 5969 vpshufb(dst, xtmp1, dst, vec_enc); 5970 vpsllq(dst, dst, 4, vec_enc); 5971 5972 // Get the reverse bit sequence of upper nibble of each byte. 5973 vpandn(xtmp2, xtmp2, src, vec_enc); 5974 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5975 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5976 5977 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5978 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5979 vpor(xtmp2, dst, xtmp2, vec_enc); 5980 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5981 } 5982 } 5983 5984 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5985 XMMRegister xtmp, Register rscratch) { 5986 assert(VM_Version::supports_gfni(), ""); 5987 assert(rscratch != noreg || always_reachable(mask), "missing"); 5988 5989 // Galois field instruction based bit reversal based on following algorithm. 
5990 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5991 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5992 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5993 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5994 } 5995 5996 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5997 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5998 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5999 evpandq(dst, xtmp1, src, vec_enc); 6000 vpsllq(dst, dst, nbits, vec_enc); 6001 vpandn(xtmp1, xtmp1, src, vec_enc); 6002 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6003 evporq(dst, dst, xtmp1, vec_enc); 6004 } 6005 6006 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6007 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6008 // Shift based bit reversal. 6009 assert(VM_Version::supports_evex(), ""); 6010 switch(bt) { 6011 case T_LONG: 6012 // Swap upper and lower double word of each quad word. 6013 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6014 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6015 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6016 break; 6017 case T_INT: 6018 // Swap upper and lower word of each double word. 6019 evprord(xtmp1, k0, src, 16, true, vec_enc); 6020 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6021 break; 6022 case T_CHAR: 6023 case T_SHORT: 6024 // Swap upper and lower byte of each word. 6025 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6026 break; 6027 case T_BYTE: 6028 evmovdquq(dst, k0, src, true, vec_enc); 6029 break; 6030 default: 6031 fatal("Unsupported type %s", type2name(bt)); 6032 break; 6033 } 6034 } 6035 6036 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6037 if (bt == T_BYTE) { 6038 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6039 evmovdquq(dst, k0, src, true, vec_enc); 6040 } else { 6041 vmovdqu(dst, src); 6042 } 6043 return; 6044 } 6045 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6046 // pre-computed shuffle indices. 
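  // For example, for T_INT the permutation table is expected to hold the byte
  // indices {3, 2, 1, 0, 7, 6, 5, 4, ...} (repeated per 128-bit lane), so the
  // vpshufb below turns an int lane holding 0x11223344 into 0x44332211.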
6047 switch(bt) { 6048 case T_LONG: 6049 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6050 break; 6051 case T_INT: 6052 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6053 break; 6054 case T_CHAR: 6055 case T_SHORT: 6056 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6057 break; 6058 default: 6059 fatal("Unsupported type %s", type2name(bt)); 6060 break; 6061 } 6062 vpshufb(dst, src, dst, vec_enc); 6063 } 6064 6065 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6066 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6067 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6068 assert(is_integral_type(bt), ""); 6069 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6070 assert(VM_Version::supports_avx512cd(), ""); 6071 switch(bt) { 6072 case T_LONG: 6073 evplzcntq(dst, ktmp, src, merge, vec_enc); 6074 break; 6075 case T_INT: 6076 evplzcntd(dst, ktmp, src, merge, vec_enc); 6077 break; 6078 case T_SHORT: 6079 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6080 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6081 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6082 vpunpckhwd(dst, xtmp1, src, vec_enc); 6083 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6084 vpackusdw(dst, xtmp2, dst, vec_enc); 6085 break; 6086 case T_BYTE: 6087 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6088 // accessing the lookup table. 6089 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6090 // accessing the lookup table. 6091 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6092 assert(VM_Version::supports_avx512bw(), ""); 6093 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6094 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6095 vpand(xtmp2, dst, src, vec_enc); 6096 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6097 vpsrlw(xtmp3, src, 4, vec_enc); 6098 vpand(xtmp3, dst, xtmp3, vec_enc); 6099 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6100 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6101 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6102 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6103 break; 6104 default: 6105 fatal("Unsupported type %s", type2name(bt)); 6106 break; 6107 } 6108 } 6109 6110 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6111 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6112 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6113 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6114 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6115 // accessing the lookup table. 6116 vpand(dst, xtmp2, src, vec_enc); 6117 vpshufb(dst, xtmp1, dst, vec_enc); 6118 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6119 // accessing the lookup table. 6120 vpsrlw(xtmp3, src, 4, vec_enc); 6121 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6122 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6123 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
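  // Scalar equivalent for a single byte (illustrative only):
  //   // CLZ_LUT[v] = number of leading zeros in the 4-bit value v.
  //   static const uint8_t CLZ_LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  //   static uint8_t clz_byte(uint8_t b) {
  //     uint8_t hi = b >> 4, lo = b & 0x0F;
  //     return (hi == 0) ? CLZ_LUT[hi] + CLZ_LUT[lo] : CLZ_LUT[hi];
  //   }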
6124 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6125 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6126 vpaddb(dst, dst, xtmp2, vec_enc); 6127 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6128 } 6129 6130 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6131 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6132 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6133 // Add zero counts of lower byte and upper byte of a word if 6134 // upper byte holds a zero value. 6135 vpsrlw(xtmp3, src, 8, vec_enc); 6136 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6137 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6138 vpsllw(xtmp2, dst, 8, vec_enc); 6139 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6140 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6141 vpsrlw(dst, dst, 8, vec_enc); 6142 } 6143 6144 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6145 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6146 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6147 // hence biased exponent can be used to compute leading zero count as per 6148 // following formula:- 6149 // LZCNT = 31 - (biased_exp - 127) 6150 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6151 6152 // Broadcast 0xFF 6153 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6154 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6155 6156 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6157 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6158 // contributes to the leading number of zeros. 6159 vpsrld(xtmp2, src, 1, vec_enc); 6160 vpandn(xtmp3, xtmp2, src, vec_enc); 6161 6162 // Extract biased exponent. 6163 vcvtdq2ps(dst, xtmp3, vec_enc); 6164 vpsrld(dst, dst, 23, vec_enc); 6165 vpand(dst, dst, xtmp1, vec_enc); 6166 6167 // Broadcast 127. 6168 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6169 // Exponent = biased_exp - 127 6170 vpsubd(dst, dst, xtmp1, vec_enc); 6171 6172 // Exponent_plus_one = Exponent + 1 6173 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6174 vpaddd(dst, dst, xtmp3, vec_enc); 6175 6176 // Replace -ve exponent with zero, exponent is -ve when src 6177 // lane contains a zero value. 6178 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6179 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6180 6181 // Rematerialize broadcast 32. 6182 vpslld(xtmp1, xtmp3, 5, vec_enc); 6183 // Exponent is 32 if corresponding source lane contains max_int value. 6184 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6185 // LZCNT = 32 - exponent_plus_one 6186 vpsubd(dst, xtmp1, dst, vec_enc); 6187 6188 // Replace LZCNT with a value 1 if corresponding source lane 6189 // contains max_int value. 6190 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6191 6192 // Replace biased_exp with 0 if source lane value is less than zero. 6193 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6194 vblendvps(dst, dst, xtmp2, src, vec_enc); 6195 } 6196 6197 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6198 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6199 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6200 // Add zero counts of lower word and upper word of a double word if 6201 // upper word holds a zero value. 
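  // For example, for a doubleword lane holding 0x00001234 the upper word is zero,
  // so its count of 16 is added to the lower word's count (clz16(0x1234) == 3),
  // giving the expected doubleword count of 19.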
6202 vpsrld(xtmp3, src, 16, vec_enc); 6203 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6204 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6205 vpslld(xtmp2, dst, 16, vec_enc); 6206 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6207 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6208 vpsrld(dst, dst, 16, vec_enc); 6209 // Add zero counts of lower doubleword and upper doubleword of a 6210 // quadword if upper doubleword holds a zero value. 6211 vpsrlq(xtmp3, src, 32, vec_enc); 6212 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6213 vpsllq(xtmp2, dst, 32, vec_enc); 6214 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6215 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6216 vpsrlq(dst, dst, 32, vec_enc); 6217 } 6218 6219 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6220 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6221 Register rtmp, int vec_enc) { 6222 assert(is_integral_type(bt), "unexpected type"); 6223 assert(vec_enc < Assembler::AVX_512bit, ""); 6224 switch(bt) { 6225 case T_LONG: 6226 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6227 break; 6228 case T_INT: 6229 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6230 break; 6231 case T_SHORT: 6232 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6233 break; 6234 case T_BYTE: 6235 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6236 break; 6237 default: 6238 fatal("Unsupported type %s", type2name(bt)); 6239 break; 6240 } 6241 } 6242 6243 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6244 switch(bt) { 6245 case T_BYTE: 6246 vpsubb(dst, src1, src2, vec_enc); 6247 break; 6248 case T_SHORT: 6249 vpsubw(dst, src1, src2, vec_enc); 6250 break; 6251 case T_INT: 6252 vpsubd(dst, src1, src2, vec_enc); 6253 break; 6254 case T_LONG: 6255 vpsubq(dst, src1, src2, vec_enc); 6256 break; 6257 default: 6258 fatal("Unsupported type %s", type2name(bt)); 6259 break; 6260 } 6261 } 6262 6263 // Trailing zero count computation is based on leading zero count operation as per 6264 // following equation. All AVX3 targets support AVX512CD feature which offers 6265 // direct vector instruction to compute leading zero count. 
6266 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6267 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6268 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6269 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6270 assert(is_integral_type(bt), ""); 6271 // xtmp = -1 6272 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6273 // xtmp = xtmp + src 6274 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6275 // xtmp = xtmp & ~src 6276 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6277 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6278 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6279 vpsub(bt, dst, xtmp4, dst, vec_enc); 6280 } 6281 6282 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6283 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6284 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6285 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6286 assert(is_integral_type(bt), ""); 6287 // xtmp = 0 6288 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6289 // xtmp = 0 - src 6290 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6291 // xtmp = xtmp | src 6292 vpor(xtmp3, xtmp3, src, vec_enc); 6293 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6294 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6295 vpsub(bt, dst, xtmp1, dst, vec_enc); 6296 } 6297 6298 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6299 Label done; 6300 Label neg_divisor_fastpath; 6301 cmpl(divisor, 0); 6302 jccb(Assembler::less, neg_divisor_fastpath); 6303 xorl(rdx, rdx); 6304 divl(divisor); 6305 jmpb(done); 6306 bind(neg_divisor_fastpath); 6307 // Fastpath for divisor < 0: 6308 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6309 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6310 movl(rdx, rax); 6311 subl(rdx, divisor); 6312 if (VM_Version::supports_bmi1()) { 6313 andnl(rax, rdx, rax); 6314 } else { 6315 notl(rdx); 6316 andl(rax, rdx); 6317 } 6318 shrl(rax, 31); 6319 bind(done); 6320 } 6321 6322 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6323 Label done; 6324 Label neg_divisor_fastpath; 6325 cmpl(divisor, 0); 6326 jccb(Assembler::less, neg_divisor_fastpath); 6327 xorl(rdx, rdx); 6328 divl(divisor); 6329 jmpb(done); 6330 bind(neg_divisor_fastpath); 6331 // Fastpath when divisor < 0: 6332 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6333 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6334 movl(rdx, rax); 6335 subl(rax, divisor); 6336 if (VM_Version::supports_bmi1()) { 6337 andnl(rax, rax, rdx); 6338 } else { 6339 notl(rax); 6340 andl(rax, rdx); 6341 } 6342 sarl(rax, 31); 6343 andl(rax, divisor); 6344 subl(rdx, rax); 6345 bind(done); 6346 } 6347 6348 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6349 Label done; 6350 Label neg_divisor_fastpath; 6351 6352 cmpl(divisor, 0); 6353 jccb(Assembler::less, neg_divisor_fastpath); 6354 xorl(rdx, rdx); 6355 divl(divisor); 6356 jmpb(done); 6357 bind(neg_divisor_fastpath); 6358 // Fastpath for divisor < 0: 6359 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6360 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6361 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6362 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6363 movl(rdx, rax); 6364 subl(rax, divisor); 6365 if (VM_Version::supports_bmi1()) { 6366 andnl(rax, rax, rdx); 6367 } else { 6368 notl(rax); 6369 andl(rax, rdx); 6370 } 6371 movl(tmp, rax); 6372 shrl(rax, 31); // quotient 6373 sarl(tmp, 31); 6374 andl(tmp, divisor); 6375 subl(rdx, tmp); // remainder 6376 bind(done); 6377 } 6378 6379 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6380 XMMRegister xtmp2, Register rtmp) { 6381 if(VM_Version::supports_gfni()) { 6382 // Galois field instruction based bit reversal based on following algorithm. 6383 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6384 mov64(rtmp, 0x8040201008040201L); 6385 movq(xtmp1, src); 6386 movq(xtmp2, rtmp); 6387 gf2p8affineqb(xtmp1, xtmp2, 0); 6388 movq(dst, xtmp1); 6389 } else { 6390 // Swap even and odd numbered bits. 6391 movl(rtmp, src); 6392 andl(rtmp, 0x55555555); 6393 shll(rtmp, 1); 6394 movl(dst, src); 6395 andl(dst, 0xAAAAAAAA); 6396 shrl(dst, 1); 6397 orl(dst, rtmp); 6398 6399 // Swap LSB and MSB 2 bits of each nibble. 6400 movl(rtmp, dst); 6401 andl(rtmp, 0x33333333); 6402 shll(rtmp, 2); 6403 andl(dst, 0xCCCCCCCC); 6404 shrl(dst, 2); 6405 orl(dst, rtmp); 6406 6407 // Swap LSB and MSB 4 bits of each byte. 6408 movl(rtmp, dst); 6409 andl(rtmp, 0x0F0F0F0F); 6410 shll(rtmp, 4); 6411 andl(dst, 0xF0F0F0F0); 6412 shrl(dst, 4); 6413 orl(dst, rtmp); 6414 } 6415 bswapl(dst); 6416 } 6417 6418 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6419 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6420 if(VM_Version::supports_gfni()) { 6421 // Galois field instruction based bit reversal based on following algorithm. 6422 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6423 mov64(rtmp1, 0x8040201008040201L); 6424 movq(xtmp1, src); 6425 movq(xtmp2, rtmp1); 6426 gf2p8affineqb(xtmp1, xtmp2, 0); 6427 movq(dst, xtmp1); 6428 } else { 6429 // Swap even and odd numbered bits. 6430 movq(rtmp1, src); 6431 mov64(rtmp2, 0x5555555555555555L); 6432 andq(rtmp1, rtmp2); 6433 shlq(rtmp1, 1); 6434 movq(dst, src); 6435 notq(rtmp2); 6436 andq(dst, rtmp2); 6437 shrq(dst, 1); 6438 orq(dst, rtmp1); 6439 6440 // Swap LSB and MSB 2 bits of each nibble. 6441 movq(rtmp1, dst); 6442 mov64(rtmp2, 0x3333333333333333L); 6443 andq(rtmp1, rtmp2); 6444 shlq(rtmp1, 2); 6445 notq(rtmp2); 6446 andq(dst, rtmp2); 6447 shrq(dst, 2); 6448 orq(dst, rtmp1); 6449 6450 // Swap LSB and MSB 4 bits of each byte. 
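    // For reference, the full sequence (the two swaps above, the 4-bit swap below, and the
    // final byte swap) in scalar C. This is an illustrative sketch only, using the
    // GCC/Clang builtin __builtin_bswap64:
    //
    //   uint64_t reverse_bits64(uint64_t x) {
    //     x = ((x & 0x5555555555555555ULL) << 1) | ((x >> 1) & 0x5555555555555555ULL);
    //     x = ((x & 0x3333333333333333ULL) << 2) | ((x >> 2) & 0x3333333333333333ULL);
    //     x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
    //     return __builtin_bswap64(x);  // byte swap completes the bit reversal
    //   }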
6451 movq(rtmp1, dst); 6452 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6453 andq(rtmp1, rtmp2); 6454 shlq(rtmp1, 4); 6455 notq(rtmp2); 6456 andq(dst, rtmp2); 6457 shrq(dst, 4); 6458 orq(dst, rtmp1); 6459 } 6460 bswapq(dst); 6461 } 6462 6463 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6464 Label done; 6465 Label neg_divisor_fastpath; 6466 cmpq(divisor, 0); 6467 jccb(Assembler::less, neg_divisor_fastpath); 6468 xorl(rdx, rdx); 6469 divq(divisor); 6470 jmpb(done); 6471 bind(neg_divisor_fastpath); 6472 // Fastpath for divisor < 0: 6473 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6474 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6475 movq(rdx, rax); 6476 subq(rdx, divisor); 6477 if (VM_Version::supports_bmi1()) { 6478 andnq(rax, rdx, rax); 6479 } else { 6480 notq(rdx); 6481 andq(rax, rdx); 6482 } 6483 shrq(rax, 63); 6484 bind(done); 6485 } 6486 6487 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6488 Label done; 6489 Label neg_divisor_fastpath; 6490 cmpq(divisor, 0); 6491 jccb(Assembler::less, neg_divisor_fastpath); 6492 xorq(rdx, rdx); 6493 divq(divisor); 6494 jmp(done); 6495 bind(neg_divisor_fastpath); 6496 // Fastpath when divisor < 0: 6497 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6498 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6499 movq(rdx, rax); 6500 subq(rax, divisor); 6501 if (VM_Version::supports_bmi1()) { 6502 andnq(rax, rax, rdx); 6503 } else { 6504 notq(rax); 6505 andq(rax, rdx); 6506 } 6507 sarq(rax, 63); 6508 andq(rax, divisor); 6509 subq(rdx, rax); 6510 bind(done); 6511 } 6512 6513 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6514 Label done; 6515 Label neg_divisor_fastpath; 6516 cmpq(divisor, 0); 6517 jccb(Assembler::less, neg_divisor_fastpath); 6518 xorq(rdx, rdx); 6519 divq(divisor); 6520 jmp(done); 6521 bind(neg_divisor_fastpath); 6522 // Fastpath for divisor < 0: 6523 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6524 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6525 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6526 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6527 movq(rdx, rax); 6528 subq(rax, divisor); 6529 if (VM_Version::supports_bmi1()) { 6530 andnq(rax, rax, rdx); 6531 } else { 6532 notq(rax); 6533 andq(rax, rdx); 6534 } 6535 movq(tmp, rax); 6536 shrq(rax, 63); // quotient 6537 sarq(tmp, 63); 6538 andq(tmp, divisor); 6539 subq(rdx, tmp); // remainder 6540 bind(done); 6541 } 6542 6543 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6544 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6545 int vlen_enc) { 6546 assert(VM_Version::supports_avx512bw(), ""); 6547 // Byte shuffles are inlane operations and indices are determined using 6548 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6549 // normalized to index range 0-15. This makes sure that all the multiples 6550 // of an index value are placed at same relative position in 128 bit 6551 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6552 // will be 16th element in their respective 128 bit lanes. 
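  // A scalar model of the selection performed below (illustrative only; select_byte is a
  // hypothetical helper): for each destination byte the shuffle index picks one of the 64
  // source bytes, and the in-lane shuffle only ever sees the low 4 bits of that index.
  //
  //   uint8_t select_byte(const uint8_t src[64], uint8_t idx) {
  //     int lane   = (idx & 0x3F) >> 4;  // which 128-bit lane, 0..3
  //     int offset = idx & 0x0F;         // position inside that lane
  //     return src[lane * 16 + offset];
  //   }
  //
  // The code below materializes this by broadcasting one 128-bit lane at a time and merging
  // the bytes whose indices fall into that lane's 16-element range.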
6553 movl(rtmp, 16); 6554 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6555 6556 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6557 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6558 // original shuffle indices and move the shuffled lanes corresponding to true 6559 // mask to destination vector. 6560 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6561 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6562 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6563 6564 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6565 // and broadcasting second 128 bit lane. 6566 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6567 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6568 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6569 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6570 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6571 6572 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6573 // and broadcasting third 128 bit lane. 6574 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6575 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6576 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6577 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6578 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6579 6580 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6581 // and broadcasting third 128 bit lane. 6582 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6583 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6584 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6585 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6586 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6587 } 6588 6589 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6590 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6591 if (vlen_enc == AVX_128bit) { 6592 vpermilps(dst, src, shuffle, vlen_enc); 6593 } else if (bt == T_INT) { 6594 vpermd(dst, shuffle, src, vlen_enc); 6595 } else { 6596 assert(bt == T_FLOAT, ""); 6597 vpermps(dst, shuffle, src, vlen_enc); 6598 } 6599 } 6600 6601 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6602 switch(opcode) { 6603 case Op_AddHF: vaddsh(dst, src1, src2); break; 6604 case Op_SubHF: vsubsh(dst, src1, src2); break; 6605 case Op_MulHF: vmulsh(dst, src1, src2); break; 6606 case Op_DivHF: vdivsh(dst, src1, src2); break; 6607 default: assert(false, "%s", NodeClassNames[opcode]); break; 6608 } 6609 } 6610 6611 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6612 switch(elem_bt) { 6613 case T_BYTE: 6614 if (ideal_opc == Op_SaturatingAddV) { 6615 vpaddsb(dst, src1, src2, vlen_enc); 6616 } else { 6617 assert(ideal_opc == Op_SaturatingSubV, ""); 6618 vpsubsb(dst, src1, src2, vlen_enc); 6619 } 6620 break; 6621 case T_SHORT: 6622 if (ideal_opc == Op_SaturatingAddV) { 6623 vpaddsw(dst, src1, src2, vlen_enc); 6624 } else { 6625 assert(ideal_opc == Op_SaturatingSubV, ""); 6626 vpsubsw(dst, src1, src2, vlen_enc); 6627 } 6628 break; 6629 default: 6630 fatal("Unsupported type %s", type2name(elem_bt)); 6631 break; 6632 } 6633 } 6634 6635 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 
XMMRegister src2, int vlen_enc) { 6636 switch(elem_bt) { 6637 case T_BYTE: 6638 if (ideal_opc == Op_SaturatingAddV) { 6639 vpaddusb(dst, src1, src2, vlen_enc); 6640 } else { 6641 assert(ideal_opc == Op_SaturatingSubV, ""); 6642 vpsubusb(dst, src1, src2, vlen_enc); 6643 } 6644 break; 6645 case T_SHORT: 6646 if (ideal_opc == Op_SaturatingAddV) { 6647 vpaddusw(dst, src1, src2, vlen_enc); 6648 } else { 6649 assert(ideal_opc == Op_SaturatingSubV, ""); 6650 vpsubusw(dst, src1, src2, vlen_enc); 6651 } 6652 break; 6653 default: 6654 fatal("Unsupported type %s", type2name(elem_bt)); 6655 break; 6656 } 6657 } 6658 6659 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6660 XMMRegister src2, KRegister ktmp, int vlen_enc) { 6661 // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input. 6662 // overflow_mask = Inp1 <u Inp2 6663 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc); 6664 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative) 6665 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false); 6666 } 6667 6668 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6669 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6670 // Emulate unsigned comparison using signed comparison 6671 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE 6672 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true); 6673 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc); 6674 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc); 6675 6676 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc); 6677 6678 // Res = INP1 - INP2 (non-commutative and non-associative) 6679 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6680 // Res = Mask ? Zero : Res 6681 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); 6682 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc); 6683 } 6684 6685 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6686 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) { 6687 // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation. 6688 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2) 6689 // Res = Signed Add INP1, INP2 6690 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6691 // T1 = SRC1 | SRC2 6692 vpor(xtmp1, src1, src2, vlen_enc); 6693 // Max_Unsigned = -1 6694 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6695 // Unsigned compare: Mask = Res <u T1 6696 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc); 6697 // res = Mask ? Max_Unsigned : Res 6698 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc); 6699 } 6700 6701 // 6702 // Section 2-13 Hacker's Delight list following overflow detection check for saturating 6703 // unsigned addition operation. 6704 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1 6705 // 6706 // We empirically determined its semantic equivalence to following reduced expression 6707 // overflow_mask = (a + b) <u (a | b) 6708 // 6709 // and also verified it though Alive2 solver. 
6710 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6711 // 6712 6713 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6714 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6715 // Res = Signed Add INP1, INP2 6716 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6717 // Compute T1 = INP1 | INP2 6718 vpor(xtmp3, src1, src2, vlen_enc); 6719 // T1 = Minimum signed value. 6720 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6721 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6722 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6723 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6724 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6725 // Compute overflow detection mask = Res<1> <s T1 6726 if (elem_bt == T_INT) { 6727 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6728 } else { 6729 assert(elem_bt == T_LONG, ""); 6730 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6731 } 6732 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6733 } 6734 6735 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6736 int vlen_enc, bool xtmp2_hold_M1) { 6737 if (VM_Version::supports_avx512dq()) { 6738 evpmovq2m(ktmp, src, vlen_enc); 6739 } else { 6740 assert(VM_Version::supports_evex(), ""); 6741 if (!xtmp2_hold_M1) { 6742 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6743 } 6744 evpsraq(xtmp1, src, 63, vlen_enc); 6745 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6746 } 6747 } 6748 6749 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6750 int vlen_enc, bool xtmp2_hold_M1) { 6751 if (VM_Version::supports_avx512dq()) { 6752 evpmovd2m(ktmp, src, vlen_enc); 6753 } else { 6754 assert(VM_Version::supports_evex(), ""); 6755 if (!xtmp2_hold_M1) { 6756 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6757 } 6758 vpsrad(xtmp1, src, 31, vlen_enc); 6759 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6760 } 6761 } 6762 6763 6764 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6765 if (elem_bt == T_LONG) { 6766 if (VM_Version::supports_evex()) { 6767 evpsraq(dst, src, 63, vlen_enc); 6768 } else { 6769 vpsrad(dst, src, 31, vlen_enc); 6770 vpshufd(dst, dst, 0xF5, vlen_enc); 6771 } 6772 } else { 6773 assert(elem_bt == T_INT, ""); 6774 vpsrad(dst, src, 31, vlen_enc); 6775 } 6776 } 6777 6778 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6779 if (compute_allones) { 6780 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6781 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6782 } else { 6783 vpcmpeqq(allones, allones, allones, vlen_enc); 6784 } 6785 } 6786 if (elem_bt == T_LONG) { 6787 vpsrlq(dst, allones, 1, vlen_enc); 6788 } else { 6789 assert(elem_bt == T_INT, ""); 6790 vpsrld(dst, allones, 1, vlen_enc); 6791 } 6792 } 6793 6794 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6795 if (compute_allones) { 6796 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6797 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6798 } else { 6799 vpcmpeqq(allones, allones, allones, vlen_enc); 6800 } 6801 } 6802 if (elem_bt == T_LONG) { 6803 vpsllq(dst, allones, 63, vlen_enc); 6804 } else { 6805 assert(elem_bt == T_INT, ""); 6806 
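    // All-ones shifted left by 31 bits leaves only the sign bit set, i.e. 0x80000000,
    // which is Integer.MIN_VALUE in every lane (the T_LONG branch above does the same
    // with a shift by 63).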
vpslld(dst, allones, 31, vlen_enc); 6807 } 6808 } 6809 6810 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6811 Assembler::ComparisonPredicate cond, int vlen_enc) { 6812 switch(elem_bt) { 6813 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6814 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6815 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6816 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6817 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6818 } 6819 } 6820 6821 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6822 switch(elem_bt) { 6823 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break; 6824 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break; 6825 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break; 6826 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break; 6827 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6828 } 6829 } 6830 6831 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1, 6832 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) { 6833 if (elem_bt == T_LONG) { 6834 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6835 } else { 6836 assert(elem_bt == T_INT, ""); 6837 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6838 } 6839 } 6840 6841 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6842 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6843 KRegister ktmp1, KRegister ktmp2, int vlen_enc) { 6844 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6845 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness. 6846 // Overflow detection based on Hacker's delight section 2-13. 6847 if (ideal_opc == Op_SaturatingAddV) { 6848 // res = src1 + src2 6849 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6850 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6851 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6852 vpxor(xtmp1, dst, src1, vlen_enc); 6853 vpxor(xtmp2, dst, src2, vlen_enc); 6854 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6855 } else { 6856 assert(ideal_opc == Op_SaturatingSubV, ""); 6857 // res = src1 - src2 6858 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6859 // Overflow occurs when both inputs have opposite polarity and 6860 // result polarity does not comply with first input polarity. 6861 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6862 vpxor(xtmp1, src1, src2, vlen_enc); 6863 vpxor(xtmp2, dst, src1, vlen_enc); 6864 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6865 } 6866 6867 // Compute overflow detection mask. 6868 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc); 6869 // Note: xtmp1 hold -1 in all its lanes after above call. 6870 6871 // Compute mask based on first input polarity. 6872 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true); 6873 6874 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6875 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6876 6877 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to 6878 // set bits in first input polarity mask holds a min value. 
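  // In scalar terms, the saturation value is chosen from the sign of the first input
  // (illustrative sketch; saturation_value is a hypothetical helper):
  //
  //   int32_t saturation_value(int32_t src1_lane) {
  //     // A negative first input saturates toward MIN_VALUE, a non-negative one toward MAX_VALUE.
  //     return (src1_lane < 0) ? INT32_MIN : INT32_MAX;
  //   }
  //
  // vpgenmin/max_value build these constants from all-ones: MIN is all-ones shifted left by
  // 31/63 bits, MAX is all-ones shifted right logically by one bit.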
6879 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc); 6880 // Blend destination lanes with saturated values using overflow detection mask. 6881 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc); 6882 } 6883 6884 6885 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6886 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6887 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) { 6888 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6889 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness. 6890 // Overflow detection is based on Hacker's Delight section 2-13. 6891 if (ideal_opc == Op_SaturatingAddV) { 6892 // res = src1 + src2 6893 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6894 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6895 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6896 vpxor(xtmp1, dst, src1, vlen_enc); 6897 vpxor(xtmp2, dst, src2, vlen_enc); 6898 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6899 } else { 6900 assert(ideal_opc == Op_SaturatingSubV, ""); 6901 // res = src1 - src2 6902 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6903 // Overflow occurs when both inputs have opposite polarity and 6904 // result polarity does not comply with first input polarity. 6905 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6906 vpxor(xtmp1, src1, src2, vlen_enc); 6907 vpxor(xtmp2, dst, src1, vlen_enc); 6908 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6909 } 6910 6911 // Sign-extend to compute overflow detection mask. 6912 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc); 6913 6914 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc); 6915 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc); 6916 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6917 6918 // Compose saturating min/max vector using first input polarity mask. 6919 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc); 6920 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc); 6921 6922 // Blend result with saturating vector using overflow detection mask.
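  // End-to-end scalar equivalent of the sequence above (illustrative sketch over 32-bit
  // lanes; sat_add is a hypothetical helper, not part of this file):
  //
  //   int32_t sat_add(int32_t a, int32_t b) {
  //     int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);    // wrap-around add
  //     bool overflow = ((res ^ a) & (res ^ b)) < 0;           // Hacker's Delight 2-13
  //     return overflow ? ((a < 0) ? INT32_MIN : INT32_MAX) : res;
  //   }
  //
  // The subtraction flavour differs only in the overflow predicate:
  //   overflow = ((a ^ b) & (res ^ a)) < 0, with res = a - b.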
6923 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6924 } 6925 6926 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6927 switch(elem_bt) { 6928 case T_BYTE: 6929 if (ideal_opc == Op_SaturatingAddV) { 6930 vpaddsb(dst, src1, src2, vlen_enc); 6931 } else { 6932 assert(ideal_opc == Op_SaturatingSubV, ""); 6933 vpsubsb(dst, src1, src2, vlen_enc); 6934 } 6935 break; 6936 case T_SHORT: 6937 if (ideal_opc == Op_SaturatingAddV) { 6938 vpaddsw(dst, src1, src2, vlen_enc); 6939 } else { 6940 assert(ideal_opc == Op_SaturatingSubV, ""); 6941 vpsubsw(dst, src1, src2, vlen_enc); 6942 } 6943 break; 6944 default: 6945 fatal("Unsupported type %s", type2name(elem_bt)); 6946 break; 6947 } 6948 } 6949 6950 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6951 switch(elem_bt) { 6952 case T_BYTE: 6953 if (ideal_opc == Op_SaturatingAddV) { 6954 vpaddusb(dst, src1, src2, vlen_enc); 6955 } else { 6956 assert(ideal_opc == Op_SaturatingSubV, ""); 6957 vpsubusb(dst, src1, src2, vlen_enc); 6958 } 6959 break; 6960 case T_SHORT: 6961 if (ideal_opc == Op_SaturatingAddV) { 6962 vpaddusw(dst, src1, src2, vlen_enc); 6963 } else { 6964 assert(ideal_opc == Op_SaturatingSubV, ""); 6965 vpsubusw(dst, src1, src2, vlen_enc); 6966 } 6967 break; 6968 default: 6969 fatal("Unsupported type %s", type2name(elem_bt)); 6970 break; 6971 } 6972 } 6973 6974 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6975 XMMRegister src2, int vlen_enc) { 6976 switch(elem_bt) { 6977 case T_BYTE: 6978 evpermi2b(dst, src1, src2, vlen_enc); 6979 break; 6980 case T_SHORT: 6981 evpermi2w(dst, src1, src2, vlen_enc); 6982 break; 6983 case T_INT: 6984 evpermi2d(dst, src1, src2, vlen_enc); 6985 break; 6986 case T_LONG: 6987 evpermi2q(dst, src1, src2, vlen_enc); 6988 break; 6989 case T_FLOAT: 6990 evpermi2ps(dst, src1, src2, vlen_enc); 6991 break; 6992 case T_DOUBLE: 6993 evpermi2pd(dst, src1, src2, vlen_enc); 6994 break; 6995 default: 6996 fatal("Unsupported type %s", type2name(elem_bt)); 6997 break; 6998 } 6999 } 7000 7001 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7002 if (is_unsigned) { 7003 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7004 } else { 7005 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7006 } 7007 } 7008 7009 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7010 if (is_unsigned) { 7011 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7012 } else { 7013 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7014 } 7015 } 7016 7017 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 7018 switch(opcode) { 7019 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7020 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7021 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7022 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7023 default: assert(false, "%s", NodeClassNames[opcode]); break; 7024 } 7025 } 7026 7027 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
XMMRegister src1, Address src2, int vlen_enc) { 7028 switch(opcode) { 7029 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7030 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7031 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7032 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7033 default: assert(false, "%s", NodeClassNames[opcode]); break; 7034 } 7035 } 7036 7037 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, 7038 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) { 7039 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit); 7040 } 7041 7042 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, 7043 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 7044 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) { 7045 // Move sign bits of src2 to mask register. 7046 evpmovw2m(ktmp, src2, vlen_enc); 7047 // xtmp1 = src2 < 0 ? src2 : src1 7048 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc); 7049 // xtmp2 = src2 < 0 ? src1 : src2 7050 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc); 7051 // The idea behind the above swapping is to make the second source operand a +ve value. 7052 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in 7053 // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction, 7054 // the second source operand, either a NaN or a valid floating-point value, is returned. 7055 // dst = max(xtmp1, xtmp2) 7056 evmaxph(dst, xtmp1, xtmp2, vlen_enc); 7057 // isNaN = is_unordered_quiet(xtmp1) 7058 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc); 7059 // The final result is the same as the first source if it is a NaN value; 7060 // in case the second operand holds a NaN value then, as per the above semantics, 7061 // the result is the same as the second operand. 7062 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc); 7063 } else { 7064 assert(opcode == Op_MinVHF || opcode == Op_MinHF, ""); 7065 // Move sign bits of src1 to mask register. 7066 evpmovw2m(ktmp, src1, vlen_enc); 7067 // xtmp1 = src1 < 0 ? src2 : src1 7068 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc); 7069 // xtmp2 = src1 < 0 ? src1 : src2 7070 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc); 7071 // The idea behind the above swapping is to make the second source operand a -ve value. 7072 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in 7073 // the second source operand is returned. 7074 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN 7075 // or a valid floating-point value, is written to the result. 7076 // dst = min(xtmp1, xtmp2) 7077 evminph(dst, xtmp1, xtmp2, vlen_enc); 7078 // isNaN = is_unordered_quiet(xtmp1) 7079 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc); 7080 // The final result is the same as the first source if it is a NaN value; 7081 // in case the second operand holds a NaN value then, as per the above semantics, 7082 // the result is the same as the second operand. 7083 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc); 7084 } 7085 }
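// For reference, the max semantics that the swap/blend sequences above implement (NaN in
// either input propagates, +0.0 is treated as greater than -0.0), written as a scalar
// sketch over float; float16 behaves analogously. java_style_max is a hypothetical helper
// and std::signbit comes from <cmath>; this is illustrative only:
//
//   float java_style_max(float a, float b) {
//     if (a != a) return a;                      // first operand NaN
//     if (b != b) return b;                      // second operand NaN
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? b : a;          // prefer +0.0 over -0.0
//     }
//     return (a > b) ? a : b;
//   }
//
// The min flavour mirrors this with the comparison reversed and -0.0 preferred over +0.0.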