1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 53 if (C->clinit_barrier_on_entry()) { 54 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 56 57 Label L_skip_barrier; 58 Register klass = rscratch1; 59 60 mov_metadata(klass, C->method()->holder()->constant_encoding()); 61 clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 62 63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 64 65 bind(L_skip_barrier); 66 } 67 68 int framesize = C->output()->frame_size_in_bytes(); 69 int bangsize = C->output()->bang_size_in_bytes(); 70 bool fp_mode_24b = false; 71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 72 73 // WARNING: Initial instruction MUST be 5 bytes or longer so that 74 // NativeJump::patch_verified_entry will be able to patch out the entry 75 // code safely. The push to verify stack depth is ok at 5 bytes, 76 // the frame allocation can be either 3 or 6 bytes. So if we don't do 77 // stack bang then we must use the 6 byte frame allocation even if 78 // we have no frame. :-( 79 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 80 81 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 82 // Remove word for return addr 83 framesize -= wordSize; 84 stack_bang_size -= wordSize; 85 86 // Calls to C2R adapters often do not accept exceptional returns. 
87 // We require that their callers must bang for them. But be careful, because 88 // some VM calls (such as call site linkage) can use several kilobytes of 89 // stack. But the stack safety zone should account for that. 90 // See bugs 4446381, 4468289, 4497237. 91 if (stack_bang_size > 0) { 92 generate_stack_overflow_check(stack_bang_size); 93 94 // We always push rbp, so that on return to interpreter rbp, will be 95 // restored correctly and we can correct the stack. 96 push(rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 mov(rbp, rsp); 100 } 101 // Remove word for ebp 102 framesize -= wordSize; 103 104 // Create frame 105 if (framesize) { 106 subptr(rsp, framesize); 107 } 108 } else { 109 // Create frame (force generation of a 4 byte immediate value) 110 subptr_imm32(rsp, framesize); 111 112 // Save RBP register now. 113 framesize -= wordSize; 114 movptr(Address(rsp, framesize), rbp); 115 // Save caller's stack pointer into RBP if the frame pointer is preserved. 116 if (PreserveFramePointer) { 117 movptr(rbp, rsp); 118 if (framesize > 0) { 119 addptr(rbp, framesize); 120 } 121 } 122 } 123 124 if (C->needs_stack_repair()) { 125 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 126 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 127 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 128 } 129 130 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 131 framesize -= wordSize; 132 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 133 } 134 135 #ifdef ASSERT 136 if (VerifyStackAtCalls) { 137 Label L; 138 push(rax); 139 mov(rax, rsp); 140 andptr(rax, StackAlignmentInBytes-1); 141 cmpptr(rax, StackAlignmentInBytes-wordSize); 142 pop(rax); 143 jcc(Assembler::equal, L); 144 STOP("Stack is not properly aligned!"); 145 bind(L); 146 } 147 #endif 148 } 149 150 void C2_MacroAssembler::entry_barrier() { 151 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 152 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 153 Label dummy_slow_path; 154 Label dummy_continuation; 155 Label* slow_path = &dummy_slow_path; 156 Label* continuation = &dummy_continuation; 157 if (!Compile::current()->output()->in_scratch_emit_size()) { 158 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 159 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 160 Compile::current()->output()->add_stub(stub); 161 slow_path = &stub->entry(); 162 continuation = &stub->continuation(); 163 } 164 bs->nmethod_entry_barrier(this, slow_path, continuation); 165 } 166 167 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 168 switch (vlen_in_bytes) { 169 case 4: // fall-through 170 case 8: // fall-through 171 case 16: return Assembler::AVX_128bit; 172 case 32: return Assembler::AVX_256bit; 173 case 64: return Assembler::AVX_512bit; 174 175 default: { 176 ShouldNotReachHere(); 177 return Assembler::AVX_NoVec; 178 } 179 } 180 } 181 182 // fast_lock and fast_unlock used by C2 183 184 // Because the transitions from emitted code to the runtime 185 // monitorenter/exit helper stubs are so slow it's critical that 186 // we inline both the stack-locking fast path and the inflated fast path. 187 // 188 // See also: cmpFastLock and cmpFastUnlock. 
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive runtime calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Updates tmpReg
    jcc(Assembler::equal, COUNT);  // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
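  // What the inflated-path lock attempt below does, as a rough C-like sketch
  // (illustrative only; field and helper names are approximations, not actual runtime API):
  //
  //   intptr_t self_id = current_thread->_monitor_owner_id;
  //   if (CAS(&monitor->_owner, /*expected*/ 0, /*new*/ self_id)) {
  //     // success: we now own the monitor (ZF = 1)
  //   } else if (monitor->_owner == self_id) {
  //     monitor->_recursions++;     // recursive re-entry, still success (ZF = 1)
  //   } else {
  //     // contended: leave ZF = 0 and let C2 branch to the slow path
  //   }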
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                  // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                       // Set ZF = 1 (success) for recursive lock
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
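// Both fast_lock and fast_unlock report success/failure solely through ZF; the matching
// cmpFastLock/cmpFastUnlock rules then branch on it. Roughly, the code C2 emits around
// each lock site looks like the following sketch (illustrative only, not the literal
// emitted sequence):
//
//   fast_lock(obj, box, ...);   // ZF = 1 on success, ZF = 0 on failure
//   jne  slow_path_stub;        // ZF == 0: call the runtime monitorenter/monitorexit helper
//   ...                         // ZF == 1: fall through with the lock held (or released)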
394 395 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 396 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 397 assert(boxReg == rax, ""); 398 assert_different_registers(objReg, boxReg, tmpReg); 399 400 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 401 402 if (LockingMode == LM_LEGACY) { 403 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 404 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 405 } 406 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 407 if (LockingMode != LM_MONITOR) { 408 testptr(tmpReg, markWord::monitor_value); // Inflated? 409 jcc(Assembler::zero, Stacked); 410 } 411 412 // It's inflated. 413 414 // Despite our balanced locking property we still check that m->_owner == Self 415 // as java routines or native JNI code called by this thread might 416 // have released the lock. 417 // 418 // If there's no contention try a 1-0 exit. That is, exit without 419 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 420 // we detect and recover from the race that the 1-0 exit admits. 421 // 422 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 423 // before it STs null into _owner, releasing the lock. Updates 424 // to data protected by the critical section must be visible before 425 // we drop the lock (and thus before any other thread could acquire 426 // the lock and observe the fields protected by the lock). 427 // IA32's memory-model is SPO, so STs are ordered with respect to 428 // each other and there's no need for an explicit barrier (fence). 429 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 430 Label LSuccess, LNotRecursive; 431 432 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 433 jccb(Assembler::equal, LNotRecursive); 434 435 // Recursive inflated unlock 436 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 437 jmpb(LSuccess); 438 439 bind(LNotRecursive); 440 441 // Set owner to null. 442 // Release to satisfy the JMM 443 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 444 // We need a full fence after clearing owner to avoid stranding. 445 // StoreLoad achieves this. 446 membar(StoreLoad); 447 448 // Check if the entry_list is empty. 449 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD); 450 jccb(Assembler::zero, LSuccess); // If so we are done. 451 452 // Check if there is a successor. 453 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 454 jccb(Assembler::notZero, LSuccess); // If so we are done. 455 456 // Save the monitor pointer in the current thread, so we can try to 457 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 
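  // (We only get here when the entry_list is non-empty but no successor has been
  // designated; silently dropping the lock could strand those waiters, so the slow
  // path tries to re-acquire the monitor and wake one of them.)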
458 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 459 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 460 461 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 462 jmpb (DONE_LABEL); 463 464 bind (LSuccess); 465 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 466 jmpb (DONE_LABEL); 467 468 if (LockingMode == LM_LEGACY) { 469 bind (Stacked); 470 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 471 lock(); 472 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 473 // Intentional fall-thru into DONE_LABEL 474 } 475 476 bind(DONE_LABEL); 477 478 // ZFlag == 1 count in fast path 479 // ZFlag == 0 count in slow path 480 jccb(Assembler::notZero, NO_COUNT); 481 482 bind(COUNT); 483 484 if (LockingMode == LM_LEGACY) { 485 // Count monitors in fast path 486 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 487 } 488 489 xorl(tmpReg, tmpReg); // Set ZF == 1 490 491 bind(NO_COUNT); 492 } 493 494 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 495 Register t, Register thread) { 496 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 497 assert(rax_reg == rax, "Used for CAS"); 498 assert_different_registers(obj, box, rax_reg, t, thread); 499 500 // Handle inflated monitor. 501 Label inflated; 502 // Finish fast lock successfully. ZF value is irrelevant. 503 Label locked; 504 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 505 Label slow_path; 506 507 if (UseObjectMonitorTable) { 508 // Clear cache in case fast locking succeeds or we need to take the slow-path. 509 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 510 } 511 512 if (DiagnoseSyncOnValueBasedClasses != 0) { 513 load_klass(rax_reg, obj, t); 514 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 515 jcc(Assembler::notZero, slow_path); 516 } 517 518 const Register mark = t; 519 520 { // Lightweight Lock 521 522 Label push; 523 524 const Register top = UseObjectMonitorTable ? rax_reg : box; 525 526 // Load the mark. 527 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 528 529 // Prefetch top. 530 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 531 532 // Check for monitor (0b10). 533 testptr(mark, markWord::monitor_value); 534 jcc(Assembler::notZero, inflated); 535 536 // Check if lock-stack is full. 537 cmpl(top, LockStack::end_offset() - 1); 538 jcc(Assembler::greater, slow_path); 539 540 // Check if recursive. 541 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 542 jccb(Assembler::equal, push); 543 544 // Try to lock. Transition lock bits 0b01 => 0b00 545 movptr(rax_reg, mark); 546 orptr(rax_reg, markWord::unlocked_value); 547 andptr(mark, ~(int32_t)markWord::unlocked_value); 548 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 549 jcc(Assembler::notEqual, slow_path); 550 551 if (UseObjectMonitorTable) { 552 // Need to reload top, clobbered by CAS. 553 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 554 } 555 bind(push); 556 // After successful lock, push object on lock-stack. 557 movptr(Address(thread, top), obj); 558 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 559 jmpb(locked); 560 } 561 562 { // Handle inflated monitor. 
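    // When UseObjectMonitorTable is enabled the mark word no longer carries the
    // ObjectMonitor*, so the code below probes the per-thread om_cache instead: a couple
    // of unrolled compares first, then a loop over oop/monitor pairs that stops at a null
    // sentinel (cache miss => slow path). Rough sketch (illustrative only; the entry type
    // name is hypothetical):
    //
    //   for (OMCacheEntry* e = thread->om_cache; e->oop != nullptr; e++) {
    //     if (e->oop == obj) { monitor = e->monitor; goto monitor_found; }
    //   }
    //   goto slow_path;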
563 bind(inflated); 564 565 const Register monitor = t; 566 567 if (!UseObjectMonitorTable) { 568 assert(mark == monitor, "should be the same here"); 569 } else { 570 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 571 // Fetch ObjectMonitor* from the cache or take the slow-path. 572 Label monitor_found; 573 574 // Load cache address 575 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 576 577 const int num_unrolled = 2; 578 for (int i = 0; i < num_unrolled; i++) { 579 cmpptr(obj, Address(t)); 580 jccb(Assembler::equal, monitor_found); 581 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 582 } 583 584 Label loop; 585 586 // Search for obj in cache. 587 bind(loop); 588 589 // Check for match. 590 cmpptr(obj, Address(t)); 591 jccb(Assembler::equal, monitor_found); 592 593 // Search until null encountered, guaranteed _null_sentinel at end. 594 cmpptr(Address(t), 1); 595 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 596 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 597 jmpb(loop); 598 599 // Cache hit. 600 bind(monitor_found); 601 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 602 } 603 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 604 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 605 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 606 607 Label monitor_locked; 608 // Lock the monitor. 609 610 if (UseObjectMonitorTable) { 611 // Cache the monitor for unlock before trashing box. On failure to acquire 612 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 613 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 614 } 615 616 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 617 xorptr(rax_reg, rax_reg); 618 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 619 lock(); cmpxchgptr(box, owner_address); 620 jccb(Assembler::equal, monitor_locked); 621 622 // Check if recursive. 623 cmpptr(box, rax_reg); 624 jccb(Assembler::notEqual, slow_path); 625 626 // Recursive. 627 increment(recursions_address); 628 629 bind(monitor_locked); 630 } 631 632 bind(locked); 633 // Set ZF = 1 634 xorl(rax_reg, rax_reg); 635 636 #ifdef ASSERT 637 // Check that locked label is reached with ZF set. 638 Label zf_correct; 639 Label zf_bad_zero; 640 jcc(Assembler::zero, zf_correct); 641 jmp(zf_bad_zero); 642 #endif 643 644 bind(slow_path); 645 #ifdef ASSERT 646 // Check that slow_path label is reached with ZF not set. 647 jcc(Assembler::notZero, zf_correct); 648 stop("Fast Lock ZF != 0"); 649 bind(zf_bad_zero); 650 stop("Fast Lock ZF != 1"); 651 bind(zf_correct); 652 #endif 653 // C2 uses the value of ZF to determine the continuation. 654 } 655 656 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 657 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 658 assert(reg_rax == rax, "Used for CAS"); 659 assert_different_registers(obj, reg_rax, t); 660 661 // Handle inflated monitor. 662 Label inflated, inflated_check_lock_stack; 663 // Finish fast unlock successfully. MUST jump with ZF == 1 664 Label unlocked, slow_path; 665 666 const Register mark = t; 667 const Register monitor = t; 668 const Register top = UseObjectMonitorTable ? 
t : reg_rax; 669 const Register box = reg_rax; 670 671 Label dummy; 672 C2FastUnlockLightweightStub* stub = nullptr; 673 674 if (!Compile::current()->output()->in_scratch_emit_size()) { 675 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 676 Compile::current()->output()->add_stub(stub); 677 } 678 679 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 680 681 { // Lightweight Unlock 682 683 // Load top. 684 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 685 686 if (!UseObjectMonitorTable) { 687 // Prefetch mark. 688 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 689 } 690 691 // Check if obj is top of lock-stack. 692 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 693 // Top of lock stack was not obj. Must be monitor. 694 jcc(Assembler::notEqual, inflated_check_lock_stack); 695 696 // Pop lock-stack. 697 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 698 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 699 700 // Check if recursive. 701 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 702 jcc(Assembler::equal, unlocked); 703 704 // We elide the monitor check, let the CAS fail instead. 705 706 if (UseObjectMonitorTable) { 707 // Load mark. 708 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 709 } 710 711 // Try to unlock. Transition lock bits 0b00 => 0b01 712 movptr(reg_rax, mark); 713 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 714 orptr(mark, markWord::unlocked_value); 715 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 716 jcc(Assembler::notEqual, push_and_slow_path); 717 jmp(unlocked); 718 } 719 720 721 { // Handle inflated monitor. 722 bind(inflated_check_lock_stack); 723 #ifdef ASSERT 724 Label check_done; 725 subl(top, oopSize); 726 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 727 jcc(Assembler::below, check_done); 728 cmpptr(obj, Address(thread, top)); 729 jccb(Assembler::notEqual, inflated_check_lock_stack); 730 stop("Fast Unlock lock on stack"); 731 bind(check_done); 732 if (UseObjectMonitorTable) { 733 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 734 } 735 testptr(mark, markWord::monitor_value); 736 jccb(Assembler::notZero, inflated); 737 stop("Fast Unlock not monitor"); 738 #endif 739 740 bind(inflated); 741 742 if (!UseObjectMonitorTable) { 743 assert(mark == monitor, "should be the same here"); 744 } else { 745 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 746 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 747 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 748 cmpptr(monitor, alignof(ObjectMonitor*)); 749 jcc(Assembler::below, slow_path); 750 } 751 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 752 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 753 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 754 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 755 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 756 757 Label recursive; 758 759 // Check if recursive. 760 cmpptr(recursions_address, 0); 761 jccb(Assembler::notZero, recursive); 762 763 // Set owner to null. 
764 // Release to satisfy the JMM 765 movptr(owner_address, NULL_WORD); 766 // We need a full fence after clearing owner to avoid stranding. 767 // StoreLoad achieves this. 768 membar(StoreLoad); 769 770 // Check if the entry_list is empty. 771 cmpptr(entry_list_address, NULL_WORD); 772 jccb(Assembler::zero, unlocked); // If so we are done. 773 774 // Check if there is a successor. 775 cmpptr(succ_address, NULL_WORD); 776 jccb(Assembler::notZero, unlocked); // If so we are done. 777 778 // Save the monitor pointer in the current thread, so we can try to 779 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 780 if (!UseObjectMonitorTable) { 781 andptr(monitor, ~(int32_t)markWord::monitor_value); 782 } 783 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 784 785 orl(t, 1); // Fast Unlock ZF = 0 786 jmpb(slow_path); 787 788 // Recursive unlock. 789 bind(recursive); 790 decrement(recursions_address); 791 } 792 793 bind(unlocked); 794 xorl(t, t); // Fast Unlock ZF = 1 795 796 #ifdef ASSERT 797 // Check that unlocked label is reached with ZF set. 798 Label zf_correct; 799 Label zf_bad_zero; 800 jcc(Assembler::zero, zf_correct); 801 jmp(zf_bad_zero); 802 #endif 803 804 bind(slow_path); 805 if (stub != nullptr) { 806 bind(stub->slow_path_continuation()); 807 } 808 #ifdef ASSERT 809 // Check that stub->continuation() label is reached with ZF not set. 810 jcc(Assembler::notZero, zf_correct); 811 stop("Fast Unlock ZF != 0"); 812 bind(zf_bad_zero); 813 stop("Fast Unlock ZF != 1"); 814 bind(zf_correct); 815 #endif 816 // C2 uses the value of ZF to determine the continuation. 817 } 818 819 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 820 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 821 } 822 823 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) { 824 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 825 masm->movptr(dst, rsp); 826 if (framesize > 2 * wordSize) { 827 masm->addptr(dst, framesize - 2 * wordSize); 828 } 829 } 830 831 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 832 if (PreserveFramePointer) { 833 // frame pointer is valid 834 #ifdef ASSERT 835 // Verify frame pointer value in rbp. 
836 reconstruct_frame_pointer_helper(this, rtmp); 837 Label L_success; 838 cmpq(rbp, rtmp); 839 jccb(Assembler::equal, L_success); 840 STOP("frame pointer mismatch"); 841 bind(L_success); 842 #endif // ASSERT 843 } else { 844 reconstruct_frame_pointer_helper(this, rbp); 845 } 846 } 847 848 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) { 849 jint lo = t->_lo; 850 jint hi = t->_hi; 851 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi); 852 if (t == TypeInt::INT) { 853 return; 854 } 855 856 BLOCK_COMMENT("CastII {"); 857 Label fail; 858 Label succeed; 859 if (hi == max_jint) { 860 cmpl(val, lo); 861 jccb(Assembler::greaterEqual, succeed); 862 } else { 863 if (lo != min_jint) { 864 cmpl(val, lo); 865 jccb(Assembler::less, fail); 866 } 867 cmpl(val, hi); 868 jccb(Assembler::lessEqual, succeed); 869 } 870 871 bind(fail); 872 movl(c_rarg0, idx); 873 movl(c_rarg1, val); 874 movl(c_rarg2, lo); 875 movl(c_rarg3, hi); 876 reconstruct_frame_pointer(rscratch1); 877 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range))); 878 hlt(); 879 bind(succeed); 880 BLOCK_COMMENT("} // CastII"); 881 } 882 883 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 884 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 885 } 886 887 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) { 888 jlong lo = t->_lo; 889 jlong hi = t->_hi; 890 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi); 891 if (t == TypeLong::LONG) { 892 return; 893 } 894 895 BLOCK_COMMENT("CastLL {"); 896 Label fail; 897 Label succeed; 898 899 auto cmp_val = [&](jlong bound) { 900 if (is_simm32(bound)) { 901 cmpq(val, checked_cast<int>(bound)); 902 } else { 903 mov64(tmp, bound); 904 cmpq(val, tmp); 905 } 906 }; 907 908 if (hi == max_jlong) { 909 cmp_val(lo); 910 jccb(Assembler::greaterEqual, succeed); 911 } else { 912 if (lo != min_jlong) { 913 cmp_val(lo); 914 jccb(Assembler::less, fail); 915 } 916 cmp_val(hi); 917 jccb(Assembler::lessEqual, succeed); 918 } 919 920 bind(fail); 921 movl(c_rarg0, idx); 922 movq(c_rarg1, val); 923 mov64(c_rarg2, lo); 924 mov64(c_rarg3, hi); 925 reconstruct_frame_pointer(rscratch1); 926 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range))); 927 hlt(); 928 bind(succeed); 929 BLOCK_COMMENT("} // CastLL"); 930 } 931 932 //------------------------------------------------------------------------------------------- 933 // Generic instructions support for use in .ad files C2 code generation 934 935 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 936 if (dst != src) { 937 movdqu(dst, src); 938 } 939 if (opcode == Op_AbsVD) { 940 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 941 } else { 942 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 943 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 944 } 945 } 946 947 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 948 if (opcode == Op_AbsVD) { 949 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 950 } else { 951 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 952 vxorpd(dst, src, 
ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 953 } 954 } 955 956 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 957 if (dst != src) { 958 movdqu(dst, src); 959 } 960 if (opcode == Op_AbsVF) { 961 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 962 } else { 963 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 964 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 965 } 966 } 967 968 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 969 if (opcode == Op_AbsVF) { 970 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 971 } else { 972 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 973 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 974 } 975 } 976 977 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 978 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 979 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 980 981 if (opcode == Op_MinV) { 982 if (elem_bt == T_BYTE) { 983 pminsb(dst, src); 984 } else if (elem_bt == T_SHORT) { 985 pminsw(dst, src); 986 } else if (elem_bt == T_INT) { 987 pminsd(dst, src); 988 } else { 989 assert(elem_bt == T_LONG, "required"); 990 assert(tmp == xmm0, "required"); 991 assert_different_registers(dst, src, tmp); 992 movdqu(xmm0, dst); 993 pcmpgtq(xmm0, src); 994 blendvpd(dst, src); // xmm0 as mask 995 } 996 } else { // opcode == Op_MaxV 997 if (elem_bt == T_BYTE) { 998 pmaxsb(dst, src); 999 } else if (elem_bt == T_SHORT) { 1000 pmaxsw(dst, src); 1001 } else if (elem_bt == T_INT) { 1002 pmaxsd(dst, src); 1003 } else { 1004 assert(elem_bt == T_LONG, "required"); 1005 assert(tmp == xmm0, "required"); 1006 assert_different_registers(dst, src, tmp); 1007 movdqu(xmm0, src); 1008 pcmpgtq(xmm0, dst); 1009 blendvpd(dst, src); // xmm0 as mask 1010 } 1011 } 1012 } 1013 1014 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 1015 XMMRegister src1, Address src2, int vlen_enc) { 1016 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 1017 if (opcode == Op_UMinV) { 1018 switch(elem_bt) { 1019 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 1020 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 1021 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 1022 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 1023 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1024 } 1025 } else { 1026 assert(opcode == Op_UMaxV, "required"); 1027 switch(elem_bt) { 1028 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 1029 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 1030 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 1031 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 1032 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1033 } 1034 } 1035 } 1036 1037 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 1038 // For optimality, leverage a full vector width of 512 bits 1039 // for operations over smaller vector sizes on AVX512 targets. 
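  // On targets without AVX512VL there is no unsigned 64-bit vector compare, so the else
  // branch below biases both operands by 2^63 (vpaddq with a sign-bit vector) and then
  // uses the signed vpcmpgtq. Per lane this matches the scalar sketch (illustrative only):
  //
  //   bool src2_gt_src1 = (int64_t)(src2 + 0x8000000000000000ULL) >
  //                       (int64_t)(src1 + 0x8000000000000000ULL);
  //   dst = (opcode == Op_UMaxV) ? (src2_gt_src1 ? src2 : src1)
  //                              : (src2_gt_src1 ? src1 : src2);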
1040 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 1041 if (opcode == Op_UMaxV) { 1042 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 1043 } else { 1044 assert(opcode == Op_UMinV, "required"); 1045 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 1046 } 1047 } else { 1048 // T1 = -1 1049 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 1050 // T1 = -1 << 63 1051 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 1052 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 1053 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 1054 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 1055 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 1056 // Mask = T2 > T1 1057 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 1058 if (opcode == Op_UMaxV) { 1059 // Res = Mask ? Src2 : Src1 1060 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 1061 } else { 1062 // Res = Mask ? Src1 : Src2 1063 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 1064 } 1065 } 1066 } 1067 1068 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 1069 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1070 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 1071 if (opcode == Op_UMinV) { 1072 switch(elem_bt) { 1073 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 1074 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 1075 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 1076 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 1077 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1078 } 1079 } else { 1080 assert(opcode == Op_UMaxV, "required"); 1081 switch(elem_bt) { 1082 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 1083 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 1084 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 1085 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 1086 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 1087 } 1088 } 1089 } 1090 1091 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1092 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1093 int vlen_enc) { 1094 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1095 1096 if (opcode == Op_MinV) { 1097 if (elem_bt == T_BYTE) { 1098 vpminsb(dst, src1, src2, vlen_enc); 1099 } else if (elem_bt == T_SHORT) { 1100 vpminsw(dst, src1, src2, vlen_enc); 1101 } else if (elem_bt == T_INT) { 1102 vpminsd(dst, src1, src2, vlen_enc); 1103 } else { 1104 assert(elem_bt == T_LONG, "required"); 1105 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1106 vpminsq(dst, src1, src2, vlen_enc); 1107 } else { 1108 assert_different_registers(dst, src1, src2); 1109 vpcmpgtq(dst, src1, src2, vlen_enc); 1110 vblendvpd(dst, src1, src2, dst, vlen_enc); 1111 } 1112 } 1113 } else { // opcode == Op_MaxV 1114 if (elem_bt == T_BYTE) { 1115 vpmaxsb(dst, src1, src2, vlen_enc); 1116 } else if (elem_bt == T_SHORT) { 1117 vpmaxsw(dst, src1, src2, vlen_enc); 1118 } else if (elem_bt == T_INT) { 1119 vpmaxsd(dst, src1, src2, vlen_enc); 1120 } else { 1121 assert(elem_bt == T_LONG, "required"); 1122 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1123 vpmaxsq(dst, src1, src2, vlen_enc); 1124 } else { 1125 assert_different_registers(dst, src1, src2); 1126 vpcmpgtq(dst, src1, src2, vlen_enc); 1127 vblendvpd(dst, src2, src1, dst, vlen_enc); 1128 } 1129 } 1130 } 1131 } 1132 1133 // Float/Double min max 1134 1135 void C2_MacroAssembler::vminmax_fp(int opcode, 
                                    BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
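  // scratch now holds an all-ones lane mask wherever atmp compared unordered with itself,
  // i.e. wherever atmp is NaN; the final blend below keeps atmp (propagating the NaN) in
  // those lanes and the vminps/vmaxps result everywhere else, matching Java min/max semantics.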
(this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1221 } 1222 1223 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1224 XMMRegister dst, XMMRegister a, XMMRegister b, 1225 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1226 int vlen_enc) { 1227 assert(UseAVX > 2, "required"); 1228 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1229 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1230 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1231 assert_different_registers(dst, a, atmp, btmp); 1232 assert_different_registers(dst, b, atmp, btmp); 1233 1234 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1235 bool is_double_word = is_double_word_type(elem_bt); 1236 bool merge = true; 1237 1238 if (!is_double_word && is_min) { 1239 evpmovd2m(ktmp, a, vlen_enc); 1240 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1241 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1242 vminps(dst, atmp, btmp, vlen_enc); 1243 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1244 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1245 } else if (!is_double_word && !is_min) { 1246 evpmovd2m(ktmp, b, vlen_enc); 1247 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1248 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1249 vmaxps(dst, atmp, btmp, vlen_enc); 1250 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1251 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1252 } else if (is_double_word && is_min) { 1253 evpmovq2m(ktmp, a, vlen_enc); 1254 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1255 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1256 vminpd(dst, atmp, btmp, vlen_enc); 1257 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1258 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1259 } else { 1260 assert(is_double_word && !is_min, "sanity"); 1261 evpmovq2m(ktmp, b, vlen_enc); 1262 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1263 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1264 vmaxpd(dst, atmp, btmp, vlen_enc); 1265 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1266 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1267 } 1268 } 1269 1270 // Float/Double signum 1271 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1272 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1273 1274 Label DONE_LABEL; 1275 1276 if (opcode == Op_SignumF) { 1277 ucomiss(dst, zero); 1278 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1279 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1280 movflt(dst, one); 1281 jcc(Assembler::above, DONE_LABEL); 1282 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1283 } else if (opcode == Op_SignumD) { 1284 ucomisd(dst, zero); 1285 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1286 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1287 movdbl(dst, one); 1288 jcc(Assembler::above, DONE_LABEL); 1289 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1290 } 1291 1292 bind(DONE_LABEL); 1293 } 1294 1295 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1296 if (sign) { 1297 pmovsxbw(dst, src); 1298 } else { 1299 pmovzxbw(dst, src); 1300 } 1301 } 1302 1303 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister 
dst, XMMRegister src, int vector_len) { 1304 if (sign) { 1305 vpmovsxbw(dst, src, vector_len); 1306 } else { 1307 vpmovzxbw(dst, src, vector_len); 1308 } 1309 } 1310 1311 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1312 if (sign) { 1313 vpmovsxbd(dst, src, vector_len); 1314 } else { 1315 vpmovzxbd(dst, src, vector_len); 1316 } 1317 } 1318 1319 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1320 if (sign) { 1321 vpmovsxwd(dst, src, vector_len); 1322 } else { 1323 vpmovzxwd(dst, src, vector_len); 1324 } 1325 } 1326 1327 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1328 int shift, int vector_len) { 1329 if (opcode == Op_RotateLeftV) { 1330 if (etype == T_INT) { 1331 evprold(dst, src, shift, vector_len); 1332 } else { 1333 assert(etype == T_LONG, "expected type T_LONG"); 1334 evprolq(dst, src, shift, vector_len); 1335 } 1336 } else { 1337 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1338 if (etype == T_INT) { 1339 evprord(dst, src, shift, vector_len); 1340 } else { 1341 assert(etype == T_LONG, "expected type T_LONG"); 1342 evprorq(dst, src, shift, vector_len); 1343 } 1344 } 1345 } 1346 1347 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1348 XMMRegister shift, int vector_len) { 1349 if (opcode == Op_RotateLeftV) { 1350 if (etype == T_INT) { 1351 evprolvd(dst, src, shift, vector_len); 1352 } else { 1353 assert(etype == T_LONG, "expected type T_LONG"); 1354 evprolvq(dst, src, shift, vector_len); 1355 } 1356 } else { 1357 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1358 if (etype == T_INT) { 1359 evprorvd(dst, src, shift, vector_len); 1360 } else { 1361 assert(etype == T_LONG, "expected type T_LONG"); 1362 evprorvq(dst, src, shift, vector_len); 1363 } 1364 } 1365 } 1366 1367 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1368 if (opcode == Op_RShiftVI) { 1369 psrad(dst, shift); 1370 } else if (opcode == Op_LShiftVI) { 1371 pslld(dst, shift); 1372 } else { 1373 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1374 psrld(dst, shift); 1375 } 1376 } 1377 1378 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1379 switch (opcode) { 1380 case Op_RShiftVI: psrad(dst, shift); break; 1381 case Op_LShiftVI: pslld(dst, shift); break; 1382 case Op_URShiftVI: psrld(dst, shift); break; 1383 1384 default: assert(false, "%s", NodeClassNames[opcode]); 1385 } 1386 } 1387 1388 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1389 if (opcode == Op_RShiftVI) { 1390 vpsrad(dst, nds, shift, vector_len); 1391 } else if (opcode == Op_LShiftVI) { 1392 vpslld(dst, nds, shift, vector_len); 1393 } else { 1394 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1395 vpsrld(dst, nds, shift, vector_len); 1396 } 1397 } 1398 1399 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1400 switch (opcode) { 1401 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1402 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1403 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1404 1405 default: assert(false, "%s", NodeClassNames[opcode]); 1406 } 1407 } 1408 1409 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister 
shift) { 1410 switch (opcode) { 1411 case Op_RShiftVB: // fall-through 1412 case Op_RShiftVS: psraw(dst, shift); break; 1413 1414 case Op_LShiftVB: // fall-through 1415 case Op_LShiftVS: psllw(dst, shift); break; 1416 1417 case Op_URShiftVS: // fall-through 1418 case Op_URShiftVB: psrlw(dst, shift); break; 1419 1420 default: assert(false, "%s", NodeClassNames[opcode]); 1421 } 1422 } 1423 1424 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1425 switch (opcode) { 1426 case Op_RShiftVB: // fall-through 1427 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1428 1429 case Op_LShiftVB: // fall-through 1430 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1431 1432 case Op_URShiftVS: // fall-through 1433 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1434 1435 default: assert(false, "%s", NodeClassNames[opcode]); 1436 } 1437 } 1438 1439 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1440 switch (opcode) { 1441 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1442 case Op_LShiftVL: psllq(dst, shift); break; 1443 case Op_URShiftVL: psrlq(dst, shift); break; 1444 1445 default: assert(false, "%s", NodeClassNames[opcode]); 1446 } 1447 } 1448 1449 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1450 if (opcode == Op_RShiftVL) { 1451 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1452 } else if (opcode == Op_LShiftVL) { 1453 psllq(dst, shift); 1454 } else { 1455 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1456 psrlq(dst, shift); 1457 } 1458 } 1459 1460 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1461 switch (opcode) { 1462 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1463 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1464 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1465 1466 default: assert(false, "%s", NodeClassNames[opcode]); 1467 } 1468 } 1469 1470 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1471 if (opcode == Op_RShiftVL) { 1472 evpsraq(dst, nds, shift, vector_len); 1473 } else if (opcode == Op_LShiftVL) { 1474 vpsllq(dst, nds, shift, vector_len); 1475 } else { 1476 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1477 vpsrlq(dst, nds, shift, vector_len); 1478 } 1479 } 1480 1481 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1482 switch (opcode) { 1483 case Op_RShiftVB: // fall-through 1484 case Op_RShiftVS: // fall-through 1485 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1486 1487 case Op_LShiftVB: // fall-through 1488 case Op_LShiftVS: // fall-through 1489 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1490 1491 case Op_URShiftVB: // fall-through 1492 case Op_URShiftVS: // fall-through 1493 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1494 1495 default: assert(false, "%s", NodeClassNames[opcode]); 1496 } 1497 } 1498 1499 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1500 switch (opcode) { 1501 case Op_RShiftVB: // fall-through 1502 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1503 1504 case Op_LShiftVB: // fall-through 1505 case Op_LShiftVS: 
evpsllvw(dst, src, shift, vlen_enc); break; 1506 1507 case Op_URShiftVB: // fall-through 1508 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1509 1510 default: assert(false, "%s", NodeClassNames[opcode]); 1511 } 1512 } 1513 1514 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1515 assert(UseAVX >= 2, "required"); 1516 switch (opcode) { 1517 case Op_RShiftVL: { 1518 if (UseAVX > 2) { 1519 assert(tmp == xnoreg, "not used"); 1520 if (!VM_Version::supports_avx512vl()) { 1521 vlen_enc = Assembler::AVX_512bit; 1522 } 1523 evpsravq(dst, src, shift, vlen_enc); 1524 } else { 1525 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1526 vpsrlvq(dst, src, shift, vlen_enc); 1527 vpsrlvq(tmp, tmp, shift, vlen_enc); 1528 vpxor(dst, dst, tmp, vlen_enc); 1529 vpsubq(dst, dst, tmp, vlen_enc); 1530 } 1531 break; 1532 } 1533 case Op_LShiftVL: { 1534 assert(tmp == xnoreg, "not used"); 1535 vpsllvq(dst, src, shift, vlen_enc); 1536 break; 1537 } 1538 case Op_URShiftVL: { 1539 assert(tmp == xnoreg, "not used"); 1540 vpsrlvq(dst, src, shift, vlen_enc); 1541 break; 1542 } 1543 default: assert(false, "%s", NodeClassNames[opcode]); 1544 } 1545 } 1546 1547 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1548 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1549 assert(opcode == Op_LShiftVB || 1550 opcode == Op_RShiftVB || 1551 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1552 bool sign = (opcode != Op_URShiftVB); 1553 assert(vector_len == 0, "required"); 1554 vextendbd(sign, dst, src, 1); 1555 vpmovzxbd(vtmp, shift, 1); 1556 varshiftd(opcode, dst, dst, vtmp, 1); 1557 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1558 vextracti128_high(vtmp, dst); 1559 vpackusdw(dst, dst, vtmp, 0); 1560 } 1561 1562 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1563 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1564 assert(opcode == Op_LShiftVB || 1565 opcode == Op_RShiftVB || 1566 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1567 bool sign = (opcode != Op_URShiftVB); 1568 int ext_vector_len = vector_len + 1; 1569 vextendbw(sign, dst, src, ext_vector_len); 1570 vpmovzxbw(vtmp, shift, ext_vector_len); 1571 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1572 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1573 if (vector_len == 0) { 1574 vextracti128_high(vtmp, dst); 1575 vpackuswb(dst, dst, vtmp, vector_len); 1576 } else { 1577 vextracti64x4_high(vtmp, dst); 1578 vpackuswb(dst, dst, vtmp, vector_len); 1579 vpermq(dst, dst, 0xD8, vector_len); 1580 } 1581 } 1582 1583 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1584 switch(typ) { 1585 case T_BYTE: 1586 pinsrb(dst, val, idx); 1587 break; 1588 case T_SHORT: 1589 pinsrw(dst, val, idx); 1590 break; 1591 case T_INT: 1592 pinsrd(dst, val, idx); 1593 break; 1594 case T_LONG: 1595 pinsrq(dst, val, idx); 1596 break; 1597 default: 1598 assert(false,"Should not reach here."); 1599 break; 1600 } 1601 } 1602 1603 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1604 switch(typ) { 1605 case T_BYTE: 1606 
vpinsrb(dst, src, val, idx); 1607 break; 1608 case T_SHORT: 1609 vpinsrw(dst, src, val, idx); 1610 break; 1611 case T_INT: 1612 vpinsrd(dst, src, val, idx); 1613 break; 1614 case T_LONG: 1615 vpinsrq(dst, src, val, idx); 1616 break; 1617 default: 1618 assert(false,"Should not reach here."); 1619 break; 1620 } 1621 } 1622 1623 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1624 XMMRegister dst, Register base, 1625 Register idx_base, 1626 Register offset, Register mask, 1627 Register mask_idx, Register rtmp, 1628 int vlen_enc) { 1629 vpxor(dst, dst, dst, vlen_enc); 1630 if (elem_bt == T_SHORT) { 1631 for (int i = 0; i < 4; i++) { 1632 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1633 Label skip_load; 1634 btq(mask, mask_idx); 1635 jccb(Assembler::carryClear, skip_load); 1636 movl(rtmp, Address(idx_base, i * 4)); 1637 if (offset != noreg) { 1638 addl(rtmp, offset); 1639 } 1640 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1641 bind(skip_load); 1642 incq(mask_idx); 1643 } 1644 } else { 1645 assert(elem_bt == T_BYTE, ""); 1646 for (int i = 0; i < 8; i++) { 1647 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1648 Label skip_load; 1649 btq(mask, mask_idx); 1650 jccb(Assembler::carryClear, skip_load); 1651 movl(rtmp, Address(idx_base, i * 4)); 1652 if (offset != noreg) { 1653 addl(rtmp, offset); 1654 } 1655 pinsrb(dst, Address(base, rtmp), i); 1656 bind(skip_load); 1657 incq(mask_idx); 1658 } 1659 } 1660 } 1661 1662 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1663 Register base, Register idx_base, 1664 Register offset, Register rtmp, 1665 int vlen_enc) { 1666 vpxor(dst, dst, dst, vlen_enc); 1667 if (elem_bt == T_SHORT) { 1668 for (int i = 0; i < 4; i++) { 1669 // dst[i] = src[offset + idx_base[i]] 1670 movl(rtmp, Address(idx_base, i * 4)); 1671 if (offset != noreg) { 1672 addl(rtmp, offset); 1673 } 1674 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1675 } 1676 } else { 1677 assert(elem_bt == T_BYTE, ""); 1678 for (int i = 0; i < 8; i++) { 1679 // dst[i] = src[offset + idx_base[i]] 1680 movl(rtmp, Address(idx_base, i * 4)); 1681 if (offset != noreg) { 1682 addl(rtmp, offset); 1683 } 1684 pinsrb(dst, Address(base, rtmp), i); 1685 } 1686 } 1687 } 1688 1689 /* 1690 * Gather using hybrid algorithm, first partially unroll scalar loop 1691 * to accumulate values from gather indices into a quad-word(64bit) slice. 1692 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1693 * permutation to place the slice into appropriate vector lane 1694 * locations in destination vector. Following pseudo code describes the 1695 * algorithm in detail: 1696 * 1697 * DST_VEC = ZERO_VEC 1698 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1699 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1700 * FOREACH_ITER: 1701 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1702 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1703 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1704 * PERM_INDEX = PERM_INDEX - TWO_VEC 1705 * 1706 * With each iteration, doubleword permute indices (0,1) corresponding 1707 * to gathered quadword gets right shifted by two lane positions. 
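 *
 * Rough worked example (an illustration, not taken from the code itself):
 * assume T_SHORT elements gathered four at a time into a 64-bit slice. On
 * the first iteration PERM_INDEX is {0, 1, 2, 3, ...}, so the slice stays in
 * doublewords 0-1 of the permuted result. After subtracting TWO_VEC the
 * indices become {-2, -1, 0, 1, ...}; on the next iteration the freshly
 * gathered slice (again sitting in doublewords 0-1 of TMP_VEC_64) is routed
 * to doublewords 2-3, while the wrapped-around indices select lanes that the
 * 8-byte gather left zeroed, so the OR accumulation leaves previously placed
 * slices intact.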
1708 * 1709 */ 1710 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1711 Register base, Register idx_base, 1712 Register offset, Register mask, 1713 XMMRegister xtmp1, XMMRegister xtmp2, 1714 XMMRegister temp_dst, Register rtmp, 1715 Register mask_idx, Register length, 1716 int vector_len, int vlen_enc) { 1717 Label GATHER8_LOOP; 1718 assert(is_subword_type(elem_ty), ""); 1719 movl(length, vector_len); 1720 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1721 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1722 vallones(xtmp2, vlen_enc); 1723 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1724 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1725 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1726 1727 bind(GATHER8_LOOP); 1728 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1729 if (mask == noreg) { 1730 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1731 } else { 1732 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1733 } 1734 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1735 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1736 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1737 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1738 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1739 vpor(dst, dst, temp_dst, vlen_enc); 1740 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1741 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1742 jcc(Assembler::notEqual, GATHER8_LOOP); 1743 } 1744 1745 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1746 switch(typ) { 1747 case T_INT: 1748 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1749 break; 1750 case T_FLOAT: 1751 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1752 break; 1753 case T_LONG: 1754 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1755 break; 1756 case T_DOUBLE: 1757 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1758 break; 1759 default: 1760 assert(false,"Should not reach here."); 1761 break; 1762 } 1763 } 1764 1765 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1766 switch(typ) { 1767 case T_INT: 1768 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1769 break; 1770 case T_FLOAT: 1771 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1772 break; 1773 case T_LONG: 1774 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1775 break; 1776 case T_DOUBLE: 1777 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1778 break; 1779 default: 1780 assert(false,"Should not reach here."); 1781 break; 1782 } 1783 } 1784 1785 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1786 switch(typ) { 1787 case T_INT: 1788 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1789 break; 1790 case T_FLOAT: 1791 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1792 break; 1793 case T_LONG: 1794 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1795 break; 1796 case T_DOUBLE: 1797 
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1798 break; 1799 default: 1800 assert(false,"Should not reach here."); 1801 break; 1802 } 1803 } 1804 1805 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1806 if (vlen_in_bytes <= 16) { 1807 pxor (dst, dst); 1808 psubb(dst, src); 1809 switch (elem_bt) { 1810 case T_BYTE: /* nothing to do */ break; 1811 case T_SHORT: pmovsxbw(dst, dst); break; 1812 case T_INT: pmovsxbd(dst, dst); break; 1813 case T_FLOAT: pmovsxbd(dst, dst); break; 1814 case T_LONG: pmovsxbq(dst, dst); break; 1815 case T_DOUBLE: pmovsxbq(dst, dst); break; 1816 1817 default: assert(false, "%s", type2name(elem_bt)); 1818 } 1819 } else { 1820 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1821 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1822 1823 vpxor (dst, dst, dst, vlen_enc); 1824 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1825 1826 switch (elem_bt) { 1827 case T_BYTE: /* nothing to do */ break; 1828 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1829 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1830 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1831 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1832 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1833 1834 default: assert(false, "%s", type2name(elem_bt)); 1835 } 1836 } 1837 } 1838 1839 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1840 if (novlbwdq) { 1841 vpmovsxbd(xtmp, src, vlen_enc); 1842 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1843 Assembler::eq, true, vlen_enc, noreg); 1844 } else { 1845 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1846 vpsubb(xtmp, xtmp, src, vlen_enc); 1847 evpmovb2m(dst, xtmp, vlen_enc); 1848 } 1849 } 1850 1851 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1852 if (is_integral_type(bt)) { 1853 switch (vlen_in_bytes) { 1854 case 4: movdl(dst, src); break; 1855 case 8: movq(dst, src); break; 1856 case 16: movdqu(dst, src); break; 1857 case 32: vmovdqu(dst, src); break; 1858 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1859 default: ShouldNotReachHere(); 1860 } 1861 } else { 1862 switch (vlen_in_bytes) { 1863 case 4: movflt(dst, src); break; 1864 case 8: movdbl(dst, src); break; 1865 case 16: movups(dst, src); break; 1866 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1867 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1868 default: ShouldNotReachHere(); 1869 } 1870 } 1871 } 1872 1873 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1874 assert(rscratch != noreg || always_reachable(src), "missing"); 1875 1876 if (reachable(src)) { 1877 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1878 } else { 1879 lea(rscratch, src); 1880 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1881 } 1882 } 1883 1884 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1885 int vlen_enc = vector_length_encoding(vlen); 1886 if (VM_Version::supports_avx()) { 1887 if (bt == T_LONG) { 1888 if (VM_Version::supports_avx2()) { 1889 vpbroadcastq(dst, src, vlen_enc); 1890 } else { 1891 vmovddup(dst, src, vlen_enc); 1892 } 1893 } else if (bt == T_DOUBLE) { 1894 if (vlen_enc != 
Assembler::AVX_128bit) { 1895 vbroadcastsd(dst, src, vlen_enc, noreg); 1896 } else { 1897 vmovddup(dst, src, vlen_enc); 1898 } 1899 } else { 1900 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1901 vpbroadcastd(dst, src, vlen_enc); 1902 } else { 1903 vbroadcastss(dst, src, vlen_enc); 1904 } 1905 } 1906 } else if (VM_Version::supports_sse3()) { 1907 movddup(dst, src); 1908 } else { 1909 load_vector(bt, dst, src, vlen); 1910 } 1911 } 1912 1913 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1914 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1915 int offset = exact_log2(type2aelembytes(bt)) << 6; 1916 if (is_floating_point_type(bt)) { 1917 offset += 128; 1918 } 1919 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1920 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1921 } 1922 1923 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1924 1925 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1926 int vector_len = Assembler::AVX_128bit; 1927 1928 switch (opcode) { 1929 case Op_AndReductionV: pand(dst, src); break; 1930 case Op_OrReductionV: por (dst, src); break; 1931 case Op_XorReductionV: pxor(dst, src); break; 1932 case Op_MinReductionV: 1933 switch (typ) { 1934 case T_BYTE: pminsb(dst, src); break; 1935 case T_SHORT: pminsw(dst, src); break; 1936 case T_INT: pminsd(dst, src); break; 1937 case T_LONG: assert(UseAVX > 2, "required"); 1938 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1939 default: assert(false, "wrong type"); 1940 } 1941 break; 1942 case Op_MaxReductionV: 1943 switch (typ) { 1944 case T_BYTE: pmaxsb(dst, src); break; 1945 case T_SHORT: pmaxsw(dst, src); break; 1946 case T_INT: pmaxsd(dst, src); break; 1947 case T_LONG: assert(UseAVX > 2, "required"); 1948 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1949 default: assert(false, "wrong type"); 1950 } 1951 break; 1952 case Op_AddReductionVF: addss(dst, src); break; 1953 case Op_AddReductionVD: addsd(dst, src); break; 1954 case Op_AddReductionVI: 1955 switch (typ) { 1956 case T_BYTE: paddb(dst, src); break; 1957 case T_SHORT: paddw(dst, src); break; 1958 case T_INT: paddd(dst, src); break; 1959 default: assert(false, "wrong type"); 1960 } 1961 break; 1962 case Op_AddReductionVL: paddq(dst, src); break; 1963 case Op_MulReductionVF: mulss(dst, src); break; 1964 case Op_MulReductionVD: mulsd(dst, src); break; 1965 case Op_MulReductionVI: 1966 switch (typ) { 1967 case T_SHORT: pmullw(dst, src); break; 1968 case T_INT: pmulld(dst, src); break; 1969 default: assert(false, "wrong type"); 1970 } 1971 break; 1972 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1973 evpmullq(dst, dst, src, vector_len); break; 1974 default: assert(false, "wrong opcode"); 1975 } 1976 } 1977 1978 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1979 switch (opcode) { 1980 case Op_AddReductionVF: addps(dst, src); break; 1981 case Op_AddReductionVD: addpd(dst, src); break; 1982 case Op_MulReductionVF: mulps(dst, src); break; 1983 case Op_MulReductionVD: mulpd(dst, src); break; 1984 default: assert(false, "%s", NodeClassNames[opcode]); 1985 } 1986 } 1987 1988 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1989 int vector_len = Assembler::AVX_256bit; 1990 1991 switch (opcode) { 1992 
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1993 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1994 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1995 case Op_MinReductionV: 1996 switch (typ) { 1997 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1998 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1999 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2000 case T_LONG: assert(UseAVX > 2, "required"); 2001 vpminsq(dst, src1, src2, vector_len); break; 2002 default: assert(false, "wrong type"); 2003 } 2004 break; 2005 case Op_MaxReductionV: 2006 switch (typ) { 2007 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2008 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2009 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2010 case T_LONG: assert(UseAVX > 2, "required"); 2011 vpmaxsq(dst, src1, src2, vector_len); break; 2012 default: assert(false, "wrong type"); 2013 } 2014 break; 2015 case Op_AddReductionVI: 2016 switch (typ) { 2017 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2018 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2019 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2020 default: assert(false, "wrong type"); 2021 } 2022 break; 2023 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2024 case Op_MulReductionVI: 2025 switch (typ) { 2026 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2027 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2028 default: assert(false, "wrong type"); 2029 } 2030 break; 2031 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2032 default: assert(false, "wrong opcode"); 2033 } 2034 } 2035 2036 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2037 int vector_len = Assembler::AVX_256bit; 2038 2039 switch (opcode) { 2040 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 2041 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 2042 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 2043 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 2044 default: assert(false, "%s", NodeClassNames[opcode]); 2045 } 2046 } 2047 2048 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2049 XMMRegister dst, XMMRegister src, 2050 XMMRegister vtmp1, XMMRegister vtmp2) { 2051 switch (opcode) { 2052 case Op_AddReductionVF: 2053 case Op_MulReductionVF: 2054 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2055 break; 2056 2057 case Op_AddReductionVD: 2058 case Op_MulReductionVD: 2059 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2060 break; 2061 2062 default: assert(false, "wrong opcode"); 2063 } 2064 } 2065 2066 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2067 XMMRegister dst, XMMRegister src, 2068 XMMRegister vtmp1, XMMRegister vtmp2) { 2069 switch (opcode) { 2070 case Op_AddReductionVF: 2071 case Op_MulReductionVF: 2072 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2073 break; 2074 2075 case Op_AddReductionVD: 2076 case Op_MulReductionVD: 2077 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2078 break; 2079 2080 default: assert(false, "%s", NodeClassNames[opcode]); 2081 } 2082 } 2083 2084 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2085 Register dst, Register src1, XMMRegister src2, 2086 XMMRegister vtmp1, XMMRegister vtmp2) { 2087 switch (vlen) { 2088 case 8: reduce8B 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2089 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2090 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2091 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2092 2093 default: assert(false, "wrong vector length"); 2094 } 2095 } 2096 2097 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2098 Register dst, Register src1, XMMRegister src2, 2099 XMMRegister vtmp1, XMMRegister vtmp2) { 2100 switch (vlen) { 2101 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2102 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2103 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2104 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2105 2106 default: assert(false, "wrong vector length"); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2111 Register dst, Register src1, XMMRegister src2, 2112 XMMRegister vtmp1, XMMRegister vtmp2) { 2113 switch (vlen) { 2114 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2115 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2116 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2117 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2118 2119 default: assert(false, "wrong vector length"); 2120 } 2121 } 2122 2123 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2124 Register dst, Register src1, XMMRegister src2, 2125 XMMRegister vtmp1, XMMRegister vtmp2) { 2126 switch (vlen) { 2127 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2128 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2129 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2130 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2131 2132 default: assert(false, "wrong vector length"); 2133 } 2134 } 2135 2136 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2137 Register dst, Register src1, XMMRegister src2, 2138 XMMRegister vtmp1, XMMRegister vtmp2) { 2139 switch (vlen) { 2140 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2141 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2142 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2143 2144 default: assert(false, "wrong vector length"); 2145 } 2146 } 2147 2148 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2149 switch (vlen) { 2150 case 2: 2151 assert(vtmp2 == xnoreg, ""); 2152 reduce2F(opcode, dst, src, vtmp1); 2153 break; 2154 case 4: 2155 assert(vtmp2 == xnoreg, ""); 2156 reduce4F(opcode, dst, src, vtmp1); 2157 break; 2158 case 8: 2159 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2160 break; 2161 case 16: 2162 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2163 break; 2164 default: assert(false, "wrong vector length"); 2165 } 2166 } 2167 2168 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2169 switch (vlen) { 2170 case 2: 2171 assert(vtmp2 == xnoreg, ""); 2172 reduce2D(opcode, dst, src, vtmp1); 2173 break; 2174 case 4: 2175 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2176 break; 2177 case 8: 2178 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2179 break; 2180 default: assert(false, "wrong vector length"); 2181 } 2182 } 2183 2184 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2185 switch (vlen) { 2186 case 2: 2187 assert(vtmp1 == xnoreg, ""); 2188 assert(vtmp2 == xnoreg, ""); 2189 unorderedReduce2F(opcode, dst, src); 2190 break; 2191 case 4: 2192 assert(vtmp2 == xnoreg, ""); 2193 unorderedReduce4F(opcode, dst, src, vtmp1); 2194 break; 2195 case 8: 2196 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2197 break; 2198 case 16: 2199 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2200 break; 2201 default: assert(false, "wrong vector length"); 2202 } 2203 } 2204 2205 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2206 switch (vlen) { 2207 case 2: 2208 assert(vtmp1 == xnoreg, ""); 2209 assert(vtmp2 == xnoreg, ""); 2210 unorderedReduce2D(opcode, dst, src); 2211 break; 2212 case 4: 2213 assert(vtmp2 == xnoreg, ""); 2214 unorderedReduce4D(opcode, dst, src, vtmp1); 2215 break; 2216 case 8: 2217 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2218 break; 2219 default: assert(false, "wrong vector length"); 2220 } 2221 } 2222 2223 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2224 if (opcode == Op_AddReductionVI) { 2225 if (vtmp1 != src2) { 2226 movdqu(vtmp1, src2); 2227 } 2228 phaddd(vtmp1, vtmp1); 2229 } else { 2230 pshufd(vtmp1, src2, 0x1); 2231 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2232 } 2233 movdl(vtmp2, src1); 2234 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2235 movdl(dst, vtmp1); 2236 } 2237 2238 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2239 if (opcode == Op_AddReductionVI) { 2240 if (vtmp1 != src2) { 2241 movdqu(vtmp1, src2); 2242 } 2243 phaddd(vtmp1, src2); 2244 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2245 } else { 2246 pshufd(vtmp2, src2, 0xE); 2247 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2248 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2249 } 2250 } 2251 2252 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2253 if (opcode == Op_AddReductionVI) { 2254 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2255 vextracti128_high(vtmp2, vtmp1); 2256 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2257 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2258 } else { 2259 vextracti128_high(vtmp1, src2); 2260 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2261 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2262 } 2263 } 2264 2265 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2266 vextracti64x4_high(vtmp2, src2); 2267 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2268 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2269 } 2270 2271 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2272 pshufd(vtmp2, src2, 0x1); 2273 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2274 movdqu(vtmp1, vtmp2); 2275 psrldq(vtmp1, 2); 2276 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2277 movdqu(vtmp2, vtmp1); 2278 psrldq(vtmp2, 1); 2279 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2280 movdl(vtmp2, src1); 2281 pmovsxbd(vtmp1, vtmp1); 2282 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2283 pextrb(dst, vtmp1, 0x0); 2284 movsbl(dst, dst); 2285 
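  // The ladder above halves the number of live bytes each step (8 -> 4 -> 2 -> 1),
  // then folds in the scalar accumulator src1 with a 32-bit op and sign-extends
  // the resulting byte into dst.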
}

void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}

void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
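    // 0xE selects doublewords {2,3} of src2 into the low half, so the 128-bit
    // op that follows folds the upper four shorts onto the lower four.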
reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2373 } 2374 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2375 } 2376 2377 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2378 if (opcode == Op_AddReductionVI) { 2379 int vector_len = Assembler::AVX_256bit; 2380 vphaddw(vtmp2, src2, src2, vector_len); 2381 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2382 } else { 2383 vextracti128_high(vtmp2, src2); 2384 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2385 } 2386 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2387 } 2388 2389 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2390 int vector_len = Assembler::AVX_256bit; 2391 vextracti64x4_high(vtmp1, src2); 2392 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2393 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2394 } 2395 2396 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2397 pshufd(vtmp2, src2, 0xE); 2398 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2399 movdq(vtmp1, src1); 2400 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2401 movdq(dst, vtmp1); 2402 } 2403 2404 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2405 vextracti128_high(vtmp1, src2); 2406 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2407 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2408 } 2409 2410 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2411 vextracti64x4_high(vtmp2, src2); 2412 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2413 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2414 } 2415 2416 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2417 mov64(temp, -1L); 2418 bzhiq(temp, temp, len); 2419 kmovql(dst, temp); 2420 } 2421 2422 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2423 reduce_operation_128(T_FLOAT, opcode, dst, src); 2424 pshufd(vtmp, src, 0x1); 2425 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2426 } 2427 2428 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2429 reduce2F(opcode, dst, src, vtmp); 2430 pshufd(vtmp, src, 0x2); 2431 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2432 pshufd(vtmp, src, 0x3); 2433 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2434 } 2435 2436 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2437 reduce4F(opcode, dst, src, vtmp2); 2438 vextractf128_high(vtmp2, src); 2439 reduce4F(opcode, dst, vtmp2, vtmp1); 2440 } 2441 2442 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2443 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2444 vextracti64x4_high(vtmp1, src); 2445 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2446 } 2447 2448 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2449 pshufd(dst, src, 0x1); 2450 reduce_operation_128(T_FLOAT, opcode, dst, src); 2451 } 2452 2453 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2454 pshufd(vtmp, src, 0xE); 2455 
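  // 0xE brings floats {2,3} of src down into lanes {0,1}; the packed op below
  // then combines them with floats {0,1}. Unordered reductions are free to
  // reassociate like this, unlike the strictly ordered reduceF/reduceD paths,
  // which step through the lanes with scalar addss/addsd operations.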
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2456 unorderedReduce2F(opcode, dst, vtmp); 2457 } 2458 2459 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2460 vextractf128_high(vtmp1, src); 2461 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2462 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2463 } 2464 2465 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2466 vextractf64x4_high(vtmp2, src); 2467 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2468 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2469 } 2470 2471 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2472 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2473 pshufd(vtmp, src, 0xE); 2474 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2475 } 2476 2477 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2478 reduce2D(opcode, dst, src, vtmp2); 2479 vextractf128_high(vtmp2, src); 2480 reduce2D(opcode, dst, vtmp2, vtmp1); 2481 } 2482 2483 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2484 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2485 vextracti64x4_high(vtmp1, src); 2486 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2487 } 2488 2489 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2490 pshufd(dst, src, 0xE); 2491 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2492 } 2493 2494 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2495 vextractf128_high(vtmp, src); 2496 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2497 unorderedReduce2D(opcode, dst, vtmp); 2498 } 2499 2500 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2501 vextractf64x4_high(vtmp2, src); 2502 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2503 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2504 } 2505 2506 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2507 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2508 } 2509 2510 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2511 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2512 } 2513 2514 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2515 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2516 } 2517 2518 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2519 int vec_enc) { 2520 switch(elem_bt) { 2521 case T_INT: 2522 case T_FLOAT: 2523 vmaskmovps(dst, src, mask, vec_enc); 2524 break; 2525 case T_LONG: 2526 case T_DOUBLE: 2527 vmaskmovpd(dst, src, mask, vec_enc); 2528 break; 2529 default: 2530 fatal("Unsupported type %s", type2name(elem_bt)); 2531 break; 2532 } 2533 } 2534 2535 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2536 int vec_enc) { 2537 switch(elem_bt) { 2538 case T_INT: 2539 case T_FLOAT: 2540 
vmaskmovps(dst, src, mask, vec_enc); 2541 break; 2542 case T_LONG: 2543 case T_DOUBLE: 2544 vmaskmovpd(dst, src, mask, vec_enc); 2545 break; 2546 default: 2547 fatal("Unsupported type %s", type2name(elem_bt)); 2548 break; 2549 } 2550 } 2551 2552 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2553 XMMRegister dst, XMMRegister src, 2554 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2555 XMMRegister xmm_0, XMMRegister xmm_1) { 2556 const int permconst[] = {1, 14}; 2557 XMMRegister wsrc = src; 2558 XMMRegister wdst = xmm_0; 2559 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2560 2561 int vlen_enc = Assembler::AVX_128bit; 2562 if (vlen == 16) { 2563 vlen_enc = Assembler::AVX_256bit; 2564 } 2565 2566 for (int i = log2(vlen) - 1; i >=0; i--) { 2567 if (i == 0 && !is_dst_valid) { 2568 wdst = dst; 2569 } 2570 if (i == 3) { 2571 vextracti64x4_high(wtmp, wsrc); 2572 } else if (i == 2) { 2573 vextracti128_high(wtmp, wsrc); 2574 } else { // i = [0,1] 2575 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2576 } 2577 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2578 wsrc = wdst; 2579 vlen_enc = Assembler::AVX_128bit; 2580 } 2581 if (is_dst_valid) { 2582 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2583 } 2584 } 2585 2586 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2587 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2588 XMMRegister xmm_0, XMMRegister xmm_1) { 2589 XMMRegister wsrc = src; 2590 XMMRegister wdst = xmm_0; 2591 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2592 int vlen_enc = Assembler::AVX_128bit; 2593 if (vlen == 8) { 2594 vlen_enc = Assembler::AVX_256bit; 2595 } 2596 for (int i = log2(vlen) - 1; i >=0; i--) { 2597 if (i == 0 && !is_dst_valid) { 2598 wdst = dst; 2599 } 2600 if (i == 1) { 2601 vextracti128_high(wtmp, wsrc); 2602 } else if (i == 2) { 2603 vextracti64x4_high(wtmp, wsrc); 2604 } else { 2605 assert(i == 0, "%d", i); 2606 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2607 } 2608 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2609 wsrc = wdst; 2610 vlen_enc = Assembler::AVX_128bit; 2611 } 2612 if (is_dst_valid) { 2613 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2614 } 2615 } 2616 2617 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2618 switch (bt) { 2619 case T_BYTE: pextrb(dst, src, idx); break; 2620 case T_SHORT: pextrw(dst, src, idx); break; 2621 case T_INT: pextrd(dst, src, idx); break; 2622 case T_LONG: pextrq(dst, src, idx); break; 2623 2624 default: 2625 assert(false,"Should not reach here."); 2626 break; 2627 } 2628 } 2629 2630 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2631 int esize = type2aelembytes(typ); 2632 int elem_per_lane = 16/esize; 2633 int lane = elemindex / elem_per_lane; 2634 int eindex = elemindex % elem_per_lane; 2635 2636 if (lane >= 2) { 2637 assert(UseAVX > 2, "required"); 2638 vextractf32x4(dst, src, lane & 3); 2639 return dst; 2640 } else if (lane > 0) { 2641 assert(UseAVX > 0, "required"); 2642 vextractf128(dst, src, lane); 2643 return dst; 2644 } else { 2645 return src; 2646 } 2647 } 2648 2649 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2650 if (typ == T_BYTE) { 2651 movsbl(dst, dst); 2652 } else if (typ == T_SHORT) { 2653 movswl(dst, dst); 2654 } 2655 } 2656 2657 void 
C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2658 int esize = type2aelembytes(typ); 2659 int elem_per_lane = 16/esize; 2660 int eindex = elemindex % elem_per_lane; 2661 assert(is_integral_type(typ),"required"); 2662 2663 if (eindex == 0) { 2664 if (typ == T_LONG) { 2665 movq(dst, src); 2666 } else { 2667 movdl(dst, src); 2668 movsxl(typ, dst); 2669 } 2670 } else { 2671 extract(typ, dst, src, eindex); 2672 movsxl(typ, dst); 2673 } 2674 } 2675 2676 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2677 int esize = type2aelembytes(typ); 2678 int elem_per_lane = 16/esize; 2679 int eindex = elemindex % elem_per_lane; 2680 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2681 2682 if (eindex == 0) { 2683 movq(dst, src); 2684 } else { 2685 if (typ == T_FLOAT) { 2686 if (UseAVX == 0) { 2687 movdqu(dst, src); 2688 shufps(dst, dst, eindex); 2689 } else { 2690 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2691 } 2692 } else { 2693 if (UseAVX == 0) { 2694 movdqu(dst, src); 2695 psrldq(dst, eindex*esize); 2696 } else { 2697 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2698 } 2699 movq(dst, dst); 2700 } 2701 } 2702 // Zero upper bits 2703 if (typ == T_FLOAT) { 2704 if (UseAVX == 0) { 2705 assert(vtmp != xnoreg, "required."); 2706 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2707 pand(dst, vtmp); 2708 } else { 2709 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2710 } 2711 } 2712 } 2713 2714 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2715 switch(typ) { 2716 case T_BYTE: 2717 case T_BOOLEAN: 2718 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2719 break; 2720 case T_SHORT: 2721 case T_CHAR: 2722 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2723 break; 2724 case T_INT: 2725 case T_FLOAT: 2726 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2727 break; 2728 case T_LONG: 2729 case T_DOUBLE: 2730 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2731 break; 2732 default: 2733 assert(false,"Should not reach here."); 2734 break; 2735 } 2736 } 2737 2738 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2739 assert(rscratch != noreg || always_reachable(src2), "missing"); 2740 2741 switch(typ) { 2742 case T_BOOLEAN: 2743 case T_BYTE: 2744 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2745 break; 2746 case T_CHAR: 2747 case T_SHORT: 2748 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2749 break; 2750 case T_INT: 2751 case T_FLOAT: 2752 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2753 break; 2754 case T_LONG: 2755 case T_DOUBLE: 2756 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2757 break; 2758 default: 2759 assert(false,"Should not reach here."); 2760 break; 2761 } 2762 } 2763 2764 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2765 switch(typ) { 2766 case T_BYTE: 2767 
evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2768 break; 2769 case T_SHORT: 2770 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2771 break; 2772 case T_INT: 2773 case T_FLOAT: 2774 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2775 break; 2776 case T_LONG: 2777 case T_DOUBLE: 2778 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2779 break; 2780 default: 2781 assert(false,"Should not reach here."); 2782 break; 2783 } 2784 } 2785 2786 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2787 assert(vlen_in_bytes <= 32, ""); 2788 int esize = type2aelembytes(bt); 2789 if (vlen_in_bytes == 32) { 2790 assert(vtmp == xnoreg, "required."); 2791 if (esize >= 4) { 2792 vtestps(src1, src2, AVX_256bit); 2793 } else { 2794 vptest(src1, src2, AVX_256bit); 2795 } 2796 return; 2797 } 2798 if (vlen_in_bytes < 16) { 2799 // Duplicate the lower part to fill the whole register, 2800 // Don't need to do so for src2 2801 assert(vtmp != xnoreg, "required"); 2802 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2803 pshufd(vtmp, src1, shuffle_imm); 2804 } else { 2805 assert(vtmp == xnoreg, "required"); 2806 vtmp = src1; 2807 } 2808 if (esize >= 4 && VM_Version::supports_avx()) { 2809 vtestps(vtmp, src2, AVX_128bit); 2810 } else { 2811 ptest(vtmp, src2); 2812 } 2813 } 2814 2815 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2816 #ifdef ASSERT 2817 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2818 bool is_bw_supported = VM_Version::supports_avx512bw(); 2819 if (is_bw && !is_bw_supported) { 2820 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2821 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2822 "XMM register should be 0-15"); 2823 } 2824 #endif // ASSERT 2825 switch (elem_bt) { 2826 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2827 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2828 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2829 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2830 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2831 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2832 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2833 } 2834 } 2835 2836 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2837 assert(UseAVX >= 2, "required"); 2838 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2839 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2840 if ((UseAVX > 2) && 2841 (!is_bw || VM_Version::supports_avx512bw()) && 2842 (!is_vl || VM_Version::supports_avx512vl())) { 2843 switch (elem_bt) { 2844 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2845 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2846 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2847 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2848 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2849 } 2850 } else { 2851 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2852 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2853 switch (elem_bt) { 2854 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2855 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2856 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2857 
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2858 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2859 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2860 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2861 } 2862 } 2863 } 2864 2865 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2866 switch (to_elem_bt) { 2867 case T_SHORT: 2868 vpmovsxbw(dst, src, vlen_enc); 2869 break; 2870 case T_INT: 2871 vpmovsxbd(dst, src, vlen_enc); 2872 break; 2873 case T_FLOAT: 2874 vpmovsxbd(dst, src, vlen_enc); 2875 vcvtdq2ps(dst, dst, vlen_enc); 2876 break; 2877 case T_LONG: 2878 vpmovsxbq(dst, src, vlen_enc); 2879 break; 2880 case T_DOUBLE: { 2881 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2882 vpmovsxbd(dst, src, mid_vlen_enc); 2883 vcvtdq2pd(dst, dst, vlen_enc); 2884 break; 2885 } 2886 default: 2887 fatal("Unsupported type %s", type2name(to_elem_bt)); 2888 break; 2889 } 2890 } 2891 2892 //------------------------------------------------------------------------------------------- 2893 2894 // IndexOf for constant substrings with size >= 8 chars 2895 // which don't need to be loaded through stack. 2896 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2897 Register cnt1, Register cnt2, 2898 int int_cnt2, Register result, 2899 XMMRegister vec, Register tmp, 2900 int ae) { 2901 ShortBranchVerifier sbv(this); 2902 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2903 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2904 2905 // This method uses the pcmpestri instruction with bound registers 2906 // inputs: 2907 // xmm - substring 2908 // rax - substring length (elements count) 2909 // mem - scanned string 2910 // rdx - string length (elements count) 2911 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2912 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2913 // outputs: 2914 // rcx - matched index in string 2915 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2916 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2917 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2918 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2919 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2920 2921 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2922 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2923 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2924 2925 // Note, inline_string_indexOf() generates checks: 2926 // if (substr.count > string.count) return -1; 2927 // if (substr.count == 0) return 0; 2928 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2929 2930 // Load substring. 
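  // For the UL case the Latin-1 substring bytes are zero-extended to 16-bit
  // chars so pcmpestri can compare them against the UTF-16 string; in the
  // other cases the first 16 bytes of the substring are loaded as-is.
  // Illustrative example: with ae == UU and int_cnt2 == 8, the movdqu below
  // picks up exactly the eight 16-bit chars of the pattern.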
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
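    // Below, cnt2 is turned into a negative offset from the end of the
    // substring; every SCAN_SUBSTR iteration compares one more 16-byte window
    // and steps cnt2 up by 'stride' until it becomes non-negative, which means
    // the whole substring has been compared.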
3021 negptr(cnt2); 3022 addptr(cnt2, stride); 3023 3024 bind(SCAN_SUBSTR); 3025 subl(cnt1, stride); 3026 cmpl(cnt2, -stride); // Do not read beyond substring 3027 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 3028 // Back-up strings to avoid reading beyond substring: 3029 // cnt1 = cnt1 - cnt2 + 8 3030 addl(cnt1, cnt2); // cnt2 is negative 3031 addl(cnt1, stride); 3032 movl(cnt2, stride); negptr(cnt2); 3033 bind(CONT_SCAN_SUBSTR); 3034 if (int_cnt2 < (int)G) { 3035 int tail_off1 = int_cnt2<<scale1; 3036 int tail_off2 = int_cnt2<<scale2; 3037 if (ae == StrIntrinsicNode::UL) { 3038 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 3039 } else { 3040 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 3041 } 3042 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 3043 } else { 3044 // calculate index in register to avoid integer overflow (int_cnt2*2) 3045 movl(tmp, int_cnt2); 3046 addptr(tmp, cnt2); 3047 if (ae == StrIntrinsicNode::UL) { 3048 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3049 } else { 3050 movdqu(vec, Address(str2, tmp, scale2, 0)); 3051 } 3052 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3053 } 3054 // Need to reload strings pointers if not matched whole vector 3055 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3056 addptr(cnt2, stride); 3057 jcc(Assembler::negative, SCAN_SUBSTR); 3058 // Fall through if found full substring 3059 3060 } // (int_cnt2 > 8) 3061 3062 bind(RET_FOUND); 3063 // Found result if we matched full small substring. 3064 // Compute substr offset 3065 subptr(result, str1); 3066 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3067 shrl(result, 1); // index 3068 } 3069 bind(EXIT); 3070 3071 } // string_indexofC8 3072 3073 // Small strings are loaded through stack if they cross page boundary. 3074 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3075 Register cnt1, Register cnt2, 3076 int int_cnt2, Register result, 3077 XMMRegister vec, Register tmp, 3078 int ae) { 3079 ShortBranchVerifier sbv(this); 3080 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3081 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3082 3083 // 3084 // int_cnt2 is length of small (< 8 chars) constant substring 3085 // or (-1) for non constant substring in which case its length 3086 // is in cnt2 register. 3087 // 3088 // Note, inline_string_indexOf() generates checks: 3089 // if (substr.count > string.count) return -1; 3090 // if (substr.count == 0) return 0; 3091 // 3092 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3093 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3094 // This method uses the pcmpestri instruction with bound registers 3095 // inputs: 3096 // xmm - substring 3097 // rax - substring length (elements count) 3098 // mem - scanned string 3099 // rdx - string length (elements count) 3100 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3101 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3102 // outputs: 3103 // rcx - matched index in string 3104 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3105 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3106 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3107 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3108 3109 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3110 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3111 FOUND_CANDIDATE; 3112 3113 { //======================================================== 3114 // We don't know where these strings are located 3115 // and we can't read beyond them. Load them through stack. 3116 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3117 3118 movptr(tmp, rsp); // save old SP 3119 3120 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3121 if (int_cnt2 == (1>>scale2)) { // One byte 3122 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3123 load_unsigned_byte(result, Address(str2, 0)); 3124 movdl(vec, result); // move 32 bits 3125 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3126 // Not enough header space in 32-bit VM: 12+3 = 15. 3127 movl(result, Address(str2, -1)); 3128 shrl(result, 8); 3129 movdl(vec, result); // move 32 bits 3130 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3131 load_unsigned_short(result, Address(str2, 0)); 3132 movdl(vec, result); // move 32 bits 3133 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3134 movdl(vec, Address(str2, 0)); // move 32 bits 3135 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3136 movq(vec, Address(str2, 0)); // move 64 bits 3137 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3138 // Array header size is 12 bytes in 32-bit VM 3139 // + 6 bytes for 3 chars == 18 bytes, 3140 // enough space to load vec and shift. 3141 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3142 if (ae == StrIntrinsicNode::UL) { 3143 int tail_off = int_cnt2-8; 3144 pmovzxbw(vec, Address(str2, tail_off)); 3145 psrldq(vec, -2*tail_off); 3146 } 3147 else { 3148 int tail_off = int_cnt2*(1<<scale2); 3149 movdqu(vec, Address(str2, tail_off-16)); 3150 psrldq(vec, 16-tail_off); 3151 } 3152 } 3153 } else { // not constant substring 3154 cmpl(cnt2, stride); 3155 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3156 3157 // We can read beyond string if srt+16 does not cross page boundary 3158 // since heaps are aligned and mapped by pages. 3159 assert(os::vm_page_size() < (int)G, "default page should be small"); 3160 movl(result, str2); // We need only low 32 bits 3161 andl(result, ((int)os::vm_page_size()-1)); 3162 cmpl(result, ((int)os::vm_page_size()-16)); 3163 jccb(Assembler::belowEqual, CHECK_STR); 3164 3165 // Move small strings to stack to allow load 16 bytes into vec. 3166 subptr(rsp, 16); 3167 int stk_offset = wordSize-(1<<scale2); 3168 push(cnt2); 3169 3170 bind(COPY_SUBSTR); 3171 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3172 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3173 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3174 } else if (ae == StrIntrinsicNode::UU) { 3175 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3176 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3177 } 3178 decrement(cnt2); 3179 jccb(Assembler::notZero, COPY_SUBSTR); 3180 3181 pop(cnt2); 3182 movptr(str2, rsp); // New substring address 3183 } // non constant 3184 3185 bind(CHECK_STR); 3186 cmpl(cnt1, stride); 3187 jccb(Assembler::aboveEqual, BIG_STRINGS); 3188 3189 // Check cross page boundary. 
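    // The page-boundary test below (and the identical one for str2 above) is
    // equivalent to the following check; a sketch for illustration only, it
    // assumes a power-of-two page size and a made-up helper name:
    //
    //   bool fits_in_page(uintptr_t p, int page_size) {
    //     // the 16-byte load [p, p+16) stays inside p's page iff the offset
    //     // of p within that page is at most page_size - 16
    //     return (p & (page_size - 1)) <= (uintptr_t)(page_size - 16);
    //   }
    //
    // Strings shorter than one vector that fail this test are copied onto the
    // stack first so that a full 16-byte load is always safe.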
3190 movl(result, str1); // We need only low 32 bits 3191 andl(result, ((int)os::vm_page_size()-1)); 3192 cmpl(result, ((int)os::vm_page_size()-16)); 3193 jccb(Assembler::belowEqual, BIG_STRINGS); 3194 3195 subptr(rsp, 16); 3196 int stk_offset = -(1<<scale1); 3197 if (int_cnt2 < 0) { // not constant 3198 push(cnt2); 3199 stk_offset += wordSize; 3200 } 3201 movl(cnt2, cnt1); 3202 3203 bind(COPY_STR); 3204 if (ae == StrIntrinsicNode::LL) { 3205 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3206 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3207 } else { 3208 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3209 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3210 } 3211 decrement(cnt2); 3212 jccb(Assembler::notZero, COPY_STR); 3213 3214 if (int_cnt2 < 0) { // not constant 3215 pop(cnt2); 3216 } 3217 movptr(str1, rsp); // New string address 3218 3219 bind(BIG_STRINGS); 3220 // Load substring. 3221 if (int_cnt2 < 0) { // -1 3222 if (ae == StrIntrinsicNode::UL) { 3223 pmovzxbw(vec, Address(str2, 0)); 3224 } else { 3225 movdqu(vec, Address(str2, 0)); 3226 } 3227 push(cnt2); // substr count 3228 push(str2); // substr addr 3229 push(str1); // string addr 3230 } else { 3231 // Small (< 8 chars) constant substrings are loaded already. 3232 movl(cnt2, int_cnt2); 3233 } 3234 push(tmp); // original SP 3235 3236 } // Finished loading 3237 3238 //======================================================== 3239 // Start search 3240 // 3241 3242 movptr(result, str1); // string addr 3243 3244 if (int_cnt2 < 0) { // Only for non constant substring 3245 jmpb(SCAN_TO_SUBSTR); 3246 3247 // SP saved at sp+0 3248 // String saved at sp+1*wordSize 3249 // Substr saved at sp+2*wordSize 3250 // Substr count saved at sp+3*wordSize 3251 3252 // Reload substr for rescan, this code 3253 // is executed only for large substrings (> 8 chars) 3254 bind(RELOAD_SUBSTR); 3255 movptr(str2, Address(rsp, 2*wordSize)); 3256 movl(cnt2, Address(rsp, 3*wordSize)); 3257 if (ae == StrIntrinsicNode::UL) { 3258 pmovzxbw(vec, Address(str2, 0)); 3259 } else { 3260 movdqu(vec, Address(str2, 0)); 3261 } 3262 // We came here after the beginning of the substring was 3263 // matched but the rest of it was not so we need to search 3264 // again. Start from the next element after the previous match. 3265 subptr(str1, result); // Restore counter 3266 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3267 shrl(str1, 1); 3268 } 3269 addl(cnt1, str1); 3270 decrementl(cnt1); // Shift to next element 3271 cmpl(cnt1, cnt2); 3272 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3273 3274 addptr(result, (1<<scale1)); 3275 } // non constant 3276 3277 // Scan string for start of substr in 16-byte vectors 3278 bind(SCAN_TO_SUBSTR); 3279 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3280 pcmpestri(vec, Address(result, 0), mode); 3281 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3282 subl(cnt1, stride); 3283 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3284 cmpl(cnt1, cnt2); 3285 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3286 addptr(result, 16); 3287 3288 bind(ADJUST_STR); 3289 cmpl(cnt1, stride); // Do not read beyond string 3290 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3291 // Back-up string to avoid reading beyond string. 
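  // Sketch of the fix-up below (illustration only): with cnt1 < stride
  // elements left, slide the window back so its 16-byte load ends exactly at
  // the last element and rescan a full stride; re-testing a few already
  // scanned elements is harmless.
  //
  //   result += (cnt1 - stride) * element_size;  // lea(result, Address(result, cnt1, scale1, -16))
  //   cnt1    = stride;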
3292 lea(result, Address(result, cnt1, scale1, -16)); 3293 movl(cnt1, stride); 3294 jmpb(SCAN_TO_SUBSTR); 3295 3296 // Found a potential substr 3297 bind(FOUND_CANDIDATE); 3298 // After pcmpestri tmp(rcx) contains matched element index 3299 3300 // Make sure string is still long enough 3301 subl(cnt1, tmp); 3302 cmpl(cnt1, cnt2); 3303 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3304 // Left less then substring. 3305 3306 bind(RET_NOT_FOUND); 3307 movl(result, -1); 3308 jmp(CLEANUP); 3309 3310 bind(FOUND_SUBSTR); 3311 // Compute start addr of substr 3312 lea(result, Address(result, tmp, scale1)); 3313 if (int_cnt2 > 0) { // Constant substring 3314 // Repeat search for small substring (< 8 chars) 3315 // from new point without reloading substring. 3316 // Have to check that we don't read beyond string. 3317 cmpl(tmp, stride-int_cnt2); 3318 jccb(Assembler::greater, ADJUST_STR); 3319 // Fall through if matched whole substring. 3320 } else { // non constant 3321 assert(int_cnt2 == -1, "should be != 0"); 3322 3323 addl(tmp, cnt2); 3324 // Found result if we matched whole substring. 3325 cmpl(tmp, stride); 3326 jcc(Assembler::lessEqual, RET_FOUND); 3327 3328 // Repeat search for small substring (<= 8 chars) 3329 // from new point 'str1' without reloading substring. 3330 cmpl(cnt2, stride); 3331 // Have to check that we don't read beyond string. 3332 jccb(Assembler::lessEqual, ADJUST_STR); 3333 3334 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3335 // Compare the rest of substring (> 8 chars). 3336 movptr(str1, result); 3337 3338 cmpl(tmp, cnt2); 3339 // First 8 chars are already matched. 3340 jccb(Assembler::equal, CHECK_NEXT); 3341 3342 bind(SCAN_SUBSTR); 3343 pcmpestri(vec, Address(str1, 0), mode); 3344 // Need to reload strings pointers if not matched whole vector 3345 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3346 3347 bind(CHECK_NEXT); 3348 subl(cnt2, stride); 3349 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3350 addptr(str1, 16); 3351 if (ae == StrIntrinsicNode::UL) { 3352 addptr(str2, 8); 3353 } else { 3354 addptr(str2, 16); 3355 } 3356 subl(cnt1, stride); 3357 cmpl(cnt2, stride); // Do not read beyond substring 3358 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3359 // Back-up strings to avoid reading beyond substring. 
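    // Sketch of the fix-up below (illustration only): fewer than 'stride'
    // substring elements remain, so move both windows back so the last compare
    // ends exactly at the end of the substring:
    //
    //   str2 += (cnt2 - stride) * element_size2;  // -8 for the latin1 substring (UL), -16 otherwise
    //   str1 += (cnt2 - stride) * element_size1;
    //   cnt1  = cnt1 - cnt2 + stride;             // a few elements get compared twice
    //   cnt2  = stride;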
3360 3361 if (ae == StrIntrinsicNode::UL) { 3362 lea(str2, Address(str2, cnt2, scale2, -8)); 3363 lea(str1, Address(str1, cnt2, scale1, -16)); 3364 } else { 3365 lea(str2, Address(str2, cnt2, scale2, -16)); 3366 lea(str1, Address(str1, cnt2, scale1, -16)); 3367 } 3368 subl(cnt1, cnt2); 3369 movl(cnt2, stride); 3370 addl(cnt1, stride); 3371 bind(CONT_SCAN_SUBSTR); 3372 if (ae == StrIntrinsicNode::UL) { 3373 pmovzxbw(vec, Address(str2, 0)); 3374 } else { 3375 movdqu(vec, Address(str2, 0)); 3376 } 3377 jmp(SCAN_SUBSTR); 3378 3379 bind(RET_FOUND_LONG); 3380 movptr(str1, Address(rsp, wordSize)); 3381 } // non constant 3382 3383 bind(RET_FOUND); 3384 // Compute substr offset 3385 subptr(result, str1); 3386 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3387 shrl(result, 1); // index 3388 } 3389 bind(CLEANUP); 3390 pop(rsp); // restore SP 3391 3392 } // string_indexof 3393 3394 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3395 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3396 ShortBranchVerifier sbv(this); 3397 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3398 3399 int stride = 8; 3400 3401 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3402 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3403 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3404 FOUND_SEQ_CHAR, DONE_LABEL; 3405 3406 movptr(result, str1); 3407 if (UseAVX >= 2) { 3408 cmpl(cnt1, stride); 3409 jcc(Assembler::less, SCAN_TO_CHAR); 3410 cmpl(cnt1, 2*stride); 3411 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3412 movdl(vec1, ch); 3413 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3414 vpxor(vec2, vec2); 3415 movl(tmp, cnt1); 3416 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3417 andl(cnt1,0x0000000F); //tail count (in chars) 3418 3419 bind(SCAN_TO_16_CHAR_LOOP); 3420 vmovdqu(vec3, Address(result, 0)); 3421 vpcmpeqw(vec3, vec3, vec1, 1); 3422 vptest(vec2, vec3); 3423 jcc(Assembler::carryClear, FOUND_CHAR); 3424 addptr(result, 32); 3425 subl(tmp, 2*stride); 3426 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3427 jmp(SCAN_TO_8_CHAR); 3428 bind(SCAN_TO_8_CHAR_INIT); 3429 movdl(vec1, ch); 3430 pshuflw(vec1, vec1, 0x00); 3431 pshufd(vec1, vec1, 0); 3432 pxor(vec2, vec2); 3433 } 3434 bind(SCAN_TO_8_CHAR); 3435 cmpl(cnt1, stride); 3436 jcc(Assembler::less, SCAN_TO_CHAR); 3437 if (UseAVX < 2) { 3438 movdl(vec1, ch); 3439 pshuflw(vec1, vec1, 0x00); 3440 pshufd(vec1, vec1, 0); 3441 pxor(vec2, vec2); 3442 } 3443 movl(tmp, cnt1); 3444 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3445 andl(cnt1,0x00000007); //tail count (in chars) 3446 3447 bind(SCAN_TO_8_CHAR_LOOP); 3448 movdqu(vec3, Address(result, 0)); 3449 pcmpeqw(vec3, vec1); 3450 ptest(vec2, vec3); 3451 jcc(Assembler::carryClear, FOUND_CHAR); 3452 addptr(result, 16); 3453 subl(tmp, stride); 3454 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3455 bind(SCAN_TO_CHAR); 3456 testl(cnt1, cnt1); 3457 jcc(Assembler::zero, RET_NOT_FOUND); 3458 bind(SCAN_TO_CHAR_LOOP); 3459 load_unsigned_short(tmp, Address(result, 0)); 3460 cmpl(ch, tmp); 3461 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3462 addptr(result, 2); 3463 subl(cnt1, 1); 3464 jccb(Assembler::zero, RET_NOT_FOUND); 3465 jmp(SCAN_TO_CHAR_LOOP); 3466 3467 bind(RET_NOT_FOUND); 3468 movl(result, -1); 3469 jmpb(DONE_LABEL); 3470 3471 bind(FOUND_CHAR); 3472 if (UseAVX >= 2) { 3473 vpmovmskb(tmp, vec3); 3474 } else { 3475 pmovmskb(tmp, vec3); 3476 } 3477 bsfl(ch, tmp); 3478 addptr(result, ch); 3479 3480 bind(FOUND_SEQ_CHAR); 3481 
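  // At FOUND_SEQ_CHAR 'result' holds the address of the matching 2-byte char,
  // so the returned index is the byte distance from str1 divided by two (the
  // subptr/shrl pair below). Scalar model of the whole routine, for
  // illustration only (the helper name is made up):
  //
  //   jint indexof_char_ref(const jchar* s, jint len, jchar ch) {
  //     for (jint i = 0; i < len; i++) {
  //       if (s[i] == ch) return i;
  //     }
  //     return -1;
  //   }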
subptr(result, str1); 3482 shrl(result, 1); 3483 3484 bind(DONE_LABEL); 3485 } // string_indexof_char 3486 3487 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3488 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3489 ShortBranchVerifier sbv(this); 3490 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3491 3492 int stride = 16; 3493 3494 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3495 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3496 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3497 FOUND_SEQ_CHAR, DONE_LABEL; 3498 3499 movptr(result, str1); 3500 if (UseAVX >= 2) { 3501 cmpl(cnt1, stride); 3502 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3503 cmpl(cnt1, stride*2); 3504 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3505 movdl(vec1, ch); 3506 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3507 vpxor(vec2, vec2); 3508 movl(tmp, cnt1); 3509 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3510 andl(cnt1,0x0000001F); //tail count (in chars) 3511 3512 bind(SCAN_TO_32_CHAR_LOOP); 3513 vmovdqu(vec3, Address(result, 0)); 3514 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3515 vptest(vec2, vec3); 3516 jcc(Assembler::carryClear, FOUND_CHAR); 3517 addptr(result, 32); 3518 subl(tmp, stride*2); 3519 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3520 jmp(SCAN_TO_16_CHAR); 3521 3522 bind(SCAN_TO_16_CHAR_INIT); 3523 movdl(vec1, ch); 3524 pxor(vec2, vec2); 3525 pshufb(vec1, vec2); 3526 } 3527 3528 bind(SCAN_TO_16_CHAR); 3529 cmpl(cnt1, stride); 3530 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3531 if (UseAVX < 2) { 3532 movdl(vec1, ch); 3533 pxor(vec2, vec2); 3534 pshufb(vec1, vec2); 3535 } 3536 movl(tmp, cnt1); 3537 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3538 andl(cnt1,0x0000000F); //tail count (in bytes) 3539 3540 bind(SCAN_TO_16_CHAR_LOOP); 3541 movdqu(vec3, Address(result, 0)); 3542 pcmpeqb(vec3, vec1); 3543 ptest(vec2, vec3); 3544 jcc(Assembler::carryClear, FOUND_CHAR); 3545 addptr(result, 16); 3546 subl(tmp, stride); 3547 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
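  // The vector loops above rely on two idioms worth spelling out. First,
  // ptest(zero, cmp) sets CF iff (~zero & cmp) == 0, so "carryClear" means the
  // comparison result has at least one set byte, i.e. a match. Second, the
  // FOUND_CHAR handler below turns that comparison result into an element
  // offset via pmovmskb + bsf. Equivalent C sketch using SSE intrinsics
  // (illustration only; assumes GCC/Clang and SSE4.1):
  //
  //   #include <smmintrin.h>
  //   int first_match_offset(__m128i chunk, __m128i needle) {
  //     __m128i eq = _mm_cmpeq_epi8(chunk, needle);      // 0xFF in each matching byte
  //     if (_mm_testc_si128(_mm_setzero_si128(), eq)) {  // CF==1  <=>  eq is all zero
  //       return -1;                                     // no match in this chunk
  //     }
  //     return __builtin_ctz(_mm_movemask_epi8(eq));     // offset of first matching byte
  //   }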
3548 3549 bind(SCAN_TO_CHAR_INIT); 3550 testl(cnt1, cnt1); 3551 jcc(Assembler::zero, RET_NOT_FOUND); 3552 bind(SCAN_TO_CHAR_LOOP); 3553 load_unsigned_byte(tmp, Address(result, 0)); 3554 cmpl(ch, tmp); 3555 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3556 addptr(result, 1); 3557 subl(cnt1, 1); 3558 jccb(Assembler::zero, RET_NOT_FOUND); 3559 jmp(SCAN_TO_CHAR_LOOP); 3560 3561 bind(RET_NOT_FOUND); 3562 movl(result, -1); 3563 jmpb(DONE_LABEL); 3564 3565 bind(FOUND_CHAR); 3566 if (UseAVX >= 2) { 3567 vpmovmskb(tmp, vec3); 3568 } else { 3569 pmovmskb(tmp, vec3); 3570 } 3571 bsfl(ch, tmp); 3572 addptr(result, ch); 3573 3574 bind(FOUND_SEQ_CHAR); 3575 subptr(result, str1); 3576 3577 bind(DONE_LABEL); 3578 } // stringL_indexof_char 3579 3580 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3581 switch (eltype) { 3582 case T_BOOLEAN: return sizeof(jboolean); 3583 case T_BYTE: return sizeof(jbyte); 3584 case T_SHORT: return sizeof(jshort); 3585 case T_CHAR: return sizeof(jchar); 3586 case T_INT: return sizeof(jint); 3587 default: 3588 ShouldNotReachHere(); 3589 return -1; 3590 } 3591 } 3592 3593 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3594 switch (eltype) { 3595 // T_BOOLEAN used as surrogate for unsigned byte 3596 case T_BOOLEAN: movzbl(dst, src); break; 3597 case T_BYTE: movsbl(dst, src); break; 3598 case T_SHORT: movswl(dst, src); break; 3599 case T_CHAR: movzwl(dst, src); break; 3600 case T_INT: movl(dst, src); break; 3601 default: 3602 ShouldNotReachHere(); 3603 } 3604 } 3605 3606 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3607 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3608 } 3609 3610 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3611 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3612 } 3613 3614 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3615 const int vlen = Assembler::AVX_256bit; 3616 switch (eltype) { 3617 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3618 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3619 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3620 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3621 case T_INT: 3622 // do nothing 3623 break; 3624 default: 3625 ShouldNotReachHere(); 3626 } 3627 } 3628 3629 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3630 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3631 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3632 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3633 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3634 BasicType eltype) { 3635 ShortBranchVerifier sbv(this); 3636 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3637 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3638 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3639 3640 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3641 SHORT_UNROLLED_LOOP_EXIT, 3642 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3643 UNROLLED_VECTOR_LOOP_BEGIN, 3644 END; 3645 switch (eltype) { 3646 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3647 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3648 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3649 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3650 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3651 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3652 } 3653 3654 // For "renaming" for readibility of the code 3655 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3656 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3657 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3658 3659 const int elsize = arrays_hashcode_elsize(eltype); 3660 3661 /* 3662 if (cnt1 >= 2) { 3663 if (cnt1 >= 32) { 3664 UNROLLED VECTOR LOOP 3665 } 3666 UNROLLED SCALAR LOOP 3667 } 3668 SINGLE SCALAR 3669 */ 3670 3671 cmpl(cnt1, 32); 3672 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3673 3674 // cnt1 >= 32 && generate_vectorized_loop 3675 xorl(index, index); 3676 3677 // vresult = IntVector.zero(I256); 3678 for (int idx = 0; idx < 4; idx++) { 3679 vpxor(vresult[idx], vresult[idx]); 3680 } 3681 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3682 Register bound = tmp2; 3683 Register next = tmp3; 3684 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3685 movl(next, Address(tmp2, 0)); 3686 movdl(vnext, next); 3687 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3688 3689 // index = 0; 3690 // bound = cnt1 & ~(32 - 1); 3691 movl(bound, cnt1); 3692 andl(bound, ~(32 - 1)); 3693 // for (; index < bound; index += 32) { 3694 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3695 // result *= next; 3696 imull(result, next); 3697 // loop fission to upfront the cost of fetching from memory, OOO execution 3698 // can then hopefully do a better job of prefetching 3699 for (int idx = 0; idx < 4; idx++) { 3700 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3701 } 3702 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3703 for (int idx = 0; idx < 4; idx++) { 3704 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3705 arrays_hashcode_elvcast(vtmp[idx], eltype); 3706 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3707 } 3708 // index += 32; 3709 addl(index, 32); 3710 // index < bound; 3711 cmpl(index, bound); 3712 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3713 // } 3714 3715 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3716 subl(cnt1, bound); 3717 // release bound 3718 3719 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3720 for (int idx = 0; idx < 4; idx++) { 3721 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3722 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3723 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3724 } 3725 // result += vresult.reduceLanes(ADD); 3726 for (int idx = 0; idx < 4; idx++) { 3727 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3728 } 3729 3730 // } else if (cnt1 < 32) { 3731 3732 bind(SHORT_UNROLLED_BEGIN); 3733 // int i = 1; 3734 movl(index, 1); 3735 cmpl(index, cnt1); 3736 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3737 3738 // for (; i < cnt1 ; i += 2) { 3739 bind(SHORT_UNROLLED_LOOP_BEGIN); 3740 movl(tmp3, 961); 3741 imull(result, tmp3); 3742 
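  // Sketch of the unrolled step (illustration only): the loop computes the
  // usual polynomial hash h = 31*h + a[i], two elements per iteration:
  //
  //   h = 31*31*h + 31*a[i-1] + a[i]
  //     = 961*h + ((a[i-1] << 5) - a[i-1]) + a[i]
  //
  // which is why 'result' was just multiplied by 961 and why each 31*x below
  // is strength-reduced to (x << 5) - x.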
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3743 movl(tmp3, tmp2); 3744 shll(tmp3, 5); 3745 subl(tmp3, tmp2); 3746 addl(result, tmp3); 3747 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3748 addl(result, tmp3); 3749 addl(index, 2); 3750 cmpl(index, cnt1); 3751 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3752 3753 // } 3754 // if (i >= cnt1) { 3755 bind(SHORT_UNROLLED_LOOP_EXIT); 3756 jccb(Assembler::greater, END); 3757 movl(tmp2, result); 3758 shll(result, 5); 3759 subl(result, tmp2); 3760 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3761 addl(result, tmp3); 3762 // } 3763 bind(END); 3764 3765 BLOCK_COMMENT("} // arrays_hashcode"); 3766 3767 } // arrays_hashcode 3768 3769 // helper function for string_compare 3770 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3771 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3772 Address::ScaleFactor scale2, Register index, int ae) { 3773 if (ae == StrIntrinsicNode::LL) { 3774 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3775 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3776 } else if (ae == StrIntrinsicNode::UU) { 3777 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3778 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3779 } else { 3780 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3781 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3782 } 3783 } 3784 3785 // Compare strings, used for char[] and byte[]. 3786 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3787 Register cnt1, Register cnt2, Register result, 3788 XMMRegister vec1, int ae, KRegister mask) { 3789 ShortBranchVerifier sbv(this); 3790 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3791 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3792 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3793 int stride2x2 = 0x40; 3794 Address::ScaleFactor scale = Address::no_scale; 3795 Address::ScaleFactor scale1 = Address::no_scale; 3796 Address::ScaleFactor scale2 = Address::no_scale; 3797 3798 if (ae != StrIntrinsicNode::LL) { 3799 stride2x2 = 0x20; 3800 } 3801 3802 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3803 shrl(cnt2, 1); 3804 } 3805 // Compute the minimum of the string lengths and the 3806 // difference of the string lengths (stack). 3807 // Do the conditional move stuff 3808 movl(result, cnt1); 3809 subl(cnt1, cnt2); 3810 push(cnt1); 3811 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3812 3813 // Is the minimum length zero? 
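  // Reference semantics, for illustration only (E1/E2 are placeholders whose
  // widths depend on 'ae'; the assembly tracks some counts in bytes and
  // rescales at the end):
  //
  //   int string_compare_ref(const E1* a, int la, const E2* b, int lb) {
  //     int n = la < lb ? la : lb;               // min length, now in cnt2
  //     for (int i = 0; i < n; i++) {
  //       if (a[i] != b[i]) return (int)a[i] - (int)b[i];
  //     }
  //     return la - lb;                          // the difference pushed above
  //   }
  //
  // For UL the final result is negated at DONE_LABEL, consistent with the
  // caller passing the two strings swapped for that encoding.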
3814 testl(cnt2, cnt2); 3815 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3816 if (ae == StrIntrinsicNode::LL) { 3817 // Load first bytes 3818 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3819 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3820 } else if (ae == StrIntrinsicNode::UU) { 3821 // Load first characters 3822 load_unsigned_short(result, Address(str1, 0)); 3823 load_unsigned_short(cnt1, Address(str2, 0)); 3824 } else { 3825 load_unsigned_byte(result, Address(str1, 0)); 3826 load_unsigned_short(cnt1, Address(str2, 0)); 3827 } 3828 subl(result, cnt1); 3829 jcc(Assembler::notZero, POP_LABEL); 3830 3831 if (ae == StrIntrinsicNode::UU) { 3832 // Divide length by 2 to get number of chars 3833 shrl(cnt2, 1); 3834 } 3835 cmpl(cnt2, 1); 3836 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3837 3838 // Check if the strings start at the same location and setup scale and stride 3839 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3840 cmpptr(str1, str2); 3841 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3842 if (ae == StrIntrinsicNode::LL) { 3843 scale = Address::times_1; 3844 stride = 16; 3845 } else { 3846 scale = Address::times_2; 3847 stride = 8; 3848 } 3849 } else { 3850 scale1 = Address::times_1; 3851 scale2 = Address::times_2; 3852 // scale not used 3853 stride = 8; 3854 } 3855 3856 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3857 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3858 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3859 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3860 Label COMPARE_TAIL_LONG; 3861 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3862 3863 int pcmpmask = 0x19; 3864 if (ae == StrIntrinsicNode::LL) { 3865 pcmpmask &= ~0x01; 3866 } 3867 3868 // Setup to compare 16-chars (32-bytes) vectors, 3869 // start from first character again because it has aligned address. 3870 if (ae == StrIntrinsicNode::LL) { 3871 stride2 = 32; 3872 } else { 3873 stride2 = 16; 3874 } 3875 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3876 adr_stride = stride << scale; 3877 } else { 3878 adr_stride1 = 8; //stride << scale1; 3879 adr_stride2 = 16; //stride << scale2; 3880 } 3881 3882 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3883 // rax and rdx are used by pcmpestri as elements counters 3884 movl(result, cnt2); 3885 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3886 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3887 3888 // fast path : compare first 2 8-char vectors. 
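    // Decoding pcmpmask (as defined for PCMPESTRI; illustration only):
    //   bits 1:0 = 01  unsigned words (cleared to 00, unsigned bytes, for LL)
    //   bits 3:2 = 10  "equal each" - element-wise comparison of the operands
    //   bits 5:4 = 01  negative polarity - result bits mark positions that differ
    //   bit  6   = 0   rcx receives the least significant set index
    // So after pcmpestri, CF==0 means no mismatch among the compared elements,
    // and when CF==1, rcx holds the index of the first mismatching element.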
3889 bind(COMPARE_16_CHARS); 3890 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3891 movdqu(vec1, Address(str1, 0)); 3892 } else { 3893 pmovzxbw(vec1, Address(str1, 0)); 3894 } 3895 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3896 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3897 3898 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3899 movdqu(vec1, Address(str1, adr_stride)); 3900 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3901 } else { 3902 pmovzxbw(vec1, Address(str1, adr_stride1)); 3903 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3904 } 3905 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3906 addl(cnt1, stride); 3907 3908 // Compare the characters at index in cnt1 3909 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3910 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3911 subl(result, cnt2); 3912 jmp(POP_LABEL); 3913 3914 // Setup the registers to start vector comparison loop 3915 bind(COMPARE_WIDE_VECTORS); 3916 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3917 lea(str1, Address(str1, result, scale)); 3918 lea(str2, Address(str2, result, scale)); 3919 } else { 3920 lea(str1, Address(str1, result, scale1)); 3921 lea(str2, Address(str2, result, scale2)); 3922 } 3923 subl(result, stride2); 3924 subl(cnt2, stride2); 3925 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3926 negptr(result); 3927 3928 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3929 bind(COMPARE_WIDE_VECTORS_LOOP); 3930 3931 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3932 cmpl(cnt2, stride2x2); 3933 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3934 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3935 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3936 3937 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3938 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3939 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3940 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3941 } else { 3942 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3943 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3944 } 3945 kortestql(mask, mask); 3946 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3947 addptr(result, stride2x2); // update since we already compared at this addr 3948 subl(cnt2, stride2x2); // and sub the size too 3949 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3950 3951 vpxor(vec1, vec1); 3952 jmpb(COMPARE_WIDE_TAIL); 3953 }//if (VM_Version::supports_avx512vlbw()) 3954 3955 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3956 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3957 vmovdqu(vec1, Address(str1, result, scale)); 3958 vpxor(vec1, Address(str2, result, scale)); 3959 } else { 3960 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3961 vpxor(vec1, Address(str2, result, scale2)); 3962 } 3963 vptest(vec1, vec1); 3964 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3965 addptr(result, stride2); 3966 subl(cnt2, stride2); 3967 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3968 // clean upper bits of YMM registers 3969 vpxor(vec1, vec1); 3970 3971 // compare 
wide vectors tail 3972 bind(COMPARE_WIDE_TAIL); 3973 testptr(result, result); 3974 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3975 3976 movl(result, stride2); 3977 movl(cnt2, result); 3978 negptr(result); 3979 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3980 3981 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3982 bind(VECTOR_NOT_EQUAL); 3983 // clean upper bits of YMM registers 3984 vpxor(vec1, vec1); 3985 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3986 lea(str1, Address(str1, result, scale)); 3987 lea(str2, Address(str2, result, scale)); 3988 } else { 3989 lea(str1, Address(str1, result, scale1)); 3990 lea(str2, Address(str2, result, scale2)); 3991 } 3992 jmp(COMPARE_16_CHARS); 3993 3994 // Compare tail chars, length between 1 to 15 chars 3995 bind(COMPARE_TAIL_LONG); 3996 movl(cnt2, result); 3997 cmpl(cnt2, stride); 3998 jcc(Assembler::less, COMPARE_SMALL_STR); 3999 4000 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4001 movdqu(vec1, Address(str1, 0)); 4002 } else { 4003 pmovzxbw(vec1, Address(str1, 0)); 4004 } 4005 pcmpestri(vec1, Address(str2, 0), pcmpmask); 4006 jcc(Assembler::below, COMPARE_INDEX_CHAR); 4007 subptr(cnt2, stride); 4008 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4009 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4010 lea(str1, Address(str1, result, scale)); 4011 lea(str2, Address(str2, result, scale)); 4012 } else { 4013 lea(str1, Address(str1, result, scale1)); 4014 lea(str2, Address(str2, result, scale2)); 4015 } 4016 negptr(cnt2); 4017 jmpb(WHILE_HEAD_LABEL); 4018 4019 bind(COMPARE_SMALL_STR); 4020 } else if (UseSSE42Intrinsics) { 4021 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 4022 int pcmpmask = 0x19; 4023 // Setup to compare 8-char (16-byte) vectors, 4024 // start from first character again because it has aligned address. 
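    // A recurring loop shape in this file (sketch, illustration only): the
    // pointers are advanced past the data once and the loop then walks a
    // negative offset up toward zero, so addressing needs no per-iteration
    // pointer updates:
    //
    //   base  += count * element_size;           // lea(ptr, Address(ptr, count, scale))
    //   off    = -count;                         // negptr(off)
    //   do {
    //     process(base + off * element_size);    // Address(ptr, off, scale)
    //     off += stride;
    //   } while (/* elements remain */);
    //
    // Individual loops differ in whether 'off' itself or a separate counter
    // carries the trip count.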
4025 movl(result, cnt2); 4026 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 4027 if (ae == StrIntrinsicNode::LL) { 4028 pcmpmask &= ~0x01; 4029 } 4030 jcc(Assembler::zero, COMPARE_TAIL); 4031 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4032 lea(str1, Address(str1, result, scale)); 4033 lea(str2, Address(str2, result, scale)); 4034 } else { 4035 lea(str1, Address(str1, result, scale1)); 4036 lea(str2, Address(str2, result, scale2)); 4037 } 4038 negptr(result); 4039 4040 // pcmpestri 4041 // inputs: 4042 // vec1- substring 4043 // rax - negative string length (elements count) 4044 // mem - scanned string 4045 // rdx - string length (elements count) 4046 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4047 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4048 // outputs: 4049 // rcx - first mismatched element index 4050 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4051 4052 bind(COMPARE_WIDE_VECTORS); 4053 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4054 movdqu(vec1, Address(str1, result, scale)); 4055 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4056 } else { 4057 pmovzxbw(vec1, Address(str1, result, scale1)); 4058 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4059 } 4060 // After pcmpestri cnt1(rcx) contains mismatched element index 4061 4062 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4063 addptr(result, stride); 4064 subptr(cnt2, stride); 4065 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4066 4067 // compare wide vectors tail 4068 testptr(result, result); 4069 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4070 4071 movl(cnt2, stride); 4072 movl(result, stride); 4073 negptr(result); 4074 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4075 movdqu(vec1, Address(str1, result, scale)); 4076 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4077 } else { 4078 pmovzxbw(vec1, Address(str1, result, scale1)); 4079 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4080 } 4081 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4082 4083 // Mismatched characters in the vectors 4084 bind(VECTOR_NOT_EQUAL); 4085 addptr(cnt1, result); 4086 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4087 subl(result, cnt2); 4088 jmpb(POP_LABEL); 4089 4090 bind(COMPARE_TAIL); // limit is zero 4091 movl(cnt2, result); 4092 // Fallthru to tail compare 4093 } 4094 // Shift str2 and str1 to the end of the arrays, negate min 4095 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4096 lea(str1, Address(str1, cnt2, scale)); 4097 lea(str2, Address(str2, cnt2, scale)); 4098 } else { 4099 lea(str1, Address(str1, cnt2, scale1)); 4100 lea(str2, Address(str2, cnt2, scale2)); 4101 } 4102 decrementl(cnt2); // first character was compared already 4103 negptr(cnt2); 4104 4105 // Compare the rest of the elements 4106 bind(WHILE_HEAD_LABEL); 4107 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4108 subl(result, cnt1); 4109 jccb(Assembler::notZero, POP_LABEL); 4110 increment(cnt2); 4111 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4112 4113 // Strings are equal up to min length. Return the length difference. 
4114 bind(LENGTH_DIFF_LABEL); 4115 pop(result); 4116 if (ae == StrIntrinsicNode::UU) { 4117 // Divide diff by 2 to get number of chars 4118 sarl(result, 1); 4119 } 4120 jmpb(DONE_LABEL); 4121 4122 if (VM_Version::supports_avx512vlbw()) { 4123 4124 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4125 4126 kmovql(cnt1, mask); 4127 notq(cnt1); 4128 bsfq(cnt2, cnt1); 4129 if (ae != StrIntrinsicNode::LL) { 4130 // Divide diff by 2 to get number of chars 4131 sarl(cnt2, 1); 4132 } 4133 addq(result, cnt2); 4134 if (ae == StrIntrinsicNode::LL) { 4135 load_unsigned_byte(cnt1, Address(str2, result)); 4136 load_unsigned_byte(result, Address(str1, result)); 4137 } else if (ae == StrIntrinsicNode::UU) { 4138 load_unsigned_short(cnt1, Address(str2, result, scale)); 4139 load_unsigned_short(result, Address(str1, result, scale)); 4140 } else { 4141 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4142 load_unsigned_byte(result, Address(str1, result, scale1)); 4143 } 4144 subl(result, cnt1); 4145 jmpb(POP_LABEL); 4146 }//if (VM_Version::supports_avx512vlbw()) 4147 4148 // Discard the stored length difference 4149 bind(POP_LABEL); 4150 pop(cnt1); 4151 4152 // That's it 4153 bind(DONE_LABEL); 4154 if(ae == StrIntrinsicNode::UL) { 4155 negl(result); 4156 } 4157 4158 } 4159 4160 // Search for Non-ASCII character (Negative byte value) in a byte array, 4161 // return the index of the first such character, otherwise the length 4162 // of the array segment searched. 4163 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4164 // @IntrinsicCandidate 4165 // public static int countPositives(byte[] ba, int off, int len) { 4166 // for (int i = off; i < off + len; i++) { 4167 // if (ba[i] < 0) { 4168 // return i - off; 4169 // } 4170 // } 4171 // return len; 4172 // } 4173 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4174 Register result, Register tmp1, 4175 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4176 // rsi: byte array 4177 // rcx: len 4178 // rax: result 4179 ShortBranchVerifier sbv(this); 4180 assert_different_registers(ary1, len, result, tmp1); 4181 assert_different_registers(vec1, vec2); 4182 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4183 4184 movl(result, len); // copy 4185 // len == 0 4186 testl(len, len); 4187 jcc(Assembler::zero, DONE); 4188 4189 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4190 VM_Version::supports_avx512vlbw() && 4191 VM_Version::supports_bmi2()) { 4192 4193 Label test_64_loop, test_tail, BREAK_LOOP; 4194 movl(tmp1, len); 4195 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4196 4197 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4198 andl(len, 0xffffffc0); // vector count (in chars) 4199 jccb(Assembler::zero, test_tail); 4200 4201 lea(ary1, Address(ary1, len, Address::times_1)); 4202 negptr(len); 4203 4204 bind(test_64_loop); 4205 // Check whether our 64 elements of size byte contain negatives 4206 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4207 kortestql(mask1, mask1); 4208 jcc(Assembler::notZero, BREAK_LOOP); 4209 4210 addptr(len, 64); 4211 jccb(Assembler::notZero, test_64_loop); 4212 4213 bind(test_tail); 4214 // bail out when there is nothing to be done 4215 testl(tmp1, -1); 4216 jcc(Assembler::zero, DONE); 4217 4218 4219 // check the tail for absense of negatives 4220 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4221 { 4222 Register tmp3_aliased = len; 4223 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4224 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4225 notq(tmp3_aliased); 4226 kmovql(mask2, tmp3_aliased); 4227 } 4228 4229 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4230 ktestq(mask1, mask2); 4231 jcc(Assembler::zero, DONE); 4232 4233 // do a full check for negative registers in the tail 4234 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4235 // ary1 already pointing to the right place 4236 jmpb(TAIL_START); 4237 4238 bind(BREAK_LOOP); 4239 // At least one byte in the last 64 byte block was negative. 4240 // Set up to look at the last 64 bytes as if they were a tail 4241 lea(ary1, Address(ary1, len, Address::times_1)); 4242 addptr(result, len); 4243 // Ignore the very last byte: if all others are positive, 4244 // it must be negative, so we can skip right to the 2+1 byte 4245 // end comparison at this point 4246 orl(result, 63); 4247 movl(len, 63); 4248 // Fallthru to tail compare 4249 } else { 4250 4251 if (UseAVX >= 2) { 4252 // With AVX2, use 32-byte vector compare 4253 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4254 4255 // Compare 32-byte vectors 4256 testl(len, 0xffffffe0); // vector count (in bytes) 4257 jccb(Assembler::zero, TAIL_START); 4258 4259 andl(len, 0xffffffe0); 4260 lea(ary1, Address(ary1, len, Address::times_1)); 4261 negptr(len); 4262 4263 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4264 movdl(vec2, tmp1); 4265 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4266 4267 bind(COMPARE_WIDE_VECTORS); 4268 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4269 vptest(vec1, vec2); 4270 jccb(Assembler::notZero, BREAK_LOOP); 4271 addptr(len, 32); 4272 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4273 4274 testl(result, 0x0000001f); // any bytes remaining? 4275 jcc(Assembler::zero, DONE); 4276 4277 // Quick test using the already prepared vector mask 4278 movl(len, result); 4279 andl(len, 0x0000001f); 4280 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4281 vptest(vec1, vec2); 4282 jcc(Assembler::zero, DONE); 4283 // There are zeros, jump to the tail to determine exactly where 4284 jmpb(TAIL_START); 4285 4286 bind(BREAK_LOOP); 4287 // At least one byte in the last 32-byte vector is negative. 4288 // Set up to look at the last 32 bytes as if they were a tail 4289 lea(ary1, Address(ary1, len, Address::times_1)); 4290 addptr(result, len); 4291 // Ignore the very last byte: if all others are positive, 4292 // it must be negative, so we can skip right to the 2+1 byte 4293 // end comparison at this point 4294 orl(result, 31); 4295 movl(len, 31); 4296 // Fallthru to tail compare 4297 } else if (UseSSE42Intrinsics) { 4298 // With SSE4.2, use double quad vector compare 4299 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4300 4301 // Compare 16-byte vectors 4302 testl(len, 0xfffffff0); // vector count (in bytes) 4303 jcc(Assembler::zero, TAIL_START); 4304 4305 andl(len, 0xfffffff0); 4306 lea(ary1, Address(ary1, len, Address::times_1)); 4307 negptr(len); 4308 4309 movl(tmp1, 0x80808080); 4310 movdl(vec2, tmp1); 4311 pshufd(vec2, vec2, 0); 4312 4313 bind(COMPARE_WIDE_VECTORS); 4314 movdqu(vec1, Address(ary1, len, Address::times_1)); 4315 ptest(vec1, vec2); 4316 jccb(Assembler::notZero, BREAK_LOOP); 4317 addptr(len, 16); 4318 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4319 4320 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4321 jcc(Assembler::zero, DONE); 4322 4323 // Quick test using the already prepared vector mask 4324 movl(len, result); 4325 andl(len, 0x0000000f); // tail count (in bytes) 4326 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4327 ptest(vec1, vec2); 4328 jcc(Assembler::zero, DONE); 4329 jmpb(TAIL_START); 4330 4331 bind(BREAK_LOOP); 4332 // At least one byte in the last 16-byte vector is negative. 4333 // Set up and look at the last 16 bytes as if they were a tail 4334 lea(ary1, Address(ary1, len, Address::times_1)); 4335 addptr(result, len); 4336 // Ignore the very last byte: if all others are positive, 4337 // it must be negative, so we can skip right to the 2+1 byte 4338 // end comparison at this point 4339 orl(result, 15); 4340 movl(len, 15); 4341 // Fallthru to tail compare 4342 } 4343 } 4344 4345 bind(TAIL_START); 4346 // Compare 4-byte vectors 4347 andl(len, 0xfffffffc); // vector count (in bytes) 4348 jccb(Assembler::zero, COMPARE_CHAR); 4349 4350 lea(ary1, Address(ary1, len, Address::times_1)); 4351 negptr(len); 4352 4353 bind(COMPARE_VECTORS); 4354 movl(tmp1, Address(ary1, len, Address::times_1)); 4355 andl(tmp1, 0x80808080); 4356 jccb(Assembler::notZero, TAIL_ADJUST); 4357 addptr(len, 4); 4358 jccb(Assembler::notZero, COMPARE_VECTORS); 4359 4360 // Compare trailing char (final 2-3 bytes), if any 4361 bind(COMPARE_CHAR); 4362 4363 testl(result, 0x2); // tail char 4364 jccb(Assembler::zero, COMPARE_BYTE); 4365 load_unsigned_short(tmp1, Address(ary1, 0)); 4366 andl(tmp1, 0x00008080); 4367 jccb(Assembler::notZero, CHAR_ADJUST); 4368 lea(ary1, Address(ary1, 2)); 4369 4370 bind(COMPARE_BYTE); 4371 testl(result, 0x1); // tail byte 4372 jccb(Assembler::zero, DONE); 4373 load_unsigned_byte(tmp1, Address(ary1, 0)); 4374 testl(tmp1, 0x00000080); 4375 jccb(Assembler::zero, DONE); 4376 subptr(result, 1); 4377 jmpb(DONE); 4378 4379 bind(TAIL_ADJUST); 4380 // there are negative bits in the last 4 byte block. 4381 // Adjust result and check the next three bytes 4382 addptr(result, len); 4383 orl(result, 3); 4384 lea(ary1, Address(ary1, len, Address::times_1)); 4385 jmpb(COMPARE_CHAR); 4386 4387 bind(CHAR_ADJUST); 4388 // We are looking at a char + optional byte tail, and found that one 4389 // of the bytes in the char is negative. Adjust the result, check the 4390 // first byte and readjust if needed. 4391 andl(result, 0xfffffffc); 4392 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4393 jccb(Assembler::notZero, DONE); 4394 addptr(result, 1); 4395 4396 // That's it 4397 bind(DONE); 4398 if (UseAVX >= 2) { 4399 // clean upper bits of YMM registers 4400 vpxor(vec1, vec1); 4401 vpxor(vec2, vec2); 4402 } 4403 } 4404 4405 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4406 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4407 Register limit, Register result, Register chr, 4408 XMMRegister vec1, XMMRegister vec2, bool is_char, 4409 KRegister mask, bool expand_ary2) { 4410 // for expand_ary2, limit is the (smaller) size of the second array. 4411 ShortBranchVerifier sbv(this); 4412 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4413 4414 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4415 "Expansion only implemented for AVX2"); 4416 4417 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4418 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4419 4420 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4421 int scaleIncr = expand_ary2 ? 8 : 16; 4422 4423 if (is_array_equ) { 4424 // Check the input args 4425 cmpoop(ary1, ary2); 4426 jcc(Assembler::equal, TRUE_LABEL); 4427 4428 // Need additional checks for arrays_equals. 4429 testptr(ary1, ary1); 4430 jcc(Assembler::zero, FALSE_LABEL); 4431 testptr(ary2, ary2); 4432 jcc(Assembler::zero, FALSE_LABEL); 4433 4434 // Check the lengths 4435 movl(limit, Address(ary1, length_offset)); 4436 cmpl(limit, Address(ary2, length_offset)); 4437 jcc(Assembler::notEqual, FALSE_LABEL); 4438 } 4439 4440 // count == 0 4441 testl(limit, limit); 4442 jcc(Assembler::zero, TRUE_LABEL); 4443 4444 if (is_array_equ) { 4445 // Load array address 4446 lea(ary1, Address(ary1, base_offset)); 4447 lea(ary2, Address(ary2, base_offset)); 4448 } 4449 4450 if (is_array_equ && is_char) { 4451 // arrays_equals when used for char[]. 4452 shll(limit, 1); // byte count != 0 4453 } 4454 movl(result, limit); // copy 4455 4456 if (UseAVX >= 2) { 4457 // With AVX2, use 32-byte vector compare 4458 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4459 4460 // Compare 32-byte vectors 4461 if (expand_ary2) { 4462 andl(result, 0x0000000f); // tail count (in bytes) 4463 andl(limit, 0xfffffff0); // vector count (in bytes) 4464 jcc(Assembler::zero, COMPARE_TAIL); 4465 } else { 4466 andl(result, 0x0000001f); // tail count (in bytes) 4467 andl(limit, 0xffffffe0); // vector count (in bytes) 4468 jcc(Assembler::zero, COMPARE_TAIL_16); 4469 } 4470 4471 lea(ary1, Address(ary1, limit, scaleFactor)); 4472 lea(ary2, Address(ary2, limit, Address::times_1)); 4473 negptr(limit); 4474 4475 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4476 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4477 4478 cmpl(limit, -64); 4479 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4480 4481 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4482 4483 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4484 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4485 kortestql(mask, mask); 4486 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4487 addptr(limit, 64); // update since we already compared at this addr 4488 cmpl(limit, -64); 4489 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4490 4491 // At this point we may still need to compare -limit+result bytes. 4492 // We could execute the next two instruction and just continue via non-wide path: 4493 // cmpl(limit, 0); 4494 // jcc(Assembler::equal, COMPARE_TAIL); // true 4495 // But since we stopped at the points ary{1,2}+limit which are 4496 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4497 // (|limit| <= 32 and result < 32), 4498 // we may just compare the last 64 bytes. 
4499 // 4500 addptr(result, -64); // it is safe, bc we just came from this area 4501 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4502 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4503 kortestql(mask, mask); 4504 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4505 4506 jmp(TRUE_LABEL); 4507 4508 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4509 4510 }//if (VM_Version::supports_avx512vlbw()) 4511 4512 bind(COMPARE_WIDE_VECTORS); 4513 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4514 if (expand_ary2) { 4515 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4516 } else { 4517 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4518 } 4519 vpxor(vec1, vec2); 4520 4521 vptest(vec1, vec1); 4522 jcc(Assembler::notZero, FALSE_LABEL); 4523 addptr(limit, scaleIncr * 2); 4524 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4525 4526 testl(result, result); 4527 jcc(Assembler::zero, TRUE_LABEL); 4528 4529 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4530 if (expand_ary2) { 4531 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4532 } else { 4533 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4534 } 4535 vpxor(vec1, vec2); 4536 4537 vptest(vec1, vec1); 4538 jcc(Assembler::notZero, FALSE_LABEL); 4539 jmp(TRUE_LABEL); 4540 4541 bind(COMPARE_TAIL_16); // limit is zero 4542 movl(limit, result); 4543 4544 // Compare 16-byte chunks 4545 andl(result, 0x0000000f); // tail count (in bytes) 4546 andl(limit, 0xfffffff0); // vector count (in bytes) 4547 jcc(Assembler::zero, COMPARE_TAIL); 4548 4549 lea(ary1, Address(ary1, limit, scaleFactor)); 4550 lea(ary2, Address(ary2, limit, Address::times_1)); 4551 negptr(limit); 4552 4553 bind(COMPARE_WIDE_VECTORS_16); 4554 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4555 if (expand_ary2) { 4556 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4557 } else { 4558 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4559 } 4560 pxor(vec1, vec2); 4561 4562 ptest(vec1, vec1); 4563 jcc(Assembler::notZero, FALSE_LABEL); 4564 addptr(limit, scaleIncr); 4565 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4566 4567 bind(COMPARE_TAIL); // limit is zero 4568 movl(limit, result); 4569 // Fallthru to tail compare 4570 } else if (UseSSE42Intrinsics) { 4571 // With SSE4.2, use double quad vector compare 4572 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4573 4574 // Compare 16-byte vectors 4575 andl(result, 0x0000000f); // tail count (in bytes) 4576 andl(limit, 0xfffffff0); // vector count (in bytes) 4577 jcc(Assembler::zero, COMPARE_TAIL); 4578 4579 lea(ary1, Address(ary1, limit, Address::times_1)); 4580 lea(ary2, Address(ary2, limit, Address::times_1)); 4581 negptr(limit); 4582 4583 bind(COMPARE_WIDE_VECTORS); 4584 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4585 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4586 pxor(vec1, vec2); 4587 4588 ptest(vec1, vec1); 4589 jcc(Assembler::notZero, FALSE_LABEL); 4590 addptr(limit, 16); 4591 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4592 4593 testl(result, result); 4594 jcc(Assembler::zero, TRUE_LABEL); 4595 4596 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4597 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4598 pxor(vec1, vec2); 4599 4600 ptest(vec1, vec1); 4601 jccb(Assembler::notZero, FALSE_LABEL); 4602 jmpb(TRUE_LABEL); 4603 4604 bind(COMPARE_TAIL); // limit is zero 4605 movl(limit, 
result); 4606 // Fallthru to tail compare 4607 } 4608 4609 // Compare 4-byte vectors 4610 if (expand_ary2) { 4611 testl(result, result); 4612 jccb(Assembler::zero, TRUE_LABEL); 4613 } else { 4614 andl(limit, 0xfffffffc); // vector count (in bytes) 4615 jccb(Assembler::zero, COMPARE_CHAR); 4616 } 4617 4618 lea(ary1, Address(ary1, limit, scaleFactor)); 4619 lea(ary2, Address(ary2, limit, Address::times_1)); 4620 negptr(limit); 4621 4622 bind(COMPARE_VECTORS); 4623 if (expand_ary2) { 4624 // There are no "vector" operations for bytes to shorts 4625 movzbl(chr, Address(ary2, limit, Address::times_1)); 4626 cmpw(Address(ary1, limit, Address::times_2), chr); 4627 jccb(Assembler::notEqual, FALSE_LABEL); 4628 addptr(limit, 1); 4629 jcc(Assembler::notZero, COMPARE_VECTORS); 4630 jmp(TRUE_LABEL); 4631 } else { 4632 movl(chr, Address(ary1, limit, Address::times_1)); 4633 cmpl(chr, Address(ary2, limit, Address::times_1)); 4634 jccb(Assembler::notEqual, FALSE_LABEL); 4635 addptr(limit, 4); 4636 jcc(Assembler::notZero, COMPARE_VECTORS); 4637 } 4638 4639 // Compare trailing char (final 2 bytes), if any 4640 bind(COMPARE_CHAR); 4641 testl(result, 0x2); // tail char 4642 jccb(Assembler::zero, COMPARE_BYTE); 4643 load_unsigned_short(chr, Address(ary1, 0)); 4644 load_unsigned_short(limit, Address(ary2, 0)); 4645 cmpl(chr, limit); 4646 jccb(Assembler::notEqual, FALSE_LABEL); 4647 4648 if (is_array_equ && is_char) { 4649 bind(COMPARE_BYTE); 4650 } else { 4651 lea(ary1, Address(ary1, 2)); 4652 lea(ary2, Address(ary2, 2)); 4653 4654 bind(COMPARE_BYTE); 4655 testl(result, 0x1); // tail byte 4656 jccb(Assembler::zero, TRUE_LABEL); 4657 load_unsigned_byte(chr, Address(ary1, 0)); 4658 load_unsigned_byte(limit, Address(ary2, 0)); 4659 cmpl(chr, limit); 4660 jccb(Assembler::notEqual, FALSE_LABEL); 4661 } 4662 bind(TRUE_LABEL); 4663 movl(result, 1); // return true 4664 jmpb(DONE); 4665 4666 bind(FALSE_LABEL); 4667 xorl(result, result); // return false 4668 4669 // That's it 4670 bind(DONE); 4671 if (UseAVX >= 2) { 4672 // clean upper bits of YMM registers 4673 vpxor(vec1, vec1); 4674 vpxor(vec2, vec2); 4675 } 4676 } 4677 4678 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4679 #define __ masm. 
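  // This out-of-line path is entered when the truncating conversion in
  // convertF2I() produced the "integer indefinite" pattern (0x80000000, or
  // 0x8000000000000000 for the long case), which cvttss2si/cvttsd2si return
  // for NaN, for out-of-range inputs, and for the one legitimate input that
  // maps to MIN_VALUE. The fixup stubs recompute the answer with Java
  // semantics; a rough scalar model, for illustration only (not valid C++ for
  // out-of-range inputs):
  //
  //   jint java_f2i(jfloat f) {
  //     jint v = (jint)f;                              // cvttss2si
  //     if (v != (jint)0x80000000) return v;           // fast path
  //     if (f != f)                return 0;           // NaN -> 0
  //     if (f >= 2147483648.0f)    return 0x7fffffff;  // clamp to Integer.MAX_VALUE
  //     return (jint)0x80000000;                       // MIN_VALUE / negative overflow
  //   }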
4680 Register dst = stub.data<0>(); 4681 XMMRegister src = stub.data<1>(); 4682 address target = stub.data<2>(); 4683 __ bind(stub.entry()); 4684 __ subptr(rsp, 8); 4685 __ movdbl(Address(rsp), src); 4686 __ call(RuntimeAddress(target)); 4687 __ pop(dst); 4688 __ jmp(stub.continuation()); 4689 #undef __ 4690 } 4691 4692 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4693 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4694 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4695 4696 address slowpath_target; 4697 if (dst_bt == T_INT) { 4698 if (src_bt == T_FLOAT) { 4699 cvttss2sil(dst, src); 4700 cmpl(dst, 0x80000000); 4701 slowpath_target = StubRoutines::x86::f2i_fixup(); 4702 } else { 4703 cvttsd2sil(dst, src); 4704 cmpl(dst, 0x80000000); 4705 slowpath_target = StubRoutines::x86::d2i_fixup(); 4706 } 4707 } else { 4708 if (src_bt == T_FLOAT) { 4709 cvttss2siq(dst, src); 4710 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4711 slowpath_target = StubRoutines::x86::f2l_fixup(); 4712 } else { 4713 cvttsd2siq(dst, src); 4714 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4715 slowpath_target = StubRoutines::x86::d2l_fixup(); 4716 } 4717 } 4718 4719 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4720 jcc(Assembler::equal, stub->entry()); 4721 bind(stub->continuation()); 4722 } 4723 4724 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4725 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4726 switch(ideal_opc) { 4727 case Op_LShiftVS: 4728 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4729 case Op_LShiftVI: 4730 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4731 case Op_LShiftVL: 4732 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4733 case Op_RShiftVS: 4734 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4735 case Op_RShiftVI: 4736 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4737 case Op_RShiftVL: 4738 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4739 case Op_URShiftVS: 4740 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4741 case Op_URShiftVI: 4742 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4743 case Op_URShiftVL: 4744 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4745 case Op_RotateRightV: 4746 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4747 case Op_RotateLeftV: 4748 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4749 default: 4750 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4751 break; 4752 } 4753 } 4754 4755 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4756 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4757 if (is_unsigned) { 4758 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4759 } else { 4760 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4761 } 4762 } 4763 4764 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4765 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4766 switch (elem_bt) { 4767 case T_BYTE: 4768 if (ideal_opc == Op_SaturatingAddV) { 
4769 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4770 } else { 4771 assert(ideal_opc == Op_SaturatingSubV, ""); 4772 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4773 } 4774 break; 4775 case T_SHORT: 4776 if (ideal_opc == Op_SaturatingAddV) { 4777 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4778 } else { 4779 assert(ideal_opc == Op_SaturatingSubV, ""); 4780 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4781 } 4782 break; 4783 default: 4784 fatal("Unsupported type %s", type2name(elem_bt)); 4785 break; 4786 } 4787 } 4788 4789 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4790 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4791 switch (elem_bt) { 4792 case T_BYTE: 4793 if (ideal_opc == Op_SaturatingAddV) { 4794 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4795 } else { 4796 assert(ideal_opc == Op_SaturatingSubV, ""); 4797 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4798 } 4799 break; 4800 case T_SHORT: 4801 if (ideal_opc == Op_SaturatingAddV) { 4802 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4803 } else { 4804 assert(ideal_opc == Op_SaturatingSubV, ""); 4805 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4806 } 4807 break; 4808 default: 4809 fatal("Unsupported type %s", type2name(elem_bt)); 4810 break; 4811 } 4812 } 4813 4814 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4815 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4816 if (is_unsigned) { 4817 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4818 } else { 4819 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4820 } 4821 } 4822 4823 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4824 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4825 switch (elem_bt) { 4826 case T_BYTE: 4827 if (ideal_opc == Op_SaturatingAddV) { 4828 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4829 } else { 4830 assert(ideal_opc == Op_SaturatingSubV, ""); 4831 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4832 } 4833 break; 4834 case T_SHORT: 4835 if (ideal_opc == Op_SaturatingAddV) { 4836 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4837 } else { 4838 assert(ideal_opc == Op_SaturatingSubV, ""); 4839 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4840 } 4841 break; 4842 default: 4843 fatal("Unsupported type %s", type2name(elem_bt)); 4844 break; 4845 } 4846 } 4847 4848 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4849 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4850 switch (elem_bt) { 4851 case T_BYTE: 4852 if (ideal_opc == Op_SaturatingAddV) { 4853 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4854 } else { 4855 assert(ideal_opc == Op_SaturatingSubV, ""); 4856 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4857 } 4858 break; 4859 case T_SHORT: 4860 if (ideal_opc == Op_SaturatingAddV) { 4861 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4862 } else { 4863 assert(ideal_opc == Op_SaturatingSubV, ""); 4864 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4865 } 4866 break; 4867 default: 4868 fatal("Unsupported type %s", type2name(elem_bt)); 4869 break; 4870 } 4871 } 4872 4873 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, 
KRegister mask, XMMRegister dst, 4874 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4875 bool is_varshift) { 4876 switch (ideal_opc) { 4877 case Op_AddVB: 4878 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_AddVS: 4880 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4881 case Op_AddVI: 4882 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_AddVL: 4884 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4885 case Op_AddVF: 4886 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4887 case Op_AddVD: 4888 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4889 case Op_SubVB: 4890 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4891 case Op_SubVS: 4892 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4893 case Op_SubVI: 4894 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4895 case Op_SubVL: 4896 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4897 case Op_SubVF: 4898 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4899 case Op_SubVD: 4900 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4901 case Op_MulVS: 4902 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4903 case Op_MulVI: 4904 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4905 case Op_MulVL: 4906 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4907 case Op_MulVF: 4908 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4909 case Op_MulVD: 4910 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4911 case Op_DivVF: 4912 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4913 case Op_DivVD: 4914 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4915 case Op_SqrtVF: 4916 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4917 case Op_SqrtVD: 4918 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4919 case Op_AbsVB: 4920 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4921 case Op_AbsVS: 4922 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4923 case Op_AbsVI: 4924 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4925 case Op_AbsVL: 4926 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4927 case Op_FmaVF: 4928 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4929 case Op_FmaVD: 4930 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4931 case Op_VectorRearrange: 4932 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4933 case Op_LShiftVS: 4934 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4935 case Op_LShiftVI: 4936 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4937 case Op_LShiftVL: 4938 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4939 case Op_RShiftVS: 4940 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4941 case Op_RShiftVI: 4942 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4943 case Op_RShiftVL: 4944 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4945 case Op_URShiftVS: 4946 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4947 case Op_URShiftVI: 4948 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4949 case Op_URShiftVL: 4950 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4951 case Op_RotateLeftV: 4952 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4953 case Op_RotateRightV: 4954 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4955 case Op_MaxV: 4956 evpmaxs(eType, dst, mask, src1, src2, 
merge, vlen_enc); break; 4957 case Op_MinV: 4958 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4959 case Op_UMinV: 4960 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4961 case Op_UMaxV: 4962 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4963 case Op_XorV: 4964 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4965 case Op_OrV: 4966 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4967 case Op_AndV: 4968 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4969 default: 4970 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4971 break; 4972 } 4973 } 4974 4975 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4976 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4977 switch (ideal_opc) { 4978 case Op_AddVB: 4979 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4980 case Op_AddVS: 4981 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4982 case Op_AddVI: 4983 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4984 case Op_AddVL: 4985 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4986 case Op_AddVF: 4987 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4988 case Op_AddVD: 4989 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4990 case Op_SubVB: 4991 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4992 case Op_SubVS: 4993 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4994 case Op_SubVI: 4995 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4996 case Op_SubVL: 4997 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4998 case Op_SubVF: 4999 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 5000 case Op_SubVD: 5001 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 5002 case Op_MulVS: 5003 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 5004 case Op_MulVI: 5005 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 5006 case Op_MulVL: 5007 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 5008 case Op_MulVF: 5009 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 5010 case Op_MulVD: 5011 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 5012 case Op_DivVF: 5013 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 5014 case Op_DivVD: 5015 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 5016 case Op_FmaVF: 5017 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 5018 case Op_FmaVD: 5019 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 5020 case Op_MaxV: 5021 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5022 case Op_MinV: 5023 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5024 case Op_UMaxV: 5025 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5026 case Op_UMinV: 5027 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5028 case Op_XorV: 5029 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5030 case Op_OrV: 5031 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5032 case Op_AndV: 5033 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5034 default: 5035 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5036 break; 5037 } 5038 } 5039 5040 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5041 KRegister src1, KRegister src2) { 5042 BasicType etype = T_ILLEGAL; 5043 switch(mask_len) { 5044 case 2: 5045 case 4: 5046 case 8: etype = T_BYTE; break; 5047 case 16: etype = T_SHORT; 
break; 5048 case 32: etype = T_INT; break; 5049 case 64: etype = T_LONG; break; 5050 default: fatal("Unsupported type"); break; 5051 } 5052 assert(etype != T_ILLEGAL, ""); 5053 switch(ideal_opc) { 5054 case Op_AndVMask: 5055 kand(etype, dst, src1, src2); break; 5056 case Op_OrVMask: 5057 kor(etype, dst, src1, src2); break; 5058 case Op_XorVMask: 5059 kxor(etype, dst, src1, src2); break; 5060 default: 5061 fatal("Unsupported masked operation"); break; 5062 } 5063 } 5064 5065 /* 5066 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5067 * If src is NaN, the result is 0. 5068 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5069 * the result is equal to the value of Integer.MIN_VALUE. 5070 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5071 * the result is equal to the value of Integer.MAX_VALUE. 5072 */ 5073 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5074 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5075 Register rscratch, AddressLiteral float_sign_flip, 5076 int vec_enc) { 5077 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5078 Label done; 5079 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5080 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5081 vptest(xtmp2, xtmp2, vec_enc); 5082 jccb(Assembler::equal, done); 5083 5084 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5085 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5086 5087 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5088 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5089 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5090 5091 // Recompute the mask for remaining special value. 5092 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5093 // Extract SRC values corresponding to TRUE mask lanes. 5094 vpand(xtmp4, xtmp2, src, vec_enc); 5095 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5096 // values are set. 
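// (Illustration, assuming float_sign_flip broadcasts 0x80000000: xtmp1 was flipped
// above to 0x7FFFFFFF, i.e. Integer.MAX_VALUE, and after the xor below a lane's MSB
// is set only when its source was a non-negative special value. vblendvps selects
// on that sign bit, so exactly those lanes are overwritten with MAX_VALUE.)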
5097 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5098 5099 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5100 bind(done); 5101 } 5102 5103 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5104 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5105 Register rscratch, AddressLiteral float_sign_flip, 5106 int vec_enc) { 5107 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5108 Label done; 5109 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5110 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5111 kortestwl(ktmp1, ktmp1); 5112 jccb(Assembler::equal, done); 5113 5114 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5115 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5116 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5117 5118 kxorwl(ktmp1, ktmp1, ktmp2); 5119 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5120 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5121 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5122 bind(done); 5123 } 5124 5125 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5126 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5127 Register rscratch, AddressLiteral double_sign_flip, 5128 int vec_enc) { 5129 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5130 5131 Label done; 5132 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5133 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5134 kortestwl(ktmp1, ktmp1); 5135 jccb(Assembler::equal, done); 5136 5137 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5138 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5139 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5140 5141 kxorwl(ktmp1, ktmp1, ktmp2); 5142 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5143 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5144 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5145 bind(done); 5146 } 5147 5148 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5149 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5150 Register rscratch, AddressLiteral float_sign_flip, 5151 int vec_enc) { 5152 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5153 Label done; 5154 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5155 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5156 kortestwl(ktmp1, ktmp1); 5157 jccb(Assembler::equal, done); 5158 5159 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5160 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5161 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5162 5163 kxorwl(ktmp1, ktmp1, ktmp2); 5164 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5165 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5166 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5167 bind(done); 5168 } 5169 5170 /* 5171 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5172 * If src is NaN, the result is 0. 5173 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5174 * the result is equal to the value of Long.MIN_VALUE. 5175 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5176 * the result is equal to the value of Long.MAX_VALUE. 
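 *
 * A scalar sketch of the mapping described above (illustration only, not the
 * emitted vector code):
 *
 *   jlong d2l(double src) {
 *     if (src != src)        return 0;          // NaN
 *     if (src <= min_jlong)  return min_jlong;  // -Inf or <= Long.MIN_VALUE
 *     if (src >= max_jlong)  return max_jlong;  // +Inf or >= Long.MAX_VALUE
 *     return (jlong)src;                        // ordinary in-range value
 *   }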
5177 */ 5178 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5179 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5180 Register rscratch, AddressLiteral double_sign_flip, 5181 int vec_enc) { 5182 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5183 5184 Label done; 5185 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5186 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5187 kortestwl(ktmp1, ktmp1); 5188 jccb(Assembler::equal, done); 5189 5190 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5191 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5192 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5193 5194 kxorwl(ktmp1, ktmp1, ktmp2); 5195 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5196 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5197 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5198 bind(done); 5199 } 5200 5201 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5202 XMMRegister xtmp, int index, int vec_enc) { 5203 assert(vec_enc < Assembler::AVX_512bit, ""); 5204 if (vec_enc == Assembler::AVX_256bit) { 5205 vextractf128_high(xtmp, src); 5206 vshufps(dst, src, xtmp, index, vec_enc); 5207 } else { 5208 vshufps(dst, src, zero, index, vec_enc); 5209 } 5210 } 5211 5212 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5213 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5214 AddressLiteral float_sign_flip, int src_vec_enc) { 5215 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5216 5217 Label done; 5218 // Compare the destination lanes with float_sign_flip 5219 // value to get mask for all special values. 5220 movdqu(xtmp1, float_sign_flip, rscratch); 5221 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5222 ptest(xtmp2, xtmp2); 5223 jccb(Assembler::equal, done); 5224 5225 // Flip float_sign_flip to get max integer value. 5226 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5227 pxor(xtmp1, xtmp4); 5228 5229 // Set detination lanes corresponding to unordered source lanes as zero. 5230 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5231 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5232 5233 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5234 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5235 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5236 5237 // Recompute the mask for remaining special value. 5238 pxor(xtmp2, xtmp3); 5239 // Extract mask corresponding to non-negative source lanes. 5240 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5241 5242 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5243 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5244 pand(xtmp3, xtmp2); 5245 5246 // Replace destination lanes holding special value(0x80000000) with max int 5247 // if corresponding source lane holds a +ve value. 
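// (xtmp1 still holds the flipped sign mask, i.e. 0x7FFFFFFF == Integer.MAX_VALUE per
// lane, and vblendvps below keys off the MSB of each xtmp3 lane, so only the lanes
// flagged as +ve special values are rewritten.)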
5248 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5249 bind(done); 5250 } 5251 5252 5253 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5254 XMMRegister xtmp, Register rscratch, int vec_enc) { 5255 switch(to_elem_bt) { 5256 case T_SHORT: 5257 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5258 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5259 vpackusdw(dst, dst, zero, vec_enc); 5260 if (vec_enc == Assembler::AVX_256bit) { 5261 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5262 } 5263 break; 5264 case T_BYTE: 5265 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5266 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5267 vpackusdw(dst, dst, zero, vec_enc); 5268 if (vec_enc == Assembler::AVX_256bit) { 5269 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5270 } 5271 vpackuswb(dst, dst, zero, vec_enc); 5272 break; 5273 default: assert(false, "%s", type2name(to_elem_bt)); 5274 } 5275 } 5276 5277 /* 5278 * Algorithm for vector D2L and F2I conversions:- 5279 * a) Perform vector D2L/F2I cast. 5280 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5281 * It signifies that source value could be any of the special floating point 5282 * values(NaN,-Inf,Inf,Max,-Min). 5283 * c) Set destination to zero if source is NaN value. 5284 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5285 */ 5286 5287 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5288 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5289 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5290 int to_elem_sz = type2aelembytes(to_elem_bt); 5291 assert(to_elem_sz <= 4, ""); 5292 vcvttps2dq(dst, src, vec_enc); 5293 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5294 if (to_elem_sz < 4) { 5295 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5296 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5297 } 5298 } 5299 5300 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5301 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5302 Register rscratch, int vec_enc) { 5303 int to_elem_sz = type2aelembytes(to_elem_bt); 5304 assert(to_elem_sz <= 4, ""); 5305 vcvttps2dq(dst, src, vec_enc); 5306 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5307 switch(to_elem_bt) { 5308 case T_INT: 5309 break; 5310 case T_SHORT: 5311 evpmovdw(dst, dst, vec_enc); 5312 break; 5313 case T_BYTE: 5314 evpmovdb(dst, dst, vec_enc); 5315 break; 5316 default: assert(false, "%s", type2name(to_elem_bt)); 5317 } 5318 } 5319 5320 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5321 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5322 Register rscratch, int vec_enc) { 5323 evcvttps2qq(dst, src, vec_enc); 5324 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5325 } 5326 5327 // Handling for downcasting from double to integer or sub-word types on AVX2. 5328 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5329 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5330 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5331 int to_elem_sz = type2aelembytes(to_elem_bt); 5332 assert(to_elem_sz < 8, ""); 5333 vcvttpd2dq(dst, src, vec_enc); 5334 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5335 float_sign_flip, vec_enc); 5336 if (to_elem_sz < 4) { 5337 // xtmp4 holds all zero lanes. 5338 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5339 } 5340 } 5341 5342 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5343 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5344 KRegister ktmp2, AddressLiteral sign_flip, 5345 Register rscratch, int vec_enc) { 5346 if (VM_Version::supports_avx512dq()) { 5347 evcvttpd2qq(dst, src, vec_enc); 5348 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5349 switch(to_elem_bt) { 5350 case T_LONG: 5351 break; 5352 case T_INT: 5353 evpmovsqd(dst, dst, vec_enc); 5354 break; 5355 case T_SHORT: 5356 evpmovsqd(dst, dst, vec_enc); 5357 evpmovdw(dst, dst, vec_enc); 5358 break; 5359 case T_BYTE: 5360 evpmovsqd(dst, dst, vec_enc); 5361 evpmovdb(dst, dst, vec_enc); 5362 break; 5363 default: assert(false, "%s", type2name(to_elem_bt)); 5364 } 5365 } else { 5366 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5367 vcvttpd2dq(dst, src, vec_enc); 5368 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5369 switch(to_elem_bt) { 5370 case T_INT: 5371 break; 5372 case T_SHORT: 5373 evpmovdw(dst, dst, vec_enc); 5374 break; 5375 case T_BYTE: 5376 evpmovdb(dst, dst, vec_enc); 5377 break; 5378 default: assert(false, "%s", type2name(to_elem_bt)); 5379 } 5380 } 5381 } 5382 5383 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5384 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5385 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5386 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5387 // and re-instantiate original MXCSR.RC mode after that. 5388 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5389 5390 mov64(tmp, julong_cast(0.5L)); 5391 evpbroadcastq(xtmp1, tmp, vec_enc); 5392 vaddpd(xtmp1, src , xtmp1, vec_enc); 5393 evcvtpd2qq(dst, xtmp1, vec_enc); 5394 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5395 double_sign_flip, vec_enc);; 5396 5397 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5398 } 5399 5400 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5401 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5402 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5403 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5404 // and re-instantiate original MXCSR.RC mode after that. 
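// (Illustration of the trick: with RC forced to round toward negative infinity,
// the conversion of val + 0.5 behaves like floor(val + 0.5), i.e. round-half-up,
// e.g. 2.3 -> 2, 2.5 -> 3, -2.5 -> -2, -2.7 -> -3, matching Math.round semantics.)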
5405 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5406 5407 movl(tmp, jint_cast(0.5)); 5408 movq(xtmp1, tmp); 5409 vbroadcastss(xtmp1, xtmp1, vec_enc); 5410 vaddps(xtmp1, src , xtmp1, vec_enc); 5411 vcvtps2dq(dst, xtmp1, vec_enc); 5412 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5413 float_sign_flip, vec_enc); 5414 5415 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5416 } 5417 5418 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5419 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5420 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5421 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5422 // and re-instantiate original MXCSR.RC mode after that. 5423 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5424 5425 movl(tmp, jint_cast(0.5)); 5426 movq(xtmp1, tmp); 5427 vbroadcastss(xtmp1, xtmp1, vec_enc); 5428 vaddps(xtmp1, src , xtmp1, vec_enc); 5429 vcvtps2dq(dst, xtmp1, vec_enc); 5430 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5431 5432 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5433 } 5434 5435 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5436 BasicType from_elem_bt, BasicType to_elem_bt) { 5437 switch (from_elem_bt) { 5438 case T_BYTE: 5439 switch (to_elem_bt) { 5440 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5441 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5442 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5443 default: ShouldNotReachHere(); 5444 } 5445 break; 5446 case T_SHORT: 5447 switch (to_elem_bt) { 5448 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5449 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5450 default: ShouldNotReachHere(); 5451 } 5452 break; 5453 case T_INT: 5454 assert(to_elem_bt == T_LONG, ""); 5455 vpmovzxdq(dst, src, vlen_enc); 5456 break; 5457 default: 5458 ShouldNotReachHere(); 5459 } 5460 } 5461 5462 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5463 BasicType from_elem_bt, BasicType to_elem_bt) { 5464 switch (from_elem_bt) { 5465 case T_BYTE: 5466 switch (to_elem_bt) { 5467 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5468 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5469 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5470 default: ShouldNotReachHere(); 5471 } 5472 break; 5473 case T_SHORT: 5474 switch (to_elem_bt) { 5475 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5476 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5477 default: ShouldNotReachHere(); 5478 } 5479 break; 5480 case T_INT: 5481 assert(to_elem_bt == T_LONG, ""); 5482 vpmovsxdq(dst, src, vlen_enc); 5483 break; 5484 default: 5485 ShouldNotReachHere(); 5486 } 5487 } 5488 5489 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5490 BasicType dst_bt, BasicType src_bt, int vlen) { 5491 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5492 assert(vlen_enc != AVX_512bit, ""); 5493 5494 int dst_bt_size = type2aelembytes(dst_bt); 5495 int src_bt_size = type2aelembytes(src_bt); 5496 if (dst_bt_size > src_bt_size) { 5497 switch (dst_bt_size / src_bt_size) { 5498 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5499 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5500 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5501 default: ShouldNotReachHere(); 5502 } 5503 } else { 5504 assert(dst_bt_size < src_bt_size, ""); 5505 switch (src_bt_size / dst_bt_size) { 5506 case 2: { 5507 if (vlen_enc == AVX_128bit) { 5508 vpacksswb(dst, src, src, vlen_enc); 5509 } else { 5510 vpacksswb(dst, src, src, vlen_enc); 5511 vpermq(dst, dst, 0x08, vlen_enc); 5512 } 5513 break; 5514 } 5515 case 4: { 5516 if (vlen_enc == AVX_128bit) { 5517 vpackssdw(dst, src, src, vlen_enc); 5518 vpacksswb(dst, dst, dst, vlen_enc); 5519 } else { 5520 vpackssdw(dst, src, src, vlen_enc); 5521 vpermq(dst, dst, 0x08, vlen_enc); 5522 vpacksswb(dst, dst, dst, AVX_128bit); 5523 } 5524 break; 5525 } 5526 case 8: { 5527 if (vlen_enc == AVX_128bit) { 5528 vpshufd(dst, src, 0x08, vlen_enc); 5529 vpackssdw(dst, dst, dst, vlen_enc); 5530 vpacksswb(dst, dst, dst, vlen_enc); 5531 } else { 5532 vpshufd(dst, src, 0x08, vlen_enc); 5533 vpermq(dst, dst, 0x08, vlen_enc); 5534 vpackssdw(dst, dst, dst, AVX_128bit); 5535 vpacksswb(dst, dst, dst, AVX_128bit); 5536 } 5537 break; 5538 } 5539 default: ShouldNotReachHere(); 5540 } 5541 } 5542 } 5543 5544 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5545 bool merge, BasicType bt, int vlen_enc) { 5546 if (bt == T_INT) { 5547 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5548 } else { 5549 assert(bt == T_LONG, ""); 5550 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5551 } 5552 } 5553 5554 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5555 bool merge, BasicType bt, int vlen_enc) { 5556 if (bt == T_INT) { 5557 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5558 } else { 5559 assert(bt == T_LONG, ""); 5560 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5561 } 5562 } 5563 5564 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5565 Register rtmp2, XMMRegister xtmp, int mask_len, 5566 int vec_enc) { 5567 int index = 0; 5568 int vindex = 0; 5569 mov64(rtmp1, 0x0101010101010101L); 5570 pdepq(rtmp1, src, rtmp1); 5571 if (mask_len > 8) { 5572 movq(rtmp2, src); 5573 vpxor(xtmp, xtmp, xtmp, vec_enc); 5574 movq(xtmp, rtmp1); 5575 } 5576 movq(dst, rtmp1); 5577 5578 mask_len -= 8; 5579 while (mask_len > 0) { 5580 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5581 index++; 5582 if ((index % 2) == 0) { 5583 pxor(xtmp, xtmp); 5584 } 5585 mov64(rtmp1, 0x0101010101010101L); 5586 shrq(rtmp2, 8); 5587 pdepq(rtmp1, rtmp2, rtmp1); 5588 pinsrq(xtmp, rtmp1, index % 2); 5589 vindex = index / 2; 5590 if (vindex) { 5591 // Write entire 16 byte vector when both 64 bit 5592 // lanes are update to save redundant instructions. 
5593 if (index % 2) { 5594 vinsertf128(dst, dst, xtmp, vindex); 5595 } 5596 } else { 5597 vmovdqu(dst, xtmp); 5598 } 5599 mask_len -= 8; 5600 } 5601 } 5602 5603 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5604 switch(opc) { 5605 case Op_VectorMaskTrueCount: 5606 popcntq(dst, tmp); 5607 break; 5608 case Op_VectorMaskLastTrue: 5609 if (VM_Version::supports_lzcnt()) { 5610 lzcntq(tmp, tmp); 5611 movl(dst, 63); 5612 subl(dst, tmp); 5613 } else { 5614 movl(dst, -1); 5615 bsrq(tmp, tmp); 5616 cmov32(Assembler::notZero, dst, tmp); 5617 } 5618 break; 5619 case Op_VectorMaskFirstTrue: 5620 if (VM_Version::supports_bmi1()) { 5621 if (masklen < 32) { 5622 orl(tmp, 1 << masklen); 5623 tzcntl(dst, tmp); 5624 } else if (masklen == 32) { 5625 tzcntl(dst, tmp); 5626 } else { 5627 assert(masklen == 64, ""); 5628 tzcntq(dst, tmp); 5629 } 5630 } else { 5631 if (masklen < 32) { 5632 orl(tmp, 1 << masklen); 5633 bsfl(dst, tmp); 5634 } else { 5635 assert(masklen == 32 || masklen == 64, ""); 5636 movl(dst, masklen); 5637 if (masklen == 32) { 5638 bsfl(tmp, tmp); 5639 } else { 5640 bsfq(tmp, tmp); 5641 } 5642 cmov32(Assembler::notZero, dst, tmp); 5643 } 5644 } 5645 break; 5646 case Op_VectorMaskToLong: 5647 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5648 break; 5649 default: assert(false, "Unhandled mask operation"); 5650 } 5651 } 5652 5653 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5654 int masklen, int masksize, int vec_enc) { 5655 assert(VM_Version::supports_popcnt(), ""); 5656 5657 if(VM_Version::supports_avx512bw()) { 5658 kmovql(tmp, mask); 5659 } else { 5660 assert(masklen <= 16, ""); 5661 kmovwl(tmp, mask); 5662 } 5663 5664 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5665 // operations needs to be clipped. 5666 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5667 andq(tmp, (1 << masklen) - 1); 5668 } 5669 5670 vector_mask_operation_helper(opc, dst, tmp, masklen); 5671 } 5672 5673 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5674 Register tmp, int masklen, BasicType bt, int vec_enc) { 5675 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5676 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5677 assert(VM_Version::supports_popcnt(), ""); 5678 5679 bool need_clip = false; 5680 switch(bt) { 5681 case T_BOOLEAN: 5682 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5683 vpxor(xtmp, xtmp, xtmp, vec_enc); 5684 vpsubb(xtmp, xtmp, mask, vec_enc); 5685 vpmovmskb(tmp, xtmp, vec_enc); 5686 need_clip = masklen < 16; 5687 break; 5688 case T_BYTE: 5689 vpmovmskb(tmp, mask, vec_enc); 5690 need_clip = masklen < 16; 5691 break; 5692 case T_SHORT: 5693 vpacksswb(xtmp, mask, mask, vec_enc); 5694 if (masklen >= 16) { 5695 vpermpd(xtmp, xtmp, 8, vec_enc); 5696 } 5697 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5698 need_clip = masklen < 16; 5699 break; 5700 case T_INT: 5701 case T_FLOAT: 5702 vmovmskps(tmp, mask, vec_enc); 5703 need_clip = masklen < 4; 5704 break; 5705 case T_LONG: 5706 case T_DOUBLE: 5707 vmovmskpd(tmp, mask, vec_enc); 5708 need_clip = masklen < 2; 5709 break; 5710 default: assert(false, "Unhandled type, %s", type2name(bt)); 5711 } 5712 5713 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5714 // operations needs to be clipped. 
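// (Sketch: for masklen == 4 only the low four bits of tmp are meaningful, so the
// clip below reduces to tmp &= (1 << 4) - 1 == 0xF; without it stray upper bits
// would skew the popcount/lzcnt/tzcnt based results computed by the helper.)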
5715 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5716 // need_clip implies masklen < 32 5717 andq(tmp, (1 << masklen) - 1); 5718 } 5719 5720 vector_mask_operation_helper(opc, dst, tmp, masklen); 5721 } 5722 5723 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5724 Register rtmp2, int mask_len) { 5725 kmov(rtmp1, src); 5726 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5727 mov64(rtmp2, -1L); 5728 pextq(rtmp2, rtmp2, rtmp1); 5729 kmov(dst, rtmp2); 5730 } 5731 5732 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5733 XMMRegister mask, Register rtmp, Register rscratch, 5734 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5735 int vec_enc) { 5736 assert(type2aelembytes(bt) >= 4, ""); 5737 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5738 address compress_perm_table = nullptr; 5739 address expand_perm_table = nullptr; 5740 if (type2aelembytes(bt) == 8) { 5741 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5742 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5743 vmovmskpd(rtmp, mask, vec_enc); 5744 } else { 5745 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5746 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5747 vmovmskps(rtmp, mask, vec_enc); 5748 } 5749 shlq(rtmp, 5); // for 32 byte permute row. 5750 if (opcode == Op_CompressV) { 5751 lea(rscratch, ExternalAddress(compress_perm_table)); 5752 } else { 5753 lea(rscratch, ExternalAddress(expand_perm_table)); 5754 } 5755 addptr(rtmp, rscratch); 5756 vmovdqu(permv, Address(rtmp)); 5757 vpermps(dst, permv, src, Assembler::AVX_256bit); 5758 vpxor(xtmp, xtmp, xtmp, vec_enc); 5759 // Blend the result with zero vector using permute mask, each column entry 5760 // in a permute table row contains either a valid permute index or a -1 (default) 5761 // value, this can potentially be used as a blending mask after 5762 // compressing/expanding the source vector lanes. 
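// (Illustration of one table row, assuming 8 dword entries per 32-byte row as set up
// above: for a compress over int/float lanes with mask 0b00000101 the row would be
// [0, 2, -1, -1, -1, -1, -1, -1]; vpermps packs src lanes 0 and 2 to the front, and
// the -1 entries, whose sign bit is set, make the blend below zero the remaining lanes.)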
5763 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5764 } 5765 5766 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5767 bool merge, BasicType bt, int vec_enc) { 5768 if (opcode == Op_CompressV) { 5769 switch(bt) { 5770 case T_BYTE: 5771 evpcompressb(dst, mask, src, merge, vec_enc); 5772 break; 5773 case T_CHAR: 5774 case T_SHORT: 5775 evpcompressw(dst, mask, src, merge, vec_enc); 5776 break; 5777 case T_INT: 5778 evpcompressd(dst, mask, src, merge, vec_enc); 5779 break; 5780 case T_FLOAT: 5781 evcompressps(dst, mask, src, merge, vec_enc); 5782 break; 5783 case T_LONG: 5784 evpcompressq(dst, mask, src, merge, vec_enc); 5785 break; 5786 case T_DOUBLE: 5787 evcompresspd(dst, mask, src, merge, vec_enc); 5788 break; 5789 default: 5790 fatal("Unsupported type %s", type2name(bt)); 5791 break; 5792 } 5793 } else { 5794 assert(opcode == Op_ExpandV, ""); 5795 switch(bt) { 5796 case T_BYTE: 5797 evpexpandb(dst, mask, src, merge, vec_enc); 5798 break; 5799 case T_CHAR: 5800 case T_SHORT: 5801 evpexpandw(dst, mask, src, merge, vec_enc); 5802 break; 5803 case T_INT: 5804 evpexpandd(dst, mask, src, merge, vec_enc); 5805 break; 5806 case T_FLOAT: 5807 evexpandps(dst, mask, src, merge, vec_enc); 5808 break; 5809 case T_LONG: 5810 evpexpandq(dst, mask, src, merge, vec_enc); 5811 break; 5812 case T_DOUBLE: 5813 evexpandpd(dst, mask, src, merge, vec_enc); 5814 break; 5815 default: 5816 fatal("Unsupported type %s", type2name(bt)); 5817 break; 5818 } 5819 } 5820 } 5821 5822 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5823 KRegister ktmp1, int vec_enc) { 5824 if (opcode == Op_SignumVD) { 5825 vsubpd(dst, zero, one, vec_enc); 5826 // if src < 0 ? -1 : 1 5827 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5828 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5829 // if src == NaN, -0.0 or 0.0 return src. 5830 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5831 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5832 } else { 5833 assert(opcode == Op_SignumVF, ""); 5834 vsubps(dst, zero, one, vec_enc); 5835 // if src < 0 ? -1 : 1 5836 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5837 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5838 // if src == NaN, -0.0 or 0.0 return src. 5839 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5840 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5841 } 5842 } 5843 5844 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5845 XMMRegister xtmp1, int vec_enc) { 5846 if (opcode == Op_SignumVD) { 5847 vsubpd(dst, zero, one, vec_enc); 5848 // if src < 0 ? -1 : 1 5849 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5850 // if src == NaN, -0.0 or 0.0 return src. 5851 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5852 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5853 } else { 5854 assert(opcode == Op_SignumVF, ""); 5855 vsubps(dst, zero, one, vec_enc); 5856 // if src < 0 ? -1 : 1 5857 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5858 // if src == NaN, -0.0 or 0.0 return src. 
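// (EQ_UQ is the quiet "equal or unordered" predicate: it is true when src == 0.0,
// which matches both +0.0 and -0.0, and also when the compare is unordered, i.e.
// src is NaN, so exactly those lanes keep the original src value.)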
5859 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5860 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5861 } 5862 } 5863 5864 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5865 if (VM_Version::supports_avx512bw()) { 5866 if (mask_len > 32) { 5867 kmovql(dst, src); 5868 } else { 5869 kmovdl(dst, src); 5870 if (mask_len != 32) { 5871 kshiftrdl(dst, dst, 32 - mask_len); 5872 } 5873 } 5874 } else { 5875 assert(mask_len <= 16, ""); 5876 kmovwl(dst, src); 5877 if (mask_len != 16) { 5878 kshiftrwl(dst, dst, 16 - mask_len); 5879 } 5880 } 5881 } 5882 5883 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5884 int lane_size = type2aelembytes(bt); 5885 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5886 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5887 movptr(rtmp, imm32); 5888 switch(lane_size) { 5889 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5890 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5891 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5892 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5893 fatal("Unsupported lane size %d", lane_size); 5894 break; 5895 } 5896 } else { 5897 movptr(rtmp, imm32); 5898 movq(dst, rtmp); 5899 switch(lane_size) { 5900 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5901 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5902 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5903 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5904 fatal("Unsupported lane size %d", lane_size); 5905 break; 5906 } 5907 } 5908 } 5909 5910 // 5911 // Following is lookup table based popcount computation algorithm:- 5912 // Index Bit set count 5913 // [ 0000 -> 0, 5914 // 0001 -> 1, 5915 // 0010 -> 1, 5916 // 0011 -> 2, 5917 // 0100 -> 1, 5918 // 0101 -> 2, 5919 // 0110 -> 2, 5920 // 0111 -> 3, 5921 // 1000 -> 1, 5922 // 1001 -> 2, 5923 // 1010 -> 3, 5924 // 1011 -> 3, 5925 // 1100 -> 2, 5926 // 1101 -> 3, 5927 // 1111 -> 4 ] 5928 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5929 // shuffle indices for lookup table access. 5930 // b. Right shift each byte of vector lane by 4 positions. 5931 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5932 // shuffle indices for lookup table access. 5933 // d. Add the bitset count of upper and lower 4 bits of each byte. 5934 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5935 // count of all the bytes of a quadword. 5936 // f. Perform step e. for upper 128bit vector lane. 5937 // g. Pack the bitset count of quadwords back to double word. 5938 // h. Unpacking and packing operations are not needed for 64bit vector lane. 
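//
// Scalar sketch of steps a. to d. for a single byte (illustration only; the code
// below does this per lane, using vpshufb to index the 16-entry lookup table):
//
//   int popcount_byte(uint8_t b) {
//     // lut[i] holds the number of set bits in the 4-bit value i
//     return lut[b & 0x0F] + lut[b >> 4];
//   }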
5939 5940 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5941 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5942 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5943 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5944 vpsrlw(dst, src, 4, vec_enc); 5945 vpand(dst, dst, xtmp1, vec_enc); 5946 vpand(xtmp1, src, xtmp1, vec_enc); 5947 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5948 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5949 vpshufb(dst, xtmp2, dst, vec_enc); 5950 vpaddb(dst, dst, xtmp1, vec_enc); 5951 } 5952 5953 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5954 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5955 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5956 // Following code is as per steps e,f,g and h of above algorithm. 5957 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5958 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5959 vpsadbw(dst, dst, xtmp2, vec_enc); 5960 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5961 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5962 vpackuswb(dst, xtmp1, dst, vec_enc); 5963 } 5964 5965 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5966 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5967 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5968 // Add the popcount of upper and lower bytes of word. 5969 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5970 vpsrlw(dst, xtmp1, 8, vec_enc); 5971 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5972 vpaddw(dst, dst, xtmp1, vec_enc); 5973 } 5974 5975 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5976 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5977 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5978 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5979 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5980 } 5981 5982 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5983 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5984 switch(bt) { 5985 case T_LONG: 5986 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5987 break; 5988 case T_INT: 5989 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5990 break; 5991 case T_CHAR: 5992 case T_SHORT: 5993 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5994 break; 5995 case T_BYTE: 5996 case T_BOOLEAN: 5997 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5998 break; 5999 default: 6000 fatal("Unsupported type %s", type2name(bt)); 6001 break; 6002 } 6003 } 6004 6005 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6006 KRegister mask, bool merge, int vec_enc) { 6007 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6008 switch(bt) { 6009 case T_LONG: 6010 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6011 evpopcntq(dst, mask, src, merge, vec_enc); 6012 break; 6013 case T_INT: 6014 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6015 evpopcntd(dst, mask, src, merge, vec_enc); 6016 break; 6017 case T_CHAR: 6018 case T_SHORT: 6019 assert(VM_Version::supports_avx512_bitalg(), ""); 6020 evpopcntw(dst, mask, src, merge, vec_enc); 6021 break; 6022 case T_BYTE: 6023 case T_BOOLEAN: 6024 assert(VM_Version::supports_avx512_bitalg(), ""); 6025 evpopcntb(dst, mask, 
src, merge, vec_enc); 6026 break; 6027 default: 6028 fatal("Unsupported type %s", type2name(bt)); 6029 break; 6030 } 6031 } 6032 6033 // Bit reversal algorithm first reverses the bits of each byte followed by 6034 // a byte level reversal for multi-byte primitive types (short/int/long). 6035 // Algorithm performs a lookup table access to get reverse bit sequence 6036 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6037 // is obtained by swapping the reverse bit sequences of upper and lower 6038 // nibble of a byte. 6039 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6040 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6041 if (VM_Version::supports_avx512vlbw()) { 6042 6043 // Get the reverse bit sequence of lower nibble of each byte. 6044 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6045 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6046 evpandq(dst, xtmp2, src, vec_enc); 6047 vpshufb(dst, xtmp1, dst, vec_enc); 6048 vpsllq(dst, dst, 4, vec_enc); 6049 6050 // Get the reverse bit sequence of upper nibble of each byte. 6051 vpandn(xtmp2, xtmp2, src, vec_enc); 6052 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6053 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6054 6055 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6056 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6057 evporq(xtmp2, dst, xtmp2, vec_enc); 6058 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6059 6060 } else if(vec_enc == Assembler::AVX_512bit) { 6061 // Shift based bit reversal. 6062 assert(bt == T_LONG || bt == T_INT, ""); 6063 6064 // Swap lower and upper nibble of each byte. 6065 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6066 6067 // Swap two least and most significant bits of each nibble. 6068 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6069 6070 // Swap adjacent pair of bits. 6071 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6072 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6073 6074 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6075 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6076 } else { 6077 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6078 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6079 6080 // Get the reverse bit sequence of lower nibble of each byte. 6081 vpand(dst, xtmp2, src, vec_enc); 6082 vpshufb(dst, xtmp1, dst, vec_enc); 6083 vpsllq(dst, dst, 4, vec_enc); 6084 6085 // Get the reverse bit sequence of upper nibble of each byte. 6086 vpandn(xtmp2, xtmp2, src, vec_enc); 6087 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6088 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6089 6090 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6091 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6092 vpor(xtmp2, dst, xtmp2, vec_enc); 6093 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6094 } 6095 } 6096 6097 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6098 XMMRegister xtmp, Register rscratch) { 6099 assert(VM_Version::supports_gfni(), ""); 6100 assert(rscratch != noreg || always_reachable(mask), "missing"); 6101 6102 // Galois field instruction based bit reversal based on following algorithm. 
6103 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6104 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6105 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6106 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6107 } 6108 6109 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6110 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6111 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6112 evpandq(dst, xtmp1, src, vec_enc); 6113 vpsllq(dst, dst, nbits, vec_enc); 6114 vpandn(xtmp1, xtmp1, src, vec_enc); 6115 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6116 evporq(dst, dst, xtmp1, vec_enc); 6117 } 6118 6119 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6120 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6121 // Shift based bit reversal. 6122 assert(VM_Version::supports_evex(), ""); 6123 switch(bt) { 6124 case T_LONG: 6125 // Swap upper and lower double word of each quad word. 6126 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6127 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6128 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6129 break; 6130 case T_INT: 6131 // Swap upper and lower word of each double word. 6132 evprord(xtmp1, k0, src, 16, true, vec_enc); 6133 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6134 break; 6135 case T_CHAR: 6136 case T_SHORT: 6137 // Swap upper and lower byte of each word. 6138 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6139 break; 6140 case T_BYTE: 6141 evmovdquq(dst, k0, src, true, vec_enc); 6142 break; 6143 default: 6144 fatal("Unsupported type %s", type2name(bt)); 6145 break; 6146 } 6147 } 6148 6149 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6150 if (bt == T_BYTE) { 6151 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6152 evmovdquq(dst, k0, src, true, vec_enc); 6153 } else { 6154 vmovdqu(dst, src); 6155 } 6156 return; 6157 } 6158 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6159 // pre-computed shuffle indices. 
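// (For example, for T_INT each 4-byte lane b0|b1|b2|b3 becomes b3|b2|b1|b0, and for
// T_SHORT each 2-byte lane b0|b1 becomes b1|b0; the permutation masks loaded below
// are assumed to encode exactly that per-lane byte order for vpshufb.)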
6160 switch(bt) { 6161 case T_LONG: 6162 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6163 break; 6164 case T_INT: 6165 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6166 break; 6167 case T_CHAR: 6168 case T_SHORT: 6169 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6170 break; 6171 default: 6172 fatal("Unsupported type %s", type2name(bt)); 6173 break; 6174 } 6175 vpshufb(dst, src, dst, vec_enc); 6176 } 6177 6178 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6179 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6180 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6181 assert(is_integral_type(bt), ""); 6182 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6183 assert(VM_Version::supports_avx512cd(), ""); 6184 switch(bt) { 6185 case T_LONG: 6186 evplzcntq(dst, ktmp, src, merge, vec_enc); 6187 break; 6188 case T_INT: 6189 evplzcntd(dst, ktmp, src, merge, vec_enc); 6190 break; 6191 case T_SHORT: 6192 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6193 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6194 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6195 vpunpckhwd(dst, xtmp1, src, vec_enc); 6196 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6197 vpackusdw(dst, xtmp2, dst, vec_enc); 6198 break; 6199 case T_BYTE: 6200 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6201 // accessing the lookup table. 6202 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6203 // accessing the lookup table. 6204 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6205 assert(VM_Version::supports_avx512bw(), ""); 6206 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6207 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6208 vpand(xtmp2, dst, src, vec_enc); 6209 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6210 vpsrlw(xtmp3, src, 4, vec_enc); 6211 vpand(xtmp3, dst, xtmp3, vec_enc); 6212 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6213 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6214 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6215 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6216 break; 6217 default: 6218 fatal("Unsupported type %s", type2name(bt)); 6219 break; 6220 } 6221 } 6222 6223 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6224 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6225 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6226 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6227 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6228 // accessing the lookup table. 6229 vpand(dst, xtmp2, src, vec_enc); 6230 vpshufb(dst, xtmp1, dst, vec_enc); 6231 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6232 // accessing the lookup table. 6233 vpsrlw(xtmp3, src, 4, vec_enc); 6234 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6235 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6236 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
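// (Scalar sketch of the per-byte result, assuming the LUT maps a 4-bit value to its
// leading zero count within 4 bits, so lut[0] == 4:
//    lzcnt_byte(b) = (b >> 4) != 0 ? lut[b >> 4] : lut[0] + lut[b & 0x0F]
//  i.e. T2 alone when the high nibble is non-zero, otherwise T2 + T1 == 4 + T1.)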
6237 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6238 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6239 vpaddb(dst, dst, xtmp2, vec_enc); 6240 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6241 } 6242 6243 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6244 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6245 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6246 // Add zero counts of lower byte and upper byte of a word if 6247 // upper byte holds a zero value. 6248 vpsrlw(xtmp3, src, 8, vec_enc); 6249 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6250 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6251 vpsllw(xtmp2, dst, 8, vec_enc); 6252 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6253 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6254 vpsrlw(dst, dst, 8, vec_enc); 6255 } 6256 6257 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6258 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6259 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6260 // hence biased exponent can be used to compute leading zero count as per 6261 // following formula:- 6262 // LZCNT = 31 - (biased_exp - 127) 6263 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6264 6265 // Broadcast 0xFF 6266 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6267 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6268 6269 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6270 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6271 // contributes to the leading number of zeros. 6272 vpsrld(xtmp2, src, 1, vec_enc); 6273 vpandn(xtmp3, xtmp2, src, vec_enc); 6274 6275 // Extract biased exponent. 6276 vcvtdq2ps(dst, xtmp3, vec_enc); 6277 vpsrld(dst, dst, 23, vec_enc); 6278 vpand(dst, dst, xtmp1, vec_enc); 6279 6280 // Broadcast 127. 6281 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6282 // Exponent = biased_exp - 127 6283 vpsubd(dst, dst, xtmp1, vec_enc); 6284 6285 // Exponent_plus_one = Exponent + 1 6286 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6287 vpaddd(dst, dst, xtmp3, vec_enc); 6288 6289 // Replace -ve exponent with zero, exponent is -ve when src 6290 // lane contains a zero value. 6291 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6292 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6293 6294 // Rematerialize broadcast 32. 6295 vpslld(xtmp1, xtmp3, 5, vec_enc); 6296 // Exponent is 32 if corresponding source lane contains max_int value. 6297 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6298 // LZCNT = 32 - exponent_plus_one 6299 vpsubd(dst, xtmp1, dst, vec_enc); 6300 6301 // Replace LZCNT with a value 1 if corresponding source lane 6302 // contains max_int value. 6303 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6304 6305 // Replace biased_exp with 0 if source lane value is less than zero. 6306 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6307 vblendvps(dst, dst, xtmp2, src, vec_enc); 6308 } 6309 6310 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6311 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6312 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6313 // Add zero counts of lower word and upper word of a double word if 6314 // upper word holds a zero value. 
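// (Sketch of the combining identity used here and again for the quadword step below:
//    clz32(x) = upper16(x) != 0 ? clz16(upper16(x)) : 16 + clz16(lower16(x))
//  where, when upper16(x) == 0, the per-word count for that half is already 16,
//  which is how the code realizes the "+ 16".)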
6315 vpsrld(xtmp3, src, 16, vec_enc); 6316 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6317 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6318 vpslld(xtmp2, dst, 16, vec_enc); 6319 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6320 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6321 vpsrld(dst, dst, 16, vec_enc); 6322 // Add zero counts of lower doubleword and upper doubleword of a 6323 // quadword if upper doubleword holds a zero value. 6324 vpsrlq(xtmp3, src, 32, vec_enc); 6325 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6326 vpsllq(xtmp2, dst, 32, vec_enc); 6327 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6328 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6329 vpsrlq(dst, dst, 32, vec_enc); 6330 } 6331 6332 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6333 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6334 Register rtmp, int vec_enc) { 6335 assert(is_integral_type(bt), "unexpected type"); 6336 assert(vec_enc < Assembler::AVX_512bit, ""); 6337 switch(bt) { 6338 case T_LONG: 6339 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6340 break; 6341 case T_INT: 6342 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6343 break; 6344 case T_SHORT: 6345 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6346 break; 6347 case T_BYTE: 6348 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6349 break; 6350 default: 6351 fatal("Unsupported type %s", type2name(bt)); 6352 break; 6353 } 6354 } 6355 6356 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6357 switch(bt) { 6358 case T_BYTE: 6359 vpsubb(dst, src1, src2, vec_enc); 6360 break; 6361 case T_SHORT: 6362 vpsubw(dst, src1, src2, vec_enc); 6363 break; 6364 case T_INT: 6365 vpsubd(dst, src1, src2, vec_enc); 6366 break; 6367 case T_LONG: 6368 vpsubq(dst, src1, src2, vec_enc); 6369 break; 6370 default: 6371 fatal("Unsupported type %s", type2name(bt)); 6372 break; 6373 } 6374 } 6375 6376 // Trailing zero count computation is based on leading zero count operation as per 6377 // following equation. All AVX3 targets support AVX512CD feature which offers 6378 // direct vector instruction to compute leading zero count. 
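// For example, for x = 0b1000 the expression (x - 1) & ~x evaluates to 0b0111; with 32-bit lanes
// its CLZ is 29, so the formula below yields CTZ = 32 - 29 = 3.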
6379 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6380 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6381                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6382                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6383   assert(is_integral_type(bt), "");
6384   // xtmp4 = -1
6385   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6386   // xtmp4 = xtmp4 + src
6387   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6388   // xtmp4 = xtmp4 & ~src
6389   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6390   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6391   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6392   vpsub(bt, dst, xtmp4, dst, vec_enc);
6393 }
6394
6395 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation
6396 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6397 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6398                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6399   assert(is_integral_type(bt), "");
6400   // xtmp3 = 0
6401   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6402   // xtmp3 = 0 - src
6403   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6404   // xtmp3 = xtmp3 | src
6405   vpor(xtmp3, xtmp3, src, vec_enc);
6406   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6407   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6408   vpsub(bt, dst, xtmp1, dst, vec_enc);
6409 }
6410
6411 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6412   Label done;
6413   Label neg_divisor_fastpath;
6414   cmpl(divisor, 0);
6415   jccb(Assembler::less, neg_divisor_fastpath);
6416   xorl(rdx, rdx);
6417   divl(divisor);
6418   jmpb(done);
6419   bind(neg_divisor_fastpath);
6420   // Fastpath for divisor < 0:
6421   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6422   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6423   movl(rdx, rax);
6424   subl(rdx, divisor);
6425   if (VM_Version::supports_bmi1()) {
6426     andnl(rax, rdx, rax);
6427   } else {
6428     notl(rdx);
6429     andl(rax, rdx);
6430   }
6431   shrl(rax, 31);
6432   bind(done);
6433 }
6434
6435 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6436   Label done;
6437   Label neg_divisor_fastpath;
6438   cmpl(divisor, 0);
6439   jccb(Assembler::less, neg_divisor_fastpath);
6440   xorl(rdx, rdx);
6441   divl(divisor);
6442   jmpb(done);
6443   bind(neg_divisor_fastpath);
6444   // Fastpath when divisor < 0:
6445   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6446   // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6447   movl(rdx, rax);
6448   subl(rax, divisor);
6449   if (VM_Version::supports_bmi1()) {
6450     andnl(rax, rax, rdx);
6451   } else {
6452     notl(rax);
6453     andl(rax, rdx);
6454   }
6455   sarl(rax, 31);
6456   andl(rax, divisor);
6457   subl(rdx, rax);
6458   bind(done);
6459 }
6460
6461 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6462   Label done;
6463   Label neg_divisor_fastpath;
6464
6465   cmpl(divisor, 0);
6466   jccb(Assembler::less, neg_divisor_fastpath);
6467   xorl(rdx, rdx);
6468   divl(divisor);
6469   jmpb(done);
6470   bind(neg_divisor_fastpath);
6471   // Fastpath for divisor < 0:
6472   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6473   //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6474 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6475 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6476 movl(rdx, rax); 6477 subl(rax, divisor); 6478 if (VM_Version::supports_bmi1()) { 6479 andnl(rax, rax, rdx); 6480 } else { 6481 notl(rax); 6482 andl(rax, rdx); 6483 } 6484 movl(tmp, rax); 6485 shrl(rax, 31); // quotient 6486 sarl(tmp, 31); 6487 andl(tmp, divisor); 6488 subl(rdx, tmp); // remainder 6489 bind(done); 6490 } 6491 6492 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6493 XMMRegister xtmp2, Register rtmp) { 6494 if(VM_Version::supports_gfni()) { 6495 // Galois field instruction based bit reversal based on following algorithm. 6496 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6497 mov64(rtmp, 0x8040201008040201L); 6498 movq(xtmp1, src); 6499 movq(xtmp2, rtmp); 6500 gf2p8affineqb(xtmp1, xtmp2, 0); 6501 movq(dst, xtmp1); 6502 } else { 6503 // Swap even and odd numbered bits. 6504 movl(rtmp, src); 6505 andl(rtmp, 0x55555555); 6506 shll(rtmp, 1); 6507 movl(dst, src); 6508 andl(dst, 0xAAAAAAAA); 6509 shrl(dst, 1); 6510 orl(dst, rtmp); 6511 6512 // Swap LSB and MSB 2 bits of each nibble. 6513 movl(rtmp, dst); 6514 andl(rtmp, 0x33333333); 6515 shll(rtmp, 2); 6516 andl(dst, 0xCCCCCCCC); 6517 shrl(dst, 2); 6518 orl(dst, rtmp); 6519 6520 // Swap LSB and MSB 4 bits of each byte. 6521 movl(rtmp, dst); 6522 andl(rtmp, 0x0F0F0F0F); 6523 shll(rtmp, 4); 6524 andl(dst, 0xF0F0F0F0); 6525 shrl(dst, 4); 6526 orl(dst, rtmp); 6527 } 6528 bswapl(dst); 6529 } 6530 6531 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6532 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6533 if(VM_Version::supports_gfni()) { 6534 // Galois field instruction based bit reversal based on following algorithm. 6535 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6536 mov64(rtmp1, 0x8040201008040201L); 6537 movq(xtmp1, src); 6538 movq(xtmp2, rtmp1); 6539 gf2p8affineqb(xtmp1, xtmp2, 0); 6540 movq(dst, xtmp1); 6541 } else { 6542 // Swap even and odd numbered bits. 6543 movq(rtmp1, src); 6544 mov64(rtmp2, 0x5555555555555555L); 6545 andq(rtmp1, rtmp2); 6546 shlq(rtmp1, 1); 6547 movq(dst, src); 6548 notq(rtmp2); 6549 andq(dst, rtmp2); 6550 shrq(dst, 1); 6551 orq(dst, rtmp1); 6552 6553 // Swap LSB and MSB 2 bits of each nibble. 6554 movq(rtmp1, dst); 6555 mov64(rtmp2, 0x3333333333333333L); 6556 andq(rtmp1, rtmp2); 6557 shlq(rtmp1, 2); 6558 notq(rtmp2); 6559 andq(dst, rtmp2); 6560 shrq(dst, 2); 6561 orq(dst, rtmp1); 6562 6563 // Swap LSB and MSB 4 bits of each byte. 
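  // After this step the bits within every byte are fully reversed; the bswapq below then reverses
  // the byte order, completing the 64-bit bit reversal (e.g. 0x1 becomes 0x8000000000000000).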
6564 movq(rtmp1, dst); 6565 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6566 andq(rtmp1, rtmp2); 6567 shlq(rtmp1, 4); 6568 notq(rtmp2); 6569 andq(dst, rtmp2); 6570 shrq(dst, 4); 6571 orq(dst, rtmp1); 6572 } 6573 bswapq(dst); 6574 } 6575 6576 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6577 Label done; 6578 Label neg_divisor_fastpath; 6579 cmpq(divisor, 0); 6580 jccb(Assembler::less, neg_divisor_fastpath); 6581 xorl(rdx, rdx); 6582 divq(divisor); 6583 jmpb(done); 6584 bind(neg_divisor_fastpath); 6585 // Fastpath for divisor < 0: 6586 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6587 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6588 movq(rdx, rax); 6589 subq(rdx, divisor); 6590 if (VM_Version::supports_bmi1()) { 6591 andnq(rax, rdx, rax); 6592 } else { 6593 notq(rdx); 6594 andq(rax, rdx); 6595 } 6596 shrq(rax, 63); 6597 bind(done); 6598 } 6599 6600 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6601 Label done; 6602 Label neg_divisor_fastpath; 6603 cmpq(divisor, 0); 6604 jccb(Assembler::less, neg_divisor_fastpath); 6605 xorq(rdx, rdx); 6606 divq(divisor); 6607 jmp(done); 6608 bind(neg_divisor_fastpath); 6609 // Fastpath when divisor < 0: 6610 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6611 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6612 movq(rdx, rax); 6613 subq(rax, divisor); 6614 if (VM_Version::supports_bmi1()) { 6615 andnq(rax, rax, rdx); 6616 } else { 6617 notq(rax); 6618 andq(rax, rdx); 6619 } 6620 sarq(rax, 63); 6621 andq(rax, divisor); 6622 subq(rdx, rax); 6623 bind(done); 6624 } 6625 6626 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6627 Label done; 6628 Label neg_divisor_fastpath; 6629 cmpq(divisor, 0); 6630 jccb(Assembler::less, neg_divisor_fastpath); 6631 xorq(rdx, rdx); 6632 divq(divisor); 6633 jmp(done); 6634 bind(neg_divisor_fastpath); 6635 // Fastpath for divisor < 0: 6636 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6637 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6638 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6639 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6640 movq(rdx, rax); 6641 subq(rax, divisor); 6642 if (VM_Version::supports_bmi1()) { 6643 andnq(rax, rax, rdx); 6644 } else { 6645 notq(rax); 6646 andq(rax, rdx); 6647 } 6648 movq(tmp, rax); 6649 shrq(rax, 63); // quotient 6650 sarq(tmp, 63); 6651 andq(tmp, divisor); 6652 subq(rdx, tmp); // remainder 6653 bind(done); 6654 } 6655 6656 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6657 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6658 int vlen_enc) { 6659 assert(VM_Version::supports_avx512bw(), ""); 6660 // Byte shuffles are inlane operations and indices are determined using 6661 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6662 // normalized to index range 0-15. This makes sure that all the multiples 6663 // of an index value are placed at same relative position in 128 bit 6664 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6665 // will be 16th element in their respective 128 bit lanes. 
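  // For example, shuffle indices 5, 21, 37 and 53 all carry 5 in their low 4 bits; the four masked
  // passes below select byte 5 of the first, second, third and fourth source lane respectively.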
6666   movl(rtmp, 16);
6667   evpbroadcastb(xtmp1, rtmp, vlen_enc);
6668
6669   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
6670   // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
6671   // the original shuffle indices and move the shuffled lanes corresponding to true
6672   // mask to the destination vector.
6673   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6674   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6675   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6676
6677   // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
6678   // and broadcasting the second 128 bit lane.
6679   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6680   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6681   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6682   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6683   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6684
6685   // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
6686   // and broadcasting the third 128 bit lane.
6687   evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6688   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6689   evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6690   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6691   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6692
6693   // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
6694   // and broadcasting the fourth 128 bit lane.
6695   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6696   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6697   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6698   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6699   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6700 }
6701
6702 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6703                                                    XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6704   if (vlen_enc == AVX_128bit) {
6705     vpermilps(dst, src, shuffle, vlen_enc);
6706   } else if (bt == T_INT) {
6707     vpermd(dst, shuffle, src, vlen_enc);
6708   } else {
6709     assert(bt == T_FLOAT, "");
6710     vpermps(dst, shuffle, src, vlen_enc);
6711   }
6712 }
6713
6714 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6715   switch(opcode) {
6716     case Op_AddHF: vaddsh(dst, src1, src2); break;
6717     case Op_SubHF: vsubsh(dst, src1, src2); break;
6718     case Op_MulHF: vmulsh(dst, src1, src2); break;
6719     case Op_DivHF: vdivsh(dst, src1, src2); break;
6720     default: assert(false, "%s", NodeClassNames[opcode]); break;
6721   }
6722 }
6723
6724 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6725   switch(elem_bt) {
6726     case T_BYTE:
6727       if (ideal_opc == Op_SaturatingAddV) {
6728         vpaddsb(dst, src1, src2, vlen_enc);
6729       } else {
6730         assert(ideal_opc == Op_SaturatingSubV, "");
6731         vpsubsb(dst, src1, src2, vlen_enc);
6732       }
6733       break;
6734     case T_SHORT:
6735       if (ideal_opc == Op_SaturatingAddV) {
6736         vpaddsw(dst, src1, src2, vlen_enc);
6737       } else {
6738         assert(ideal_opc == Op_SaturatingSubV, "");
6739         vpsubsw(dst, src1, src2, vlen_enc);
6740       }
6741       break;
6742     default:
6743       fatal("Unsupported type %s", type2name(elem_bt));
6744       break;
6745   }
6746 }
6747
6748 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
6749   switch(elem_bt) {
6750     case T_BYTE:
6751       if (ideal_opc == Op_SaturatingAddV) {
6752         vpaddusb(dst, src1, src2, vlen_enc);
6753       } else {
6754         assert(ideal_opc == Op_SaturatingSubV, "");
6755         vpsubusb(dst, src1, src2, vlen_enc);
6756       }
6757       break;
6758     case T_SHORT:
6759       if (ideal_opc == Op_SaturatingAddV) {
6760         vpaddusw(dst, src1, src2, vlen_enc);
6761       } else {
6762         assert(ideal_opc == Op_SaturatingSubV, "");
6763         vpsubusw(dst, src1, src2, vlen_enc);
6764       }
6765       break;
6766     default:
6767       fatal("Unsupported type %s", type2name(elem_bt));
6768       break;
6769   }
6770 }
6771
6772 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6773                                                                XMMRegister src2, KRegister ktmp, int vlen_enc) {
6774   // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6775   // overflow_mask = Inp1 <u Inp2
6776   evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6777   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6778   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6779 }
6780
6781 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6782                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6783   // Emulate unsigned comparison using signed comparison
6784   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6785   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6786   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6787   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6788
6789   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6790
6791   // Res = INP1 - INP2 (non-commutative and non-associative)
6792   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6793   // Res = Mask ? Zero : Res
6794   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6795   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6796 }
6797
6798 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6799                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6800   // Unsigned value ranges comprise only non-negative numbers, thus only an upper bound saturation exists.
6801   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6802   // Res = Signed Add INP1, INP2
6803   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6804   // T1 = SRC1 | SRC2
6805   vpor(xtmp1, src1, src2, vlen_enc);
6806   // Max_Unsigned = -1
6807   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6808   // Unsigned compare: Mask = Res <u T1
6809   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6810   // res = Mask ? Max_Unsigned : Res
6811   evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6812 }
6813
6814 //
6815 // Section 2-13 of Hacker's Delight lists the following overflow detection check for saturating
6816 // unsigned addition operation.
6817 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6818 //
6819 // We empirically determined its semantic equivalence to the following reduced expression
6820 // overflow_mask = (a + b) <u (a | b)
6821 //
6822 // and also verified it through the Alive2 solver.
6823 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6824 // 6825 6826 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6827 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6828 // Res = Signed Add INP1, INP2 6829 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6830 // Compute T1 = INP1 | INP2 6831 vpor(xtmp3, src1, src2, vlen_enc); 6832 // T1 = Minimum signed value. 6833 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6834 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6835 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6836 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6837 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6838 // Compute overflow detection mask = Res<1> <s T1 6839 if (elem_bt == T_INT) { 6840 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6841 } else { 6842 assert(elem_bt == T_LONG, ""); 6843 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6844 } 6845 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6846 } 6847 6848 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6849 int vlen_enc, bool xtmp2_hold_M1) { 6850 if (VM_Version::supports_avx512dq()) { 6851 evpmovq2m(ktmp, src, vlen_enc); 6852 } else { 6853 assert(VM_Version::supports_evex(), ""); 6854 if (!xtmp2_hold_M1) { 6855 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6856 } 6857 evpsraq(xtmp1, src, 63, vlen_enc); 6858 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6859 } 6860 } 6861 6862 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6863 int vlen_enc, bool xtmp2_hold_M1) { 6864 if (VM_Version::supports_avx512dq()) { 6865 evpmovd2m(ktmp, src, vlen_enc); 6866 } else { 6867 assert(VM_Version::supports_evex(), ""); 6868 if (!xtmp2_hold_M1) { 6869 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6870 } 6871 vpsrad(xtmp1, src, 31, vlen_enc); 6872 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6873 } 6874 } 6875 6876 6877 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6878 if (elem_bt == T_LONG) { 6879 if (VM_Version::supports_evex()) { 6880 evpsraq(dst, src, 63, vlen_enc); 6881 } else { 6882 vpsrad(dst, src, 31, vlen_enc); 6883 vpshufd(dst, dst, 0xF5, vlen_enc); 6884 } 6885 } else { 6886 assert(elem_bt == T_INT, ""); 6887 vpsrad(dst, src, 31, vlen_enc); 6888 } 6889 } 6890 6891 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6892 if (compute_allones) { 6893 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6894 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6895 } else { 6896 vpcmpeqq(allones, allones, allones, vlen_enc); 6897 } 6898 } 6899 if (elem_bt == T_LONG) { 6900 vpsrlq(dst, allones, 1, vlen_enc); 6901 } else { 6902 assert(elem_bt == T_INT, ""); 6903 vpsrld(dst, allones, 1, vlen_enc); 6904 } 6905 } 6906 6907 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6908 if (compute_allones) { 6909 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6910 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6911 } else { 6912 vpcmpeqq(allones, allones, allones, vlen_enc); 6913 } 6914 } 6915 if (elem_bt == T_LONG) { 6916 vpsllq(dst, allones, 63, vlen_enc); 6917 } else { 6918 assert(elem_bt == T_INT, ""); 6919 
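    // For T_INT lanes, shifting the all-ones pattern left by 31 leaves only the sign bit set,
    // i.e. 0x80000000 (MIN_VALUE) in every lane.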
    vpslld(dst, allones, 31, vlen_enc);
6920   }
6921 }
6922
6923 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6924                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6925   switch(elem_bt) {
6926     case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6927     case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6928     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6929     case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6930     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6931   }
6932 }
6933
6934 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6935   switch(elem_bt) {
6936     case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6937     case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6938     case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6939     case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6940     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6941   }
6942 }
6943
6944 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6945                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6946   if (elem_bt == T_LONG) {
6947     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6948   } else {
6949     assert(elem_bt == T_INT, "");
6950     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6951   }
6952 }
6953
6954 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6955                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6956                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6957   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6958   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6959   // Overflow detection is based on Hacker's Delight section 2-13.
6960   if (ideal_opc == Op_SaturatingAddV) {
6961     // res = src1 + src2
6962     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6963     // Overflow occurs if the result's polarity differs from that of two same-polarity inputs.
6964     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6965     vpxor(xtmp1, dst, src1, vlen_enc);
6966     vpxor(xtmp2, dst, src2, vlen_enc);
6967     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6968   } else {
6969     assert(ideal_opc == Op_SaturatingSubV, "");
6970     // res = src1 - src2
6971     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6972     // Overflow occurs when the inputs have opposite polarity and the
6973     // result's polarity differs from the first input's polarity.
6974     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6975     vpxor(xtmp1, src1, src2, vlen_enc);
6976     vpxor(xtmp2, dst, src1, vlen_enc);
6977     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6978   }
6979
6980   // Compute overflow detection mask.
6981   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6982   // Note: xtmp1 holds -1 in all its lanes after the above call.
6983
6984   // Compute mask based on first input polarity.
6985   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6986
6987   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6988   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6989
6990   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6991   // set bits in the first input polarity mask hold a min value.
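  // On overflow the saturation value follows the sign of src1: MIN_VALUE for a negative src1 lane,
  // MAX_VALUE otherwise (e.g. for T_INT, MIN_VALUE - 1 saturates to MIN_VALUE).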
6992   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6993   // Blend destination lanes with saturated values using overflow detection mask.
6994   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6995 }
6996
6997
6998 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6999                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
7000                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
7001   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
7002   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
7003   // Overflow detection is based on Hacker's Delight section 2-13.
7004   if (ideal_opc == Op_SaturatingAddV) {
7005     // res = src1 + src2
7006     vpadd(elem_bt, dst, src1, src2, vlen_enc);
7007     // Overflow occurs if the result's polarity differs from that of two same-polarity inputs.
7008     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
7009     vpxor(xtmp1, dst, src1, vlen_enc);
7010     vpxor(xtmp2, dst, src2, vlen_enc);
7011     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7012   } else {
7013     assert(ideal_opc == Op_SaturatingSubV, "");
7014     // res = src1 - src2
7015     vpsub(elem_bt, dst, src1, src2, vlen_enc);
7016     // Overflow occurs when the inputs have opposite polarity and the
7017     // result's polarity differs from the first input's polarity.
7018     // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
7019     vpxor(xtmp1, src1, src2, vlen_enc);
7020     vpxor(xtmp2, dst, src1, vlen_enc);
7021     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
7022   }
7023
7024   // Sign-extend to compute overflow detection mask.
7025   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
7026
7027   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
7028   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
7029   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
7030
7031   // Compose saturating min/max vector using first input polarity mask.
7032   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7033   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7034
7035   // Blend result with saturating vector using overflow detection mask.
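  // For example, for T_INT 0x7FFFFFFF + 1 wraps to 0x80000000; (res ^ src1) & (res ^ src2) then has
  // its sign bit set, so the blend below replaces that lane with MAX_VALUE (0x7FFFFFFF).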
7036 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7037 } 7038 7039 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7040 switch(elem_bt) { 7041 case T_BYTE: 7042 if (ideal_opc == Op_SaturatingAddV) { 7043 vpaddsb(dst, src1, src2, vlen_enc); 7044 } else { 7045 assert(ideal_opc == Op_SaturatingSubV, ""); 7046 vpsubsb(dst, src1, src2, vlen_enc); 7047 } 7048 break; 7049 case T_SHORT: 7050 if (ideal_opc == Op_SaturatingAddV) { 7051 vpaddsw(dst, src1, src2, vlen_enc); 7052 } else { 7053 assert(ideal_opc == Op_SaturatingSubV, ""); 7054 vpsubsw(dst, src1, src2, vlen_enc); 7055 } 7056 break; 7057 default: 7058 fatal("Unsupported type %s", type2name(elem_bt)); 7059 break; 7060 } 7061 } 7062 7063 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7064 switch(elem_bt) { 7065 case T_BYTE: 7066 if (ideal_opc == Op_SaturatingAddV) { 7067 vpaddusb(dst, src1, src2, vlen_enc); 7068 } else { 7069 assert(ideal_opc == Op_SaturatingSubV, ""); 7070 vpsubusb(dst, src1, src2, vlen_enc); 7071 } 7072 break; 7073 case T_SHORT: 7074 if (ideal_opc == Op_SaturatingAddV) { 7075 vpaddusw(dst, src1, src2, vlen_enc); 7076 } else { 7077 assert(ideal_opc == Op_SaturatingSubV, ""); 7078 vpsubusw(dst, src1, src2, vlen_enc); 7079 } 7080 break; 7081 default: 7082 fatal("Unsupported type %s", type2name(elem_bt)); 7083 break; 7084 } 7085 } 7086 7087 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7088 XMMRegister src2, int vlen_enc) { 7089 switch(elem_bt) { 7090 case T_BYTE: 7091 evpermi2b(dst, src1, src2, vlen_enc); 7092 break; 7093 case T_SHORT: 7094 evpermi2w(dst, src1, src2, vlen_enc); 7095 break; 7096 case T_INT: 7097 evpermi2d(dst, src1, src2, vlen_enc); 7098 break; 7099 case T_LONG: 7100 evpermi2q(dst, src1, src2, vlen_enc); 7101 break; 7102 case T_FLOAT: 7103 evpermi2ps(dst, src1, src2, vlen_enc); 7104 break; 7105 case T_DOUBLE: 7106 evpermi2pd(dst, src1, src2, vlen_enc); 7107 break; 7108 default: 7109 fatal("Unsupported type %s", type2name(elem_bt)); 7110 break; 7111 } 7112 } 7113 7114 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7115 if (is_unsigned) { 7116 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7117 } else { 7118 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7119 } 7120 } 7121 7122 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7123 if (is_unsigned) { 7124 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7125 } else { 7126 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7127 } 7128 } 7129 7130 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 7131 switch(opcode) { 7132 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7133 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7134 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7135 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7136 default: assert(false, "%s", NodeClassNames[opcode]); break; 7137 } 7138 } 7139 7140 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
                                        XMMRegister src1, Address src2, int vlen_enc) {
7141   switch(opcode) {
7142     case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7143     case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7144     case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7145     case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7146     default: assert(false, "%s", NodeClassNames[opcode]); break;
7147   }
7148 }
7149
7150 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7151                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7152   vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7153 }
7154
7155 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7156                                             KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7157   if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7158     // Move sign bits of src2 to mask register.
7159     evpmovw2m(ktmp, src2, vlen_enc);
7160     // xtmp1 = src2 < 0 ? src2 : src1
7161     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7162     // xtmp2 = src2 < 0 ? src1 : src2
7163     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7164     // The idea behind the above swapping is to make the second source operand a +ve value.
7165     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7166     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7167     // the second source operand, either a NaN or a valid floating-point value, is returned.
7168     // dst = max(xtmp1, xtmp2)
7169     evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7170     // isNaN = is_unordered_quiet(xtmp1)
7171     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7172     // The final result is the same as the first source if it is a NaN value;
7173     // in case the second operand holds a NaN value, then as per the above semantics
7174     // the result is the same as the second operand.
7175     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7176   } else {
7177     assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7178     // Move sign bits of src1 to mask register.
7179     evpmovw2m(ktmp, src1, vlen_enc);
7180     // xtmp1 = src1 < 0 ? src2 : src1
7181     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7182     // xtmp2 = src1 < 0 ? src1 : src2
7183     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7184     // The idea behind the above swapping is to make the second source operand a -ve value.
7185     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7186     // the second source operand is returned.
7187     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7188     // or a valid floating-point value, is written to the result.
7189     // dst = min(xtmp1, xtmp2)
7190     evminph(dst, xtmp1, xtmp2, vlen_enc);
7191     // isNaN = is_unordered_quiet(xtmp1)
7192     evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7193     // The final result is the same as the first source if it is a NaN value;
7194     // in case the second operand holds a NaN value, then as per the above semantics
7195     // the result is the same as the second operand.
7196     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7197   }
7198 }
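
// Two concrete Max lanes as an illustration of the swap and NaN handling above (comment only):
//   max(+0.0, -0.0): src2 is negative, so the operands are swapped (xtmp1 = -0.0, xtmp2 = +0.0);
//   vmaxph returns its second operand, +0.0, which matches the Java semantics.
//   max(NaN, 1.0): xtmp1 = NaN and xtmp2 = 1.0; vmaxph returns 1.0, but the trailing masked move
//   detects the NaN in xtmp1 and propagates NaN to the result, as required.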