/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
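  //
  // Rough sketch of the frame layout both branches below establish (illustrative
  // only; assumes no stack repair word and VerifyStackAtCalls off):
  //
  //   caller sp -> [ return address ]
  //                [ saved rbp      ]  pushed (bang path) or stored at the frame top (no-bang path)
  //   rsp ->       [ frame body     ]  remaining framesize bytes
  //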
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
  Label dummy_slow_path;
  Label dummy_continuation;
  Label* slow_path = &dummy_slow_path;
  Label* continuation = &dummy_continuation;
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from actual stub when not emitting code for the purpose of measuring its size
    C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
    Compile::current()->output()->add_stub(stub);
    slow_path = &stub->entry();
    continuation = &stub->continuation();
  }
  bs->nmethod_entry_barrier(this, slow_path, continuation);
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit().  If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
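//
// Illustrative sketch (not emitted in this file): per the ZF protocol described
// above, the code C2 generates around a cmpFastUnlock node behaves roughly like
//
//   fast_unlock(obj, box, tmp)   // leaves ZF == 1 on success, ZF == 0 on failure
//   jne  slow_path               // ZF == 0: fall back to the runtime monitor-exit helper
//   ...                          // fast path continues with the lock released
//
// The exact shape of the surrounding code is determined by the .ad file, not here.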

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);  // Examine the displaced header
    jcc   (Assembler::zero, COUNT);         // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value); // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);

  orl  (boxReg, 1); // set ICC.ZF=0 to indicate failure
  jmpb (DONE_LABEL);

  bind (LSuccess);
  testl(boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb (DONE_LABEL);

  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);

  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
  }

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}

void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;
  if (hi == max_jint) {
    cmpl(val, lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jint) {
      cmpl(val, lo);
      jccb(Assembler::less, fail);
    }
    cmpl(val, hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (hi == max_jlong) {
    cmp_val(lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jlong) {
      cmp_val(lo);
      jccb(Assembler::less, fail);
    }
    cmp_val(hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
                                                         : AVX10_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
T_LONG"); 1370 evprorvq(dst, src, shift, vector_len); 1371 } 1372 } 1373 } 1374 1375 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1376 if (opcode == Op_RShiftVI) { 1377 psrad(dst, shift); 1378 } else if (opcode == Op_LShiftVI) { 1379 pslld(dst, shift); 1380 } else { 1381 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1382 psrld(dst, shift); 1383 } 1384 } 1385 1386 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1387 switch (opcode) { 1388 case Op_RShiftVI: psrad(dst, shift); break; 1389 case Op_LShiftVI: pslld(dst, shift); break; 1390 case Op_URShiftVI: psrld(dst, shift); break; 1391 1392 default: assert(false, "%s", NodeClassNames[opcode]); 1393 } 1394 } 1395 1396 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1397 if (opcode == Op_RShiftVI) { 1398 vpsrad(dst, nds, shift, vector_len); 1399 } else if (opcode == Op_LShiftVI) { 1400 vpslld(dst, nds, shift, vector_len); 1401 } else { 1402 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1403 vpsrld(dst, nds, shift, vector_len); 1404 } 1405 } 1406 1407 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1408 switch (opcode) { 1409 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1410 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1411 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1412 1413 default: assert(false, "%s", NodeClassNames[opcode]); 1414 } 1415 } 1416 1417 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1418 switch (opcode) { 1419 case Op_RShiftVB: // fall-through 1420 case Op_RShiftVS: psraw(dst, shift); break; 1421 1422 case Op_LShiftVB: // fall-through 1423 case Op_LShiftVS: psllw(dst, shift); break; 1424 1425 case Op_URShiftVS: // fall-through 1426 case Op_URShiftVB: psrlw(dst, shift); break; 1427 1428 default: assert(false, "%s", NodeClassNames[opcode]); 1429 } 1430 } 1431 1432 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1433 switch (opcode) { 1434 case Op_RShiftVB: // fall-through 1435 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1436 1437 case Op_LShiftVB: // fall-through 1438 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1439 1440 case Op_URShiftVS: // fall-through 1441 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1442 1443 default: assert(false, "%s", NodeClassNames[opcode]); 1444 } 1445 } 1446 1447 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1448 switch (opcode) { 1449 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1450 case Op_LShiftVL: psllq(dst, shift); break; 1451 case Op_URShiftVL: psrlq(dst, shift); break; 1452 1453 default: assert(false, "%s", NodeClassNames[opcode]); 1454 } 1455 } 1456 1457 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1458 if (opcode == Op_RShiftVL) { 1459 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1460 } else if (opcode == Op_LShiftVL) { 1461 psllq(dst, shift); 1462 } else { 1463 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1464 psrlq(dst, shift); 1465 } 1466 } 1467 1468 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1469 switch (opcode) { 1470 case 
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1566 vextracti128_high(vtmp, dst); 1567 vpackusdw(dst, dst, vtmp, 0); 1568 } 1569 1570 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1571 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1572 assert(opcode == Op_LShiftVB || 1573 opcode == Op_RShiftVB || 1574 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1575 bool sign = (opcode != Op_URShiftVB); 1576 int ext_vector_len = vector_len + 1; 1577 vextendbw(sign, dst, src, ext_vector_len); 1578 vpmovzxbw(vtmp, shift, ext_vector_len); 1579 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1580 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1581 if (vector_len == 0) { 1582 vextracti128_high(vtmp, dst); 1583 vpackuswb(dst, dst, vtmp, vector_len); 1584 } else { 1585 vextracti64x4_high(vtmp, dst); 1586 vpackuswb(dst, dst, vtmp, vector_len); 1587 vpermq(dst, dst, 0xD8, vector_len); 1588 } 1589 } 1590 1591 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1592 switch(typ) { 1593 case T_BYTE: 1594 pinsrb(dst, val, idx); 1595 break; 1596 case T_SHORT: 1597 pinsrw(dst, val, idx); 1598 break; 1599 case T_INT: 1600 pinsrd(dst, val, idx); 1601 break; 1602 case T_LONG: 1603 pinsrq(dst, val, idx); 1604 break; 1605 default: 1606 assert(false,"Should not reach here."); 1607 break; 1608 } 1609 } 1610 1611 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1612 switch(typ) { 1613 case T_BYTE: 1614 vpinsrb(dst, src, val, idx); 1615 break; 1616 case T_SHORT: 1617 vpinsrw(dst, src, val, idx); 1618 break; 1619 case T_INT: 1620 vpinsrd(dst, src, val, idx); 1621 break; 1622 case T_LONG: 1623 vpinsrq(dst, src, val, idx); 1624 break; 1625 default: 1626 assert(false,"Should not reach here."); 1627 break; 1628 } 1629 } 1630 1631 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1632 XMMRegister dst, Register base, 1633 Register idx_base, 1634 Register offset, Register mask, 1635 Register mask_idx, Register rtmp, 1636 int vlen_enc) { 1637 vpxor(dst, dst, dst, vlen_enc); 1638 if (elem_bt == T_SHORT) { 1639 for (int i = 0; i < 4; i++) { 1640 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1641 Label skip_load; 1642 btq(mask, mask_idx); 1643 jccb(Assembler::carryClear, skip_load); 1644 movl(rtmp, Address(idx_base, i * 4)); 1645 if (offset != noreg) { 1646 addl(rtmp, offset); 1647 } 1648 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1649 bind(skip_load); 1650 incq(mask_idx); 1651 } 1652 } else { 1653 assert(elem_bt == T_BYTE, ""); 1654 for (int i = 0; i < 8; i++) { 1655 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1656 Label skip_load; 1657 btq(mask, mask_idx); 1658 jccb(Assembler::carryClear, skip_load); 1659 movl(rtmp, Address(idx_base, i * 4)); 1660 if (offset != noreg) { 1661 addl(rtmp, offset); 1662 } 1663 pinsrb(dst, Address(base, rtmp), i); 1664 bind(skip_load); 1665 incq(mask_idx); 1666 } 1667 } 1668 } 1669 1670 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1671 Register base, Register idx_base, 1672 Register offset, Register rtmp, 1673 int vlen_enc) { 1674 vpxor(dst, dst, dst, vlen_enc); 1675 if (elem_bt == T_SHORT) { 1676 for (int i = 0; i < 4; i++) { 1677 // dst[i] = src[offset + idx_base[i]] 1678 movl(rtmp, Address(idx_base, i * 4)); 1679 if (offset != noreg) { 1680 addl(rtmp, offset); 1681 } 1682 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1683 } 1684 } else { 1685 assert(elem_bt == T_BYTE, ""); 1686 for (int i = 0; i < 8; i++) { 1687 // dst[i] = src[offset + idx_base[i]] 1688 movl(rtmp, Address(idx_base, i * 4)); 1689 if (offset != noreg) { 1690 addl(rtmp, offset); 1691 } 1692 pinsrb(dst, Address(base, rtmp), i); 1693 } 1694 } 1695 } 1696 1697 /* 1698 * Gather using hybrid algorithm, first partially unroll scalar loop 1699 * to accumulate values from gather indices into a quad-word(64bit) slice. 1700 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1701 * permutation to place the slice into appropriate vector lane 1702 * locations in destination vector. Following pseudo code describes the 1703 * algorithm in detail: 1704 * 1705 * DST_VEC = ZERO_VEC 1706 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1707 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1708 * FOREACH_ITER: 1709 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1710 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1711 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1712 * PERM_INDEX = PERM_INDEX - TWO_VEC 1713 * 1714 * With each iteration, doubleword permute indices (0,1) corresponding 1715 * to gathered quadword gets right shifted by two lane positions. 1716 * 1717 */ 1718 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1719 Register base, Register idx_base, 1720 Register offset, Register mask, 1721 XMMRegister xtmp1, XMMRegister xtmp2, 1722 XMMRegister temp_dst, Register rtmp, 1723 Register mask_idx, Register length, 1724 int vector_len, int vlen_enc) { 1725 Label GATHER8_LOOP; 1726 assert(is_subword_type(elem_ty), ""); 1727 movl(length, vector_len); 1728 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1729 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1730 vallones(xtmp2, vlen_enc); 1731 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1732 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1733 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1734 1735 bind(GATHER8_LOOP); 1736 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1737 if (mask == noreg) { 1738 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1739 } else { 1740 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1741 } 1742 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1743 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1744 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1745 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1746 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1747 vpor(dst, dst, temp_dst, vlen_enc); 1748 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1749 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1750 jcc(Assembler::notEqual, GATHER8_LOOP); 1751 } 1752 1753 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1754 switch(typ) { 1755 case T_INT: 1756 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1757 break; 1758 case T_FLOAT: 1759 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1760 break; 1761 case T_LONG: 1762 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1763 break; 1764 case T_DOUBLE: 1765 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1766 break; 1767 default: 1768 assert(false,"Should not reach here."); 1769 break; 1770 } 1771 } 1772 1773 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1774 switch(typ) { 1775 case T_INT: 1776 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1777 break; 1778 case T_FLOAT: 1779 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1780 break; 1781 case T_LONG: 1782 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1783 break; 1784 case T_DOUBLE: 1785 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1786 break; 1787 default: 1788 assert(false,"Should not reach here."); 1789 break; 1790 } 1791 } 1792 1793 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1794 switch(typ) { 1795 case T_INT: 1796 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1797 break; 1798 case T_FLOAT: 1799 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1800 break; 1801 case T_LONG: 1802 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1803 break; 1804 case T_DOUBLE: 1805 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1806 break; 1807 default: 1808 assert(false,"Should not reach here."); 1809 break; 1810 } 1811 } 1812 1813 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1814 if (vlen_in_bytes <= 16) { 1815 pxor (dst, dst); 1816 psubb(dst, src); 1817 switch (elem_bt) { 1818 case T_BYTE: /* nothing to do */ break; 1819 case T_SHORT: pmovsxbw(dst, dst); break; 1820 case T_INT: pmovsxbd(dst, dst); break; 1821 case T_FLOAT: pmovsxbd(dst, dst); break; 1822 case T_LONG: pmovsxbq(dst, dst); break; 1823 case T_DOUBLE: pmovsxbq(dst, dst); break; 1824 1825 default: assert(false, "%s", type2name(elem_bt)); 1826 } 1827 } else { 1828 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1829 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1830 1831 vpxor (dst, dst, dst, vlen_enc); 1832 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1833 1834 switch (elem_bt) { 1835 case T_BYTE: /* nothing to do */ break; 1836 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1837 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1838 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1839 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1840 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1841 1842 default: assert(false, "%s", type2name(elem_bt)); 1843 } 1844 } 1845 } 1846 1847 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1848 if (novlbwdq) { 1849 vpmovsxbd(xtmp, src, vlen_enc); 1850 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1851 Assembler::eq, true, vlen_enc, noreg); 1852 } else { 1853 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1854 vpsubb(xtmp, xtmp, src, vlen_enc); 1855 evpmovb2m(dst, xtmp, vlen_enc); 1856 } 1857 } 1858 1859 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1860 if (is_integral_type(bt)) { 1861 switch (vlen_in_bytes) { 1862 case 4: movdl(dst, src); break; 1863 case 8: movq(dst, src); break; 1864 case 16: movdqu(dst, src); break; 1865 case 32: vmovdqu(dst, src); break; 1866 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1867 default: ShouldNotReachHere(); 1868 } 1869 } else { 1870 switch (vlen_in_bytes) { 1871 case 4: movflt(dst, src); break; 1872 case 8: movdbl(dst, src); break; 1873 case 16: movups(dst, src); break; 1874 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1875 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1876 default: ShouldNotReachHere(); 1877 } 1878 } 1879 } 1880 1881 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1882 assert(rscratch != noreg || always_reachable(src), "missing"); 1883 1884 if (reachable(src)) { 1885 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1886 } else { 1887 lea(rscratch, src); 1888 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1889 } 1890 } 1891 1892 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1893 int vlen_enc = vector_length_encoding(vlen); 1894 if (VM_Version::supports_avx()) { 1895 if (bt == T_LONG) { 1896 if (VM_Version::supports_avx2()) { 1897 vpbroadcastq(dst, src, vlen_enc); 1898 } else { 1899 vmovddup(dst, src, vlen_enc); 1900 } 1901 } else if (bt == T_DOUBLE) { 1902 if (vlen_enc != Assembler::AVX_128bit) { 1903 vbroadcastsd(dst, src, vlen_enc, noreg); 1904 } else { 1905 vmovddup(dst, src, vlen_enc); 1906 } 1907 } else { 1908 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1909 vpbroadcastd(dst, src, vlen_enc); 1910 } else { 1911 vbroadcastss(dst, src, vlen_enc); 1912 } 1913 } 1914 } else if (VM_Version::supports_sse3()) { 1915 movddup(dst, src); 1916 } else { 1917 load_vector(bt, dst, src, vlen); 1918 } 1919 } 1920 1921 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1922 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1923 int offset = exact_log2(type2aelembytes(bt)) << 6; 1924 if (is_floating_point_type(bt)) { 1925 offset += 128; 1926 } 1927 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1928 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1929 } 1930 1931 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
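//
// Broadly, the integer and unordered floating-point reducers below narrow the
// vector step by step: the upper half is extracted and combined with the lower
// half using the element-wise operation until a single element remains, which
// is then folded together with the incoming scalar input. The ordered FP
// reductions (reduceF/reduceD) instead accumulate lane by lane, preserving a
// strict left-to-right evaluation order. As a rough scalar sketch of what the
// emitted code computes (illustration only, not the literal instruction
// sequence):
//
//   acc = scalar_input;
//   for (int i = 0; i < vlen; i++) {
//     acc = OP(acc, vec[i]);   // OP is add/mul/min/max/and/or/xor
//   }
//   result = acc;
//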
1932 1933 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1934 int vector_len = Assembler::AVX_128bit; 1935 1936 switch (opcode) { 1937 case Op_AndReductionV: pand(dst, src); break; 1938 case Op_OrReductionV: por (dst, src); break; 1939 case Op_XorReductionV: pxor(dst, src); break; 1940 case Op_MinReductionV: 1941 switch (typ) { 1942 case T_BYTE: pminsb(dst, src); break; 1943 case T_SHORT: pminsw(dst, src); break; 1944 case T_INT: pminsd(dst, src); break; 1945 case T_LONG: assert(UseAVX > 2, "required"); 1946 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1947 default: assert(false, "wrong type"); 1948 } 1949 break; 1950 case Op_MaxReductionV: 1951 switch (typ) { 1952 case T_BYTE: pmaxsb(dst, src); break; 1953 case T_SHORT: pmaxsw(dst, src); break; 1954 case T_INT: pmaxsd(dst, src); break; 1955 case T_LONG: assert(UseAVX > 2, "required"); 1956 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1957 default: assert(false, "wrong type"); 1958 } 1959 break; 1960 case Op_AddReductionVF: addss(dst, src); break; 1961 case Op_AddReductionVD: addsd(dst, src); break; 1962 case Op_AddReductionVI: 1963 switch (typ) { 1964 case T_BYTE: paddb(dst, src); break; 1965 case T_SHORT: paddw(dst, src); break; 1966 case T_INT: paddd(dst, src); break; 1967 default: assert(false, "wrong type"); 1968 } 1969 break; 1970 case Op_AddReductionVL: paddq(dst, src); break; 1971 case Op_MulReductionVF: mulss(dst, src); break; 1972 case Op_MulReductionVD: mulsd(dst, src); break; 1973 case Op_MulReductionVI: 1974 switch (typ) { 1975 case T_SHORT: pmullw(dst, src); break; 1976 case T_INT: pmulld(dst, src); break; 1977 default: assert(false, "wrong type"); 1978 } 1979 break; 1980 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1981 evpmullq(dst, dst, src, vector_len); break; 1982 default: assert(false, "wrong opcode"); 1983 } 1984 } 1985 1986 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1987 switch (opcode) { 1988 case Op_AddReductionVF: addps(dst, src); break; 1989 case Op_AddReductionVD: addpd(dst, src); break; 1990 case Op_MulReductionVF: mulps(dst, src); break; 1991 case Op_MulReductionVD: mulpd(dst, src); break; 1992 default: assert(false, "%s", NodeClassNames[opcode]); 1993 } 1994 } 1995 1996 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1997 int vector_len = Assembler::AVX_256bit; 1998 1999 switch (opcode) { 2000 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 2001 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 2002 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 2003 case Op_MinReductionV: 2004 switch (typ) { 2005 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 2006 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 2007 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2008 case T_LONG: assert(UseAVX > 2, "required"); 2009 vpminsq(dst, src1, src2, vector_len); break; 2010 default: assert(false, "wrong type"); 2011 } 2012 break; 2013 case Op_MaxReductionV: 2014 switch (typ) { 2015 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2016 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2017 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2018 case T_LONG: assert(UseAVX > 2, "required"); 2019 vpmaxsq(dst, src1, src2, vector_len); break; 2020 default: assert(false, "wrong type"); 2021 } 
2022 break; 2023 case Op_AddReductionVI: 2024 switch (typ) { 2025 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2026 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2027 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2028 default: assert(false, "wrong type"); 2029 } 2030 break; 2031 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2032 case Op_MulReductionVI: 2033 switch (typ) { 2034 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2035 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2036 default: assert(false, "wrong type"); 2037 } 2038 break; 2039 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2040 default: assert(false, "wrong opcode"); 2041 } 2042 } 2043 2044 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2045 int vector_len = Assembler::AVX_256bit; 2046 2047 switch (opcode) { 2048 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 2049 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 2050 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 2051 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 2052 default: assert(false, "%s", NodeClassNames[opcode]); 2053 } 2054 } 2055 2056 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2057 XMMRegister dst, XMMRegister src, 2058 XMMRegister vtmp1, XMMRegister vtmp2) { 2059 switch (opcode) { 2060 case Op_AddReductionVF: 2061 case Op_MulReductionVF: 2062 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2063 break; 2064 2065 case Op_AddReductionVD: 2066 case Op_MulReductionVD: 2067 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2068 break; 2069 2070 default: assert(false, "wrong opcode"); 2071 } 2072 } 2073 2074 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 2075 XMMRegister dst, XMMRegister src, 2076 XMMRegister vtmp1, XMMRegister vtmp2) { 2077 switch (opcode) { 2078 case Op_AddReductionVF: 2079 case Op_MulReductionVF: 2080 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2081 break; 2082 2083 case Op_AddReductionVD: 2084 case Op_MulReductionVD: 2085 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2086 break; 2087 2088 default: assert(false, "%s", NodeClassNames[opcode]); 2089 } 2090 } 2091 2092 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2093 Register dst, Register src1, XMMRegister src2, 2094 XMMRegister vtmp1, XMMRegister vtmp2) { 2095 switch (vlen) { 2096 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2097 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2098 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2099 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2100 2101 default: assert(false, "wrong vector length"); 2102 } 2103 } 2104 2105 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2106 Register dst, Register src1, XMMRegister src2, 2107 XMMRegister vtmp1, XMMRegister vtmp2) { 2108 switch (vlen) { 2109 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2110 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2111 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2112 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2113 2114 default: assert(false, "wrong vector length"); 2115 } 2116 } 2117 2118 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2119 Register dst, Register src1, XMMRegister src2, 
2120 XMMRegister vtmp1, XMMRegister vtmp2) { 2121 switch (vlen) { 2122 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2123 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2124 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2125 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2126 2127 default: assert(false, "wrong vector length"); 2128 } 2129 } 2130 2131 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2132 Register dst, Register src1, XMMRegister src2, 2133 XMMRegister vtmp1, XMMRegister vtmp2) { 2134 switch (vlen) { 2135 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2136 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2137 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2138 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2139 2140 default: assert(false, "wrong vector length"); 2141 } 2142 } 2143 2144 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2145 Register dst, Register src1, XMMRegister src2, 2146 XMMRegister vtmp1, XMMRegister vtmp2) { 2147 switch (vlen) { 2148 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2149 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2150 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2151 2152 default: assert(false, "wrong vector length"); 2153 } 2154 } 2155 2156 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2157 switch (vlen) { 2158 case 2: 2159 assert(vtmp2 == xnoreg, ""); 2160 reduce2F(opcode, dst, src, vtmp1); 2161 break; 2162 case 4: 2163 assert(vtmp2 == xnoreg, ""); 2164 reduce4F(opcode, dst, src, vtmp1); 2165 break; 2166 case 8: 2167 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2168 break; 2169 case 16: 2170 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2171 break; 2172 default: assert(false, "wrong vector length"); 2173 } 2174 } 2175 2176 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2177 switch (vlen) { 2178 case 2: 2179 assert(vtmp2 == xnoreg, ""); 2180 reduce2D(opcode, dst, src, vtmp1); 2181 break; 2182 case 4: 2183 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2184 break; 2185 case 8: 2186 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2187 break; 2188 default: assert(false, "wrong vector length"); 2189 } 2190 } 2191 2192 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2193 switch (vlen) { 2194 case 2: 2195 assert(vtmp1 == xnoreg, ""); 2196 assert(vtmp2 == xnoreg, ""); 2197 unorderedReduce2F(opcode, dst, src); 2198 break; 2199 case 4: 2200 assert(vtmp2 == xnoreg, ""); 2201 unorderedReduce4F(opcode, dst, src, vtmp1); 2202 break; 2203 case 8: 2204 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2205 break; 2206 case 16: 2207 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2208 break; 2209 default: assert(false, "wrong vector length"); 2210 } 2211 } 2212 2213 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2214 switch (vlen) { 2215 case 2: 2216 assert(vtmp1 == xnoreg, ""); 2217 assert(vtmp2 == xnoreg, ""); 2218 unorderedReduce2D(opcode, dst, src); 2219 break; 2220 case 4: 2221 assert(vtmp2 == xnoreg, ""); 2222 unorderedReduce4D(opcode, dst, src, vtmp1); 2223 break; 2224 case 8: 2225 unorderedReduce8D(opcode, dst, 
src, vtmp1, vtmp2); 2226 break; 2227 default: assert(false, "wrong vector length"); 2228 } 2229 } 2230 2231 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2232 if (opcode == Op_AddReductionVI) { 2233 if (vtmp1 != src2) { 2234 movdqu(vtmp1, src2); 2235 } 2236 phaddd(vtmp1, vtmp1); 2237 } else { 2238 pshufd(vtmp1, src2, 0x1); 2239 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2240 } 2241 movdl(vtmp2, src1); 2242 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2243 movdl(dst, vtmp1); 2244 } 2245 2246 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2247 if (opcode == Op_AddReductionVI) { 2248 if (vtmp1 != src2) { 2249 movdqu(vtmp1, src2); 2250 } 2251 phaddd(vtmp1, src2); 2252 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2253 } else { 2254 pshufd(vtmp2, src2, 0xE); 2255 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2256 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2257 } 2258 } 2259 2260 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 if (opcode == Op_AddReductionVI) { 2262 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2263 vextracti128_high(vtmp2, vtmp1); 2264 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2265 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2266 } else { 2267 vextracti128_high(vtmp1, src2); 2268 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2269 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2270 } 2271 } 2272 2273 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2274 vextracti64x4_high(vtmp2, src2); 2275 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2276 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2277 } 2278 2279 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2280 pshufd(vtmp2, src2, 0x1); 2281 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2282 movdqu(vtmp1, vtmp2); 2283 psrldq(vtmp1, 2); 2284 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2285 movdqu(vtmp2, vtmp1); 2286 psrldq(vtmp2, 1); 2287 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2288 movdl(vtmp2, src1); 2289 pmovsxbd(vtmp1, vtmp1); 2290 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2291 pextrb(dst, vtmp1, 0x0); 2292 movsbl(dst, dst); 2293 } 2294 2295 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2296 pshufd(vtmp1, src2, 0xE); 2297 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2298 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2299 } 2300 2301 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2302 vextracti128_high(vtmp2, src2); 2303 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2304 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2305 } 2306 2307 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2308 vextracti64x4_high(vtmp1, src2); 2309 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2310 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2311 } 2312 2313 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister 
src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2314 pmovsxbw(vtmp2, src2); 2315 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2316 } 2317 2318 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2319 if (UseAVX > 1) { 2320 int vector_len = Assembler::AVX_256bit; 2321 vpmovsxbw(vtmp1, src2, vector_len); 2322 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2323 } else { 2324 pmovsxbw(vtmp2, src2); 2325 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2326 pshufd(vtmp2, src2, 0x1); 2327 pmovsxbw(vtmp2, src2); 2328 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2329 } 2330 } 2331 2332 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2333 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2334 int vector_len = Assembler::AVX_512bit; 2335 vpmovsxbw(vtmp1, src2, vector_len); 2336 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2337 } else { 2338 assert(UseAVX >= 2,"Should not reach here."); 2339 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2340 vextracti128_high(vtmp2, src2); 2341 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2342 } 2343 } 2344 2345 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2346 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2347 vextracti64x4_high(vtmp2, src2); 2348 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2349 } 2350 2351 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2352 if (opcode == Op_AddReductionVI) { 2353 if (vtmp1 != src2) { 2354 movdqu(vtmp1, src2); 2355 } 2356 phaddw(vtmp1, vtmp1); 2357 phaddw(vtmp1, vtmp1); 2358 } else { 2359 pshufd(vtmp2, src2, 0x1); 2360 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2361 movdqu(vtmp1, vtmp2); 2362 psrldq(vtmp1, 2); 2363 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2364 } 2365 movdl(vtmp2, src1); 2366 pmovsxwd(vtmp1, vtmp1); 2367 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2368 pextrw(dst, vtmp1, 0x0); 2369 movswl(dst, dst); 2370 } 2371 2372 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2373 if (opcode == Op_AddReductionVI) { 2374 if (vtmp1 != src2) { 2375 movdqu(vtmp1, src2); 2376 } 2377 phaddw(vtmp1, src2); 2378 } else { 2379 pshufd(vtmp1, src2, 0xE); 2380 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2381 } 2382 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2383 } 2384 2385 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2386 if (opcode == Op_AddReductionVI) { 2387 int vector_len = Assembler::AVX_256bit; 2388 vphaddw(vtmp2, src2, src2, vector_len); 2389 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2390 } else { 2391 vextracti128_high(vtmp2, src2); 2392 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2393 } 2394 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2395 } 2396 2397 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2398 int vector_len = Assembler::AVX_256bit; 2399 vextracti64x4_high(vtmp1, src2); 2400 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2401 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2402 } 2403 2404 void 
C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2405 pshufd(vtmp2, src2, 0xE); 2406 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2407 movdq(vtmp1, src1); 2408 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2409 movdq(dst, vtmp1); 2410 } 2411 2412 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2413 vextracti128_high(vtmp1, src2); 2414 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2415 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2416 } 2417 2418 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2419 vextracti64x4_high(vtmp2, src2); 2420 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2421 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2422 } 2423 2424 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2425 mov64(temp, -1L); 2426 bzhiq(temp, temp, len); 2427 kmovql(dst, temp); 2428 } 2429 2430 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2431 reduce_operation_128(T_FLOAT, opcode, dst, src); 2432 pshufd(vtmp, src, 0x1); 2433 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2434 } 2435 2436 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2437 reduce2F(opcode, dst, src, vtmp); 2438 pshufd(vtmp, src, 0x2); 2439 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2440 pshufd(vtmp, src, 0x3); 2441 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2442 } 2443 2444 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2445 reduce4F(opcode, dst, src, vtmp2); 2446 vextractf128_high(vtmp2, src); 2447 reduce4F(opcode, dst, vtmp2, vtmp1); 2448 } 2449 2450 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2451 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2452 vextracti64x4_high(vtmp1, src); 2453 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2454 } 2455 2456 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2457 pshufd(dst, src, 0x1); 2458 reduce_operation_128(T_FLOAT, opcode, dst, src); 2459 } 2460 2461 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2462 pshufd(vtmp, src, 0xE); 2463 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2464 unorderedReduce2F(opcode, dst, vtmp); 2465 } 2466 2467 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2468 vextractf128_high(vtmp1, src); 2469 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2470 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2471 } 2472 2473 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2474 vextractf64x4_high(vtmp2, src); 2475 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2476 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2477 } 2478 2479 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2480 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2481 pshufd(vtmp, src, 0xE); 2482 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2483 } 2484 2485 void 
C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2486 reduce2D(opcode, dst, src, vtmp2); 2487 vextractf128_high(vtmp2, src); 2488 reduce2D(opcode, dst, vtmp2, vtmp1); 2489 } 2490 2491 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2492 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2493 vextracti64x4_high(vtmp1, src); 2494 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2495 } 2496 2497 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2498 pshufd(dst, src, 0xE); 2499 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2500 } 2501 2502 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2503 vextractf128_high(vtmp, src); 2504 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2505 unorderedReduce2D(opcode, dst, vtmp); 2506 } 2507 2508 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2509 vextractf64x4_high(vtmp2, src); 2510 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2511 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2512 } 2513 2514 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2515 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2516 } 2517 2518 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2519 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2520 } 2521 2522 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2523 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2524 } 2525 2526 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2527 int vec_enc) { 2528 switch(elem_bt) { 2529 case T_INT: 2530 case T_FLOAT: 2531 vmaskmovps(dst, src, mask, vec_enc); 2532 break; 2533 case T_LONG: 2534 case T_DOUBLE: 2535 vmaskmovpd(dst, src, mask, vec_enc); 2536 break; 2537 default: 2538 fatal("Unsupported type %s", type2name(elem_bt)); 2539 break; 2540 } 2541 } 2542 2543 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2544 int vec_enc) { 2545 switch(elem_bt) { 2546 case T_INT: 2547 case T_FLOAT: 2548 vmaskmovps(dst, src, mask, vec_enc); 2549 break; 2550 case T_LONG: 2551 case T_DOUBLE: 2552 vmaskmovpd(dst, src, mask, vec_enc); 2553 break; 2554 default: 2555 fatal("Unsupported type %s", type2name(elem_bt)); 2556 break; 2557 } 2558 } 2559 2560 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2561 XMMRegister dst, XMMRegister src, 2562 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2563 XMMRegister xmm_0, XMMRegister xmm_1) { 2564 const int permconst[] = {1, 14}; 2565 XMMRegister wsrc = src; 2566 XMMRegister wdst = xmm_0; 2567 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2568 2569 int vlen_enc = Assembler::AVX_128bit; 2570 if (vlen == 16) { 2571 vlen_enc = Assembler::AVX_256bit; 2572 } 2573 2574 for (int i = log2(vlen) - 1; i >=0; i--) { 2575 if (i == 0 && !is_dst_valid) { 2576 wdst = dst; 2577 } 2578 if (i == 3) { 2579 vextracti64x4_high(wtmp, wsrc); 2580 } else if (i == 2) { 2581 vextracti128_high(wtmp, wsrc); 2582 } else { // i = [0,1] 2583 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2584 } 2585 2586 if (VM_Version::supports_avx10_2()) { 2587 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2588 } else { 2589 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2590 } 2591 wsrc = wdst; 2592 vlen_enc = Assembler::AVX_128bit; 2593 } 2594 if (is_dst_valid) { 2595 if (VM_Version::supports_avx10_2()) { 2596 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2597 } else { 2598 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2599 } 2600 } 2601 } 2602 2603 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2604 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2605 XMMRegister xmm_0, XMMRegister xmm_1) { 2606 XMMRegister wsrc = src; 2607 XMMRegister wdst = xmm_0; 2608 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2609 int vlen_enc = Assembler::AVX_128bit; 2610 if (vlen == 8) { 2611 vlen_enc = Assembler::AVX_256bit; 2612 } 2613 for (int i = log2(vlen) - 1; i >=0; i--) { 2614 if (i == 0 && !is_dst_valid) { 2615 wdst = dst; 2616 } 2617 if (i == 1) { 2618 vextracti128_high(wtmp, wsrc); 2619 } else if (i == 2) { 2620 vextracti64x4_high(wtmp, wsrc); 2621 } else { 2622 assert(i == 0, "%d", i); 2623 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2624 } 2625 2626 if (VM_Version::supports_avx10_2()) { 2627 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2628 } else { 2629 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2630 } 2631 2632 wsrc = wdst; 2633 vlen_enc = Assembler::AVX_128bit; 2634 } 2635 2636 if (is_dst_valid) { 2637 if (VM_Version::supports_avx10_2()) { 2638 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2639 } else { 2640 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2641 } 2642 } 2643 } 2644 2645 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2646 switch (bt) { 2647 case T_BYTE: pextrb(dst, src, idx); break; 2648 case T_SHORT: pextrw(dst, src, idx); break; 2649 case T_INT: pextrd(dst, src, idx); break; 2650 case T_LONG: pextrq(dst, src, idx); break; 2651 2652 default: 2653 assert(false,"Should not reach here."); 2654 break; 2655 } 2656 } 2657 2658 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2659 int esize = type2aelembytes(typ); 2660 int elem_per_lane = 16/esize; 2661 int lane = elemindex / elem_per_lane; 2662 int eindex = elemindex % elem_per_lane; 2663 2664 if (lane >= 2) { 2665 assert(UseAVX > 2, "required"); 2666 vextractf32x4(dst, src, lane & 3); 2667 return dst; 2668 } else if (lane > 0) { 2669 assert(UseAVX > 0, "required"); 2670 vextractf128(dst, src, lane); 2671 return dst; 2672 } else { 2673 return src; 2674 } 2675 } 2676 2677 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2678 if (typ == T_BYTE) { 2679 movsbl(dst, dst); 2680 } else if (typ == T_SHORT) { 2681 movswl(dst, dst); 2682 } 2683 } 2684 2685 void C2_MacroAssembler::get_elem(BasicType typ, Register 
dst, XMMRegister src, int elemindex) { 2686 int esize = type2aelembytes(typ); 2687 int elem_per_lane = 16/esize; 2688 int eindex = elemindex % elem_per_lane; 2689 assert(is_integral_type(typ),"required"); 2690 2691 if (eindex == 0) { 2692 if (typ == T_LONG) { 2693 movq(dst, src); 2694 } else { 2695 movdl(dst, src); 2696 movsxl(typ, dst); 2697 } 2698 } else { 2699 extract(typ, dst, src, eindex); 2700 movsxl(typ, dst); 2701 } 2702 } 2703 2704 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2705 int esize = type2aelembytes(typ); 2706 int elem_per_lane = 16/esize; 2707 int eindex = elemindex % elem_per_lane; 2708 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2709 2710 if (eindex == 0) { 2711 movq(dst, src); 2712 } else { 2713 if (typ == T_FLOAT) { 2714 if (UseAVX == 0) { 2715 movdqu(dst, src); 2716 shufps(dst, dst, eindex); 2717 } else { 2718 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2719 } 2720 } else { 2721 if (UseAVX == 0) { 2722 movdqu(dst, src); 2723 psrldq(dst, eindex*esize); 2724 } else { 2725 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2726 } 2727 movq(dst, dst); 2728 } 2729 } 2730 // Zero upper bits 2731 if (typ == T_FLOAT) { 2732 if (UseAVX == 0) { 2733 assert(vtmp != xnoreg, "required."); 2734 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2735 pand(dst, vtmp); 2736 } else { 2737 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2738 } 2739 } 2740 } 2741 2742 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2743 switch(typ) { 2744 case T_BYTE: 2745 case T_BOOLEAN: 2746 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2747 break; 2748 case T_SHORT: 2749 case T_CHAR: 2750 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2751 break; 2752 case T_INT: 2753 case T_FLOAT: 2754 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2755 break; 2756 case T_LONG: 2757 case T_DOUBLE: 2758 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2759 break; 2760 default: 2761 assert(false,"Should not reach here."); 2762 break; 2763 } 2764 } 2765 2766 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2767 assert(rscratch != noreg || always_reachable(src2), "missing"); 2768 2769 switch(typ) { 2770 case T_BOOLEAN: 2771 case T_BYTE: 2772 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2773 break; 2774 case T_CHAR: 2775 case T_SHORT: 2776 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2777 break; 2778 case T_INT: 2779 case T_FLOAT: 2780 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2781 break; 2782 case T_LONG: 2783 case T_DOUBLE: 2784 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2785 break; 2786 default: 2787 assert(false,"Should not reach here."); 2788 break; 2789 } 2790 } 2791 2792 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2793 switch(typ) { 2794 case T_BYTE: 2795 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 
2796 break; 2797 case T_SHORT: 2798 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2799 break; 2800 case T_INT: 2801 case T_FLOAT: 2802 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2803 break; 2804 case T_LONG: 2805 case T_DOUBLE: 2806 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2807 break; 2808 default: 2809 assert(false,"Should not reach here."); 2810 break; 2811 } 2812 } 2813 2814 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2815 assert(vlen_in_bytes <= 32, ""); 2816 int esize = type2aelembytes(bt); 2817 if (vlen_in_bytes == 32) { 2818 assert(vtmp == xnoreg, "required."); 2819 if (esize >= 4) { 2820 vtestps(src1, src2, AVX_256bit); 2821 } else { 2822 vptest(src1, src2, AVX_256bit); 2823 } 2824 return; 2825 } 2826 if (vlen_in_bytes < 16) { 2827 // Duplicate the lower part to fill the whole register, 2828 // Don't need to do so for src2 2829 assert(vtmp != xnoreg, "required"); 2830 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2831 pshufd(vtmp, src1, shuffle_imm); 2832 } else { 2833 assert(vtmp == xnoreg, "required"); 2834 vtmp = src1; 2835 } 2836 if (esize >= 4 && VM_Version::supports_avx()) { 2837 vtestps(vtmp, src2, AVX_128bit); 2838 } else { 2839 ptest(vtmp, src2); 2840 } 2841 } 2842 2843 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2844 #ifdef ASSERT 2845 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2846 bool is_bw_supported = VM_Version::supports_avx512bw(); 2847 if (is_bw && !is_bw_supported) { 2848 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2849 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2850 "XMM register should be 0-15"); 2851 } 2852 #endif // ASSERT 2853 switch (elem_bt) { 2854 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2855 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2856 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2857 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2858 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2859 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2860 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2861 } 2862 } 2863 2864 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2865 assert(UseAVX >= 2, "required"); 2866 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2867 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2868 if ((UseAVX > 2) && 2869 (!is_bw || VM_Version::supports_avx512bw()) && 2870 (!is_vl || VM_Version::supports_avx512vl())) { 2871 switch (elem_bt) { 2872 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2873 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2874 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2875 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2876 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2877 } 2878 } else { 2879 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2880 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2881 switch (elem_bt) { 2882 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2883 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2884 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2885 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, 
vlen_enc); return; 2886 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2887 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2888 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2889 } 2890 } 2891 } 2892 2893 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2894 switch (to_elem_bt) { 2895 case T_SHORT: 2896 vpmovsxbw(dst, src, vlen_enc); 2897 break; 2898 case T_INT: 2899 vpmovsxbd(dst, src, vlen_enc); 2900 break; 2901 case T_FLOAT: 2902 vpmovsxbd(dst, src, vlen_enc); 2903 vcvtdq2ps(dst, dst, vlen_enc); 2904 break; 2905 case T_LONG: 2906 vpmovsxbq(dst, src, vlen_enc); 2907 break; 2908 case T_DOUBLE: { 2909 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2910 vpmovsxbd(dst, src, mid_vlen_enc); 2911 vcvtdq2pd(dst, dst, vlen_enc); 2912 break; 2913 } 2914 default: 2915 fatal("Unsupported type %s", type2name(to_elem_bt)); 2916 break; 2917 } 2918 } 2919 2920 //------------------------------------------------------------------------------------------- 2921 2922 // IndexOf for constant substrings with size >= 8 chars 2923 // which don't need to be loaded through stack. 2924 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2925 Register cnt1, Register cnt2, 2926 int int_cnt2, Register result, 2927 XMMRegister vec, Register tmp, 2928 int ae) { 2929 ShortBranchVerifier sbv(this); 2930 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2931 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2932 2933 // This method uses the pcmpestri instruction with bound registers 2934 // inputs: 2935 // xmm - substring 2936 // rax - substring length (elements count) 2937 // mem - scanned string 2938 // rdx - string length (elements count) 2939 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2940 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2941 // outputs: 2942 // rcx - matched index in string 2943 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2944 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2945 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2946 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2947 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2948 2949 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2950 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2951 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2952 2953 // Note, inline_string_indexOf() generates checks: 2954 // if (substr.count > string.count) return -1; 2955 // if (substr.count == 0) return 0; 2956 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2957 2958 // Load substring. 
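  // (For the UL case the Latin1 substring bytes are zero-extended to 16-bit
  //  chars on load so they can be compared directly against the UTF-16 string,
  //  roughly vec[i] = (jchar)substr[i] for the first 8 bytes; for LL/UU the
  //  first 16 bytes of the substring are loaded unchanged.)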
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring

    addptr(result, (1<<scale1));
  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND); // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Fewer elements left than the substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of the substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
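    // From this point cnt2 is kept as a negative element offset from the end
    // of the substring and is stepped towards zero in stride-sized chunks, so
    // reaching a non-negative value means the whole substring compared equal.
    // Roughly:
    //
    //   for (off = stride - int_cnt2; off < 0; off += stride) {
    //     compare the next 16-byte chunk of the substring (taken relative to
    //     its end) against the string at the candidate position;
    //   }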
3049 negptr(cnt2); 3050 addptr(cnt2, stride); 3051 3052 bind(SCAN_SUBSTR); 3053 subl(cnt1, stride); 3054 cmpl(cnt2, -stride); // Do not read beyond substring 3055 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 3056 // Back-up strings to avoid reading beyond substring: 3057 // cnt1 = cnt1 - cnt2 + 8 3058 addl(cnt1, cnt2); // cnt2 is negative 3059 addl(cnt1, stride); 3060 movl(cnt2, stride); negptr(cnt2); 3061 bind(CONT_SCAN_SUBSTR); 3062 if (int_cnt2 < (int)G) { 3063 int tail_off1 = int_cnt2<<scale1; 3064 int tail_off2 = int_cnt2<<scale2; 3065 if (ae == StrIntrinsicNode::UL) { 3066 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 3067 } else { 3068 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 3069 } 3070 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 3071 } else { 3072 // calculate index in register to avoid integer overflow (int_cnt2*2) 3073 movl(tmp, int_cnt2); 3074 addptr(tmp, cnt2); 3075 if (ae == StrIntrinsicNode::UL) { 3076 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3077 } else { 3078 movdqu(vec, Address(str2, tmp, scale2, 0)); 3079 } 3080 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3081 } 3082 // Need to reload strings pointers if not matched whole vector 3083 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3084 addptr(cnt2, stride); 3085 jcc(Assembler::negative, SCAN_SUBSTR); 3086 // Fall through if found full substring 3087 3088 } // (int_cnt2 > 8) 3089 3090 bind(RET_FOUND); 3091 // Found result if we matched full small substring. 3092 // Compute substr offset 3093 subptr(result, str1); 3094 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3095 shrl(result, 1); // index 3096 } 3097 bind(EXIT); 3098 3099 } // string_indexofC8 3100 3101 // Small strings are loaded through stack if they cross page boundary. 3102 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3103 Register cnt1, Register cnt2, 3104 int int_cnt2, Register result, 3105 XMMRegister vec, Register tmp, 3106 int ae) { 3107 ShortBranchVerifier sbv(this); 3108 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3109 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3110 3111 // 3112 // int_cnt2 is length of small (< 8 chars) constant substring 3113 // or (-1) for non constant substring in which case its length 3114 // is in cnt2 register. 3115 // 3116 // Note, inline_string_indexOf() generates checks: 3117 // if (substr.count > string.count) return -1; 3118 // if (substr.count == 0) return 0; 3119 // 3120 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3121 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3122 // This method uses the pcmpestri instruction with bound registers 3123 // inputs: 3124 // xmm - substring 3125 // rax - substring length (elements count) 3126 // mem - scanned string 3127 // rdx - string length (elements count) 3128 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3129 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3130 // outputs: 3131 // rcx - matched index in string 3132 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3133 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3134 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3135 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3136 3137 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3138 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3139 FOUND_CANDIDATE; 3140 3141 { //======================================================== 3142 // We don't know where these strings are located 3143 // and we can't read beyond them. Load them through stack. 3144 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3145 3146 movptr(tmp, rsp); // save old SP 3147 3148 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3149 if (int_cnt2 == (1>>scale2)) { // One byte 3150 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3151 load_unsigned_byte(result, Address(str2, 0)); 3152 movdl(vec, result); // move 32 bits 3153 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3154 // Not enough header space in 32-bit VM: 12+3 = 15. 3155 movl(result, Address(str2, -1)); 3156 shrl(result, 8); 3157 movdl(vec, result); // move 32 bits 3158 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3159 load_unsigned_short(result, Address(str2, 0)); 3160 movdl(vec, result); // move 32 bits 3161 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3162 movdl(vec, Address(str2, 0)); // move 32 bits 3163 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3164 movq(vec, Address(str2, 0)); // move 64 bits 3165 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3166 // Array header size is 12 bytes in 32-bit VM 3167 // + 6 bytes for 3 chars == 18 bytes, 3168 // enough space to load vec and shift. 3169 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3170 if (ae == StrIntrinsicNode::UL) { 3171 int tail_off = int_cnt2-8; 3172 pmovzxbw(vec, Address(str2, tail_off)); 3173 psrldq(vec, -2*tail_off); 3174 } 3175 else { 3176 int tail_off = int_cnt2*(1<<scale2); 3177 movdqu(vec, Address(str2, tail_off-16)); 3178 psrldq(vec, 16-tail_off); 3179 } 3180 } 3181 } else { // not constant substring 3182 cmpl(cnt2, stride); 3183 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3184 3185 // We can read beyond string if srt+16 does not cross page boundary 3186 // since heaps are aligned and mapped by pages. 3187 assert(os::vm_page_size() < (int)G, "default page should be small"); 3188 movl(result, str2); // We need only low 32 bits 3189 andl(result, ((int)os::vm_page_size()-1)); 3190 cmpl(result, ((int)os::vm_page_size()-16)); 3191 jccb(Assembler::belowEqual, CHECK_STR); 3192 3193 // Move small strings to stack to allow load 16 bytes into vec. 3194 subptr(rsp, 16); 3195 int stk_offset = wordSize-(1<<scale2); 3196 push(cnt2); 3197 3198 bind(COPY_SUBSTR); 3199 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3200 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3201 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3202 } else if (ae == StrIntrinsicNode::UU) { 3203 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3204 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3205 } 3206 decrement(cnt2); 3207 jccb(Assembler::notZero, COPY_SUBSTR); 3208 3209 pop(cnt2); 3210 movptr(str2, rsp); // New substring address 3211 } // non constant 3212 3213 bind(CHECK_STR); 3214 cmpl(cnt1, stride); 3215 jccb(Assembler::aboveEqual, BIG_STRINGS); 3216 3217 // Check cross page boundary. 
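    // A 16-byte load at str1 is safe only if it cannot run onto the next
    // (possibly unmapped) page, i.e. (str1 & (page_size-1)) <= page_size-16;
    // otherwise the short string is copied to the stack below.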
3218 movl(result, str1); // We need only low 32 bits 3219 andl(result, ((int)os::vm_page_size()-1)); 3220 cmpl(result, ((int)os::vm_page_size()-16)); 3221 jccb(Assembler::belowEqual, BIG_STRINGS); 3222 3223 subptr(rsp, 16); 3224 int stk_offset = -(1<<scale1); 3225 if (int_cnt2 < 0) { // not constant 3226 push(cnt2); 3227 stk_offset += wordSize; 3228 } 3229 movl(cnt2, cnt1); 3230 3231 bind(COPY_STR); 3232 if (ae == StrIntrinsicNode::LL) { 3233 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3234 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3235 } else { 3236 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3237 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3238 } 3239 decrement(cnt2); 3240 jccb(Assembler::notZero, COPY_STR); 3241 3242 if (int_cnt2 < 0) { // not constant 3243 pop(cnt2); 3244 } 3245 movptr(str1, rsp); // New string address 3246 3247 bind(BIG_STRINGS); 3248 // Load substring. 3249 if (int_cnt2 < 0) { // -1 3250 if (ae == StrIntrinsicNode::UL) { 3251 pmovzxbw(vec, Address(str2, 0)); 3252 } else { 3253 movdqu(vec, Address(str2, 0)); 3254 } 3255 push(cnt2); // substr count 3256 push(str2); // substr addr 3257 push(str1); // string addr 3258 } else { 3259 // Small (< 8 chars) constant substrings are loaded already. 3260 movl(cnt2, int_cnt2); 3261 } 3262 push(tmp); // original SP 3263 3264 } // Finished loading 3265 3266 //======================================================== 3267 // Start search 3268 // 3269 3270 movptr(result, str1); // string addr 3271 3272 if (int_cnt2 < 0) { // Only for non constant substring 3273 jmpb(SCAN_TO_SUBSTR); 3274 3275 // SP saved at sp+0 3276 // String saved at sp+1*wordSize 3277 // Substr saved at sp+2*wordSize 3278 // Substr count saved at sp+3*wordSize 3279 3280 // Reload substr for rescan, this code 3281 // is executed only for large substrings (> 8 chars) 3282 bind(RELOAD_SUBSTR); 3283 movptr(str2, Address(rsp, 2*wordSize)); 3284 movl(cnt2, Address(rsp, 3*wordSize)); 3285 if (ae == StrIntrinsicNode::UL) { 3286 pmovzxbw(vec, Address(str2, 0)); 3287 } else { 3288 movdqu(vec, Address(str2, 0)); 3289 } 3290 // We came here after the beginning of the substring was 3291 // matched but the rest of it was not so we need to search 3292 // again. Start from the next element after the previous match. 3293 subptr(str1, result); // Restore counter 3294 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3295 shrl(str1, 1); 3296 } 3297 addl(cnt1, str1); 3298 decrementl(cnt1); // Shift to next element 3299 cmpl(cnt1, cnt2); 3300 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3301 3302 addptr(result, (1<<scale1)); 3303 } // non constant 3304 3305 // Scan string for start of substr in 16-byte vectors 3306 bind(SCAN_TO_SUBSTR); 3307 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3308 pcmpestri(vec, Address(result, 0), mode); 3309 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3310 subl(cnt1, stride); 3311 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3312 cmpl(cnt1, cnt2); 3313 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3314 addptr(result, 16); 3315 3316 bind(ADJUST_STR); 3317 cmpl(cnt1, stride); // Do not read beyond string 3318 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3319 // Back-up string to avoid reading beyond string. 
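    // Fewer than 'stride' elements of the string remain: back 'result' up so the
    // next 16-byte load ends exactly at the end of the string, and treat it as a
    // full vector (overlapping already-scanned elements is acceptable here).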
3320 lea(result, Address(result, cnt1, scale1, -16)); 3321 movl(cnt1, stride); 3322 jmpb(SCAN_TO_SUBSTR); 3323 3324 // Found a potential substr 3325 bind(FOUND_CANDIDATE); 3326 // After pcmpestri tmp(rcx) contains matched element index 3327 3328 // Make sure string is still long enough 3329 subl(cnt1, tmp); 3330 cmpl(cnt1, cnt2); 3331 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3332 // Left less then substring. 3333 3334 bind(RET_NOT_FOUND); 3335 movl(result, -1); 3336 jmp(CLEANUP); 3337 3338 bind(FOUND_SUBSTR); 3339 // Compute start addr of substr 3340 lea(result, Address(result, tmp, scale1)); 3341 if (int_cnt2 > 0) { // Constant substring 3342 // Repeat search for small substring (< 8 chars) 3343 // from new point without reloading substring. 3344 // Have to check that we don't read beyond string. 3345 cmpl(tmp, stride-int_cnt2); 3346 jccb(Assembler::greater, ADJUST_STR); 3347 // Fall through if matched whole substring. 3348 } else { // non constant 3349 assert(int_cnt2 == -1, "should be != 0"); 3350 3351 addl(tmp, cnt2); 3352 // Found result if we matched whole substring. 3353 cmpl(tmp, stride); 3354 jcc(Assembler::lessEqual, RET_FOUND); 3355 3356 // Repeat search for small substring (<= 8 chars) 3357 // from new point 'str1' without reloading substring. 3358 cmpl(cnt2, stride); 3359 // Have to check that we don't read beyond string. 3360 jccb(Assembler::lessEqual, ADJUST_STR); 3361 3362 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3363 // Compare the rest of substring (> 8 chars). 3364 movptr(str1, result); 3365 3366 cmpl(tmp, cnt2); 3367 // First 8 chars are already matched. 3368 jccb(Assembler::equal, CHECK_NEXT); 3369 3370 bind(SCAN_SUBSTR); 3371 pcmpestri(vec, Address(str1, 0), mode); 3372 // Need to reload strings pointers if not matched whole vector 3373 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3374 3375 bind(CHECK_NEXT); 3376 subl(cnt2, stride); 3377 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3378 addptr(str1, 16); 3379 if (ae == StrIntrinsicNode::UL) { 3380 addptr(str2, 8); 3381 } else { 3382 addptr(str2, 16); 3383 } 3384 subl(cnt1, stride); 3385 cmpl(cnt2, stride); // Do not read beyond substring 3386 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3387 // Back-up strings to avoid reading beyond substring. 
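    // Fewer than 'stride' substring elements remain: back str1/str2 up so the
    // last vector load ends exactly at the end of the substring, and fix up the
    // counters to match (cnt1 = cnt1 - cnt2 + stride, cnt2 = stride).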
3388 3389 if (ae == StrIntrinsicNode::UL) { 3390 lea(str2, Address(str2, cnt2, scale2, -8)); 3391 lea(str1, Address(str1, cnt2, scale1, -16)); 3392 } else { 3393 lea(str2, Address(str2, cnt2, scale2, -16)); 3394 lea(str1, Address(str1, cnt2, scale1, -16)); 3395 } 3396 subl(cnt1, cnt2); 3397 movl(cnt2, stride); 3398 addl(cnt1, stride); 3399 bind(CONT_SCAN_SUBSTR); 3400 if (ae == StrIntrinsicNode::UL) { 3401 pmovzxbw(vec, Address(str2, 0)); 3402 } else { 3403 movdqu(vec, Address(str2, 0)); 3404 } 3405 jmp(SCAN_SUBSTR); 3406 3407 bind(RET_FOUND_LONG); 3408 movptr(str1, Address(rsp, wordSize)); 3409 } // non constant 3410 3411 bind(RET_FOUND); 3412 // Compute substr offset 3413 subptr(result, str1); 3414 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3415 shrl(result, 1); // index 3416 } 3417 bind(CLEANUP); 3418 pop(rsp); // restore SP 3419 3420 } // string_indexof 3421 3422 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3423 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3424 ShortBranchVerifier sbv(this); 3425 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3426 3427 int stride = 8; 3428 3429 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3430 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3431 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3432 FOUND_SEQ_CHAR, DONE_LABEL; 3433 3434 movptr(result, str1); 3435 if (UseAVX >= 2) { 3436 cmpl(cnt1, stride); 3437 jcc(Assembler::less, SCAN_TO_CHAR); 3438 cmpl(cnt1, 2*stride); 3439 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3440 movdl(vec1, ch); 3441 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3442 vpxor(vec2, vec2); 3443 movl(tmp, cnt1); 3444 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3445 andl(cnt1,0x0000000F); //tail count (in chars) 3446 3447 bind(SCAN_TO_16_CHAR_LOOP); 3448 vmovdqu(vec3, Address(result, 0)); 3449 vpcmpeqw(vec3, vec3, vec1, 1); 3450 vptest(vec2, vec3); 3451 jcc(Assembler::carryClear, FOUND_CHAR); 3452 addptr(result, 32); 3453 subl(tmp, 2*stride); 3454 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3455 jmp(SCAN_TO_8_CHAR); 3456 bind(SCAN_TO_8_CHAR_INIT); 3457 movdl(vec1, ch); 3458 pshuflw(vec1, vec1, 0x00); 3459 pshufd(vec1, vec1, 0); 3460 pxor(vec2, vec2); 3461 } 3462 bind(SCAN_TO_8_CHAR); 3463 cmpl(cnt1, stride); 3464 jcc(Assembler::less, SCAN_TO_CHAR); 3465 if (UseAVX < 2) { 3466 movdl(vec1, ch); 3467 pshuflw(vec1, vec1, 0x00); 3468 pshufd(vec1, vec1, 0); 3469 pxor(vec2, vec2); 3470 } 3471 movl(tmp, cnt1); 3472 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3473 andl(cnt1,0x00000007); //tail count (in chars) 3474 3475 bind(SCAN_TO_8_CHAR_LOOP); 3476 movdqu(vec3, Address(result, 0)); 3477 pcmpeqw(vec3, vec1); 3478 ptest(vec2, vec3); 3479 jcc(Assembler::carryClear, FOUND_CHAR); 3480 addptr(result, 16); 3481 subl(tmp, stride); 3482 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3483 bind(SCAN_TO_CHAR); 3484 testl(cnt1, cnt1); 3485 jcc(Assembler::zero, RET_NOT_FOUND); 3486 bind(SCAN_TO_CHAR_LOOP); 3487 load_unsigned_short(tmp, Address(result, 0)); 3488 cmpl(ch, tmp); 3489 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3490 addptr(result, 2); 3491 subl(cnt1, 1); 3492 jccb(Assembler::zero, RET_NOT_FOUND); 3493 jmp(SCAN_TO_CHAR_LOOP); 3494 3495 bind(RET_NOT_FOUND); 3496 movl(result, -1); 3497 jmpb(DONE_LABEL); 3498 3499 bind(FOUND_CHAR); 3500 if (UseAVX >= 2) { 3501 vpmovmskb(tmp, vec3); 3502 } else { 3503 pmovmskb(tmp, vec3); 3504 } 3505 bsfl(ch, tmp); 3506 addptr(result, ch); 3507 3508 bind(FOUND_SEQ_CHAR); 3509 
subptr(result, str1); 3510 shrl(result, 1); 3511 3512 bind(DONE_LABEL); 3513 } // string_indexof_char 3514 3515 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3516 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3517 ShortBranchVerifier sbv(this); 3518 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3519 3520 int stride = 16; 3521 3522 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3523 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3524 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3525 FOUND_SEQ_CHAR, DONE_LABEL; 3526 3527 movptr(result, str1); 3528 if (UseAVX >= 2) { 3529 cmpl(cnt1, stride); 3530 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3531 cmpl(cnt1, stride*2); 3532 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3533 movdl(vec1, ch); 3534 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3535 vpxor(vec2, vec2); 3536 movl(tmp, cnt1); 3537 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3538 andl(cnt1,0x0000001F); //tail count (in chars) 3539 3540 bind(SCAN_TO_32_CHAR_LOOP); 3541 vmovdqu(vec3, Address(result, 0)); 3542 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3543 vptest(vec2, vec3); 3544 jcc(Assembler::carryClear, FOUND_CHAR); 3545 addptr(result, 32); 3546 subl(tmp, stride*2); 3547 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3548 jmp(SCAN_TO_16_CHAR); 3549 3550 bind(SCAN_TO_16_CHAR_INIT); 3551 movdl(vec1, ch); 3552 pxor(vec2, vec2); 3553 pshufb(vec1, vec2); 3554 } 3555 3556 bind(SCAN_TO_16_CHAR); 3557 cmpl(cnt1, stride); 3558 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3559 if (UseAVX < 2) { 3560 movdl(vec1, ch); 3561 pxor(vec2, vec2); 3562 pshufb(vec1, vec2); 3563 } 3564 movl(tmp, cnt1); 3565 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3566 andl(cnt1,0x0000000F); //tail count (in bytes) 3567 3568 bind(SCAN_TO_16_CHAR_LOOP); 3569 movdqu(vec3, Address(result, 0)); 3570 pcmpeqb(vec3, vec1); 3571 ptest(vec2, vec3); 3572 jcc(Assembler::carryClear, FOUND_CHAR); 3573 addptr(result, 16); 3574 subl(tmp, stride); 3575 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
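  // Vector loops are done; at most 15 tail bytes remain (in cnt1) and are
  // scanned one byte at a time below.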
3576 3577 bind(SCAN_TO_CHAR_INIT); 3578 testl(cnt1, cnt1); 3579 jcc(Assembler::zero, RET_NOT_FOUND); 3580 bind(SCAN_TO_CHAR_LOOP); 3581 load_unsigned_byte(tmp, Address(result, 0)); 3582 cmpl(ch, tmp); 3583 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3584 addptr(result, 1); 3585 subl(cnt1, 1); 3586 jccb(Assembler::zero, RET_NOT_FOUND); 3587 jmp(SCAN_TO_CHAR_LOOP); 3588 3589 bind(RET_NOT_FOUND); 3590 movl(result, -1); 3591 jmpb(DONE_LABEL); 3592 3593 bind(FOUND_CHAR); 3594 if (UseAVX >= 2) { 3595 vpmovmskb(tmp, vec3); 3596 } else { 3597 pmovmskb(tmp, vec3); 3598 } 3599 bsfl(ch, tmp); 3600 addptr(result, ch); 3601 3602 bind(FOUND_SEQ_CHAR); 3603 subptr(result, str1); 3604 3605 bind(DONE_LABEL); 3606 } // stringL_indexof_char 3607 3608 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3609 switch (eltype) { 3610 case T_BOOLEAN: return sizeof(jboolean); 3611 case T_BYTE: return sizeof(jbyte); 3612 case T_SHORT: return sizeof(jshort); 3613 case T_CHAR: return sizeof(jchar); 3614 case T_INT: return sizeof(jint); 3615 default: 3616 ShouldNotReachHere(); 3617 return -1; 3618 } 3619 } 3620 3621 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3622 switch (eltype) { 3623 // T_BOOLEAN used as surrogate for unsigned byte 3624 case T_BOOLEAN: movzbl(dst, src); break; 3625 case T_BYTE: movsbl(dst, src); break; 3626 case T_SHORT: movswl(dst, src); break; 3627 case T_CHAR: movzwl(dst, src); break; 3628 case T_INT: movl(dst, src); break; 3629 default: 3630 ShouldNotReachHere(); 3631 } 3632 } 3633 3634 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3635 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3636 } 3637 3638 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3639 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3640 } 3641 3642 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3643 const int vlen = Assembler::AVX_256bit; 3644 switch (eltype) { 3645 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3646 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3647 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3648 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3649 case T_INT: 3650 // do nothing 3651 break; 3652 default: 3653 ShouldNotReachHere(); 3654 } 3655 } 3656 3657 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3658 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3659 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3660 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3661 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3662 BasicType eltype) { 3663 ShortBranchVerifier sbv(this); 3664 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3665 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3666 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3667 3668 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3669 SHORT_UNROLLED_LOOP_EXIT, 3670 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3671 UNROLLED_VECTOR_LOOP_BEGIN, 3672 END; 3673 switch (eltype) { 3674 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3675 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3676 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3677 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3678 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3679 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3680 } 3681 3682 // For "renaming" for readibility of the code 3683 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3684 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3685 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3686 3687 const int elsize = arrays_hashcode_elsize(eltype); 3688 3689 /* 3690 if (cnt1 >= 2) { 3691 if (cnt1 >= 32) { 3692 UNROLLED VECTOR LOOP 3693 } 3694 UNROLLED SCALAR LOOP 3695 } 3696 SINGLE SCALAR 3697 */ 3698 3699 cmpl(cnt1, 32); 3700 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3701 3702 // cnt1 >= 32 && generate_vectorized_loop 3703 xorl(index, index); 3704 3705 // vresult = IntVector.zero(I256); 3706 for (int idx = 0; idx < 4; idx++) { 3707 vpxor(vresult[idx], vresult[idx]); 3708 } 3709 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3710 Register bound = tmp2; 3711 Register next = tmp3; 3712 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3713 movl(next, Address(tmp2, 0)); 3714 movdl(vnext, next); 3715 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3716 3717 // index = 0; 3718 // bound = cnt1 & ~(32 - 1); 3719 movl(bound, cnt1); 3720 andl(bound, ~(32 - 1)); 3721 // for (; index < bound; index += 32) { 3722 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3723 // result *= next; 3724 imull(result, next); 3725 // loop fission to upfront the cost of fetching from memory, OOO execution 3726 // can then hopefully do a better job of prefetching 3727 for (int idx = 0; idx < 4; idx++) { 3728 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3729 } 3730 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3731 for (int idx = 0; idx < 4; idx++) { 3732 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3733 arrays_hashcode_elvcast(vtmp[idx], eltype); 3734 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3735 } 3736 // index += 32; 3737 addl(index, 32); 3738 // index < bound; 3739 cmpl(index, bound); 3740 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3741 // } 3742 3743 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3744 subl(cnt1, bound); 3745 // release bound 3746 3747 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3748 for (int idx = 0; idx < 4; idx++) { 3749 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3750 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3751 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3752 } 3753 // result += vresult.reduceLanes(ADD); 3754 for (int idx = 0; idx < 4; idx++) { 3755 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3756 } 3757 3758 // } else if (cnt1 < 32) { 3759 3760 bind(SHORT_UNROLLED_BEGIN); 3761 // int i = 1; 3762 movl(index, 1); 3763 cmpl(index, cnt1); 3764 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3765 3766 // for (; i < cnt1 ; i += 2) { 3767 bind(SHORT_UNROLLED_LOOP_BEGIN); 3768 movl(tmp3, 961); 3769 imull(result, tmp3); 3770 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3771 movl(tmp3, tmp2); 3772 shll(tmp3, 5); 3773 subl(tmp3, tmp2); 3774 addl(result, tmp3); 3775 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3776 addl(result, tmp3); 3777 addl(index, 2); 3778 cmpl(index, cnt1); 3779 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3780 3781 // } 3782 // if (i >= cnt1) { 3783 bind(SHORT_UNROLLED_LOOP_EXIT); 3784 jccb(Assembler::greater, END); 3785 movl(tmp2, result); 3786 shll(result, 5); 3787 subl(result, tmp2); 3788 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3789 addl(result, tmp3); 3790 // } 3791 bind(END); 3792 3793 BLOCK_COMMENT("} // arrays_hashcode"); 3794 3795 } // arrays_hashcode 3796 3797 // helper function for string_compare 3798 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3799 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3800 Address::ScaleFactor scale2, Register index, int ae) { 3801 if (ae == StrIntrinsicNode::LL) { 3802 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3803 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3804 } else if (ae == StrIntrinsicNode::UU) { 3805 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3806 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3807 } else { 3808 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3809 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3810 } 3811 } 3812 3813 // Compare strings, used for char[] and byte[]. 3814 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3815 Register cnt1, Register cnt2, Register result, 3816 XMMRegister vec1, int ae, KRegister mask) { 3817 ShortBranchVerifier sbv(this); 3818 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3819 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3820 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3821 int stride2x2 = 0x40; 3822 Address::ScaleFactor scale = Address::no_scale; 3823 Address::ScaleFactor scale1 = Address::no_scale; 3824 Address::ScaleFactor scale2 = Address::no_scale; 3825 3826 if (ae != StrIntrinsicNode::LL) { 3827 stride2x2 = 0x20; 3828 } 3829 3830 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3831 shrl(cnt2, 1); 3832 } 3833 // Compute the minimum of the string lengths and the 3834 // difference of the string lengths (stack). 3835 // Do the conditional move stuff 3836 movl(result, cnt1); 3837 subl(cnt1, cnt2); 3838 push(cnt1); 3839 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3840 3841 // Is the minimum length zero? 
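  // (If it is, the result is simply the length difference pushed above.)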
3842 testl(cnt2, cnt2); 3843 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3844 if (ae == StrIntrinsicNode::LL) { 3845 // Load first bytes 3846 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3847 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3848 } else if (ae == StrIntrinsicNode::UU) { 3849 // Load first characters 3850 load_unsigned_short(result, Address(str1, 0)); 3851 load_unsigned_short(cnt1, Address(str2, 0)); 3852 } else { 3853 load_unsigned_byte(result, Address(str1, 0)); 3854 load_unsigned_short(cnt1, Address(str2, 0)); 3855 } 3856 subl(result, cnt1); 3857 jcc(Assembler::notZero, POP_LABEL); 3858 3859 if (ae == StrIntrinsicNode::UU) { 3860 // Divide length by 2 to get number of chars 3861 shrl(cnt2, 1); 3862 } 3863 cmpl(cnt2, 1); 3864 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3865 3866 // Check if the strings start at the same location and setup scale and stride 3867 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3868 cmpptr(str1, str2); 3869 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3870 if (ae == StrIntrinsicNode::LL) { 3871 scale = Address::times_1; 3872 stride = 16; 3873 } else { 3874 scale = Address::times_2; 3875 stride = 8; 3876 } 3877 } else { 3878 scale1 = Address::times_1; 3879 scale2 = Address::times_2; 3880 // scale not used 3881 stride = 8; 3882 } 3883 3884 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3885 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3886 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3887 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3888 Label COMPARE_TAIL_LONG; 3889 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3890 3891 int pcmpmask = 0x19; 3892 if (ae == StrIntrinsicNode::LL) { 3893 pcmpmask &= ~0x01; 3894 } 3895 3896 // Setup to compare 16-chars (32-bytes) vectors, 3897 // start from first character again because it has aligned address. 3898 if (ae == StrIntrinsicNode::LL) { 3899 stride2 = 32; 3900 } else { 3901 stride2 = 16; 3902 } 3903 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3904 adr_stride = stride << scale; 3905 } else { 3906 adr_stride1 = 8; //stride << scale1; 3907 adr_stride2 = 16; //stride << scale2; 3908 } 3909 3910 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3911 // rax and rdx are used by pcmpestri as elements counters 3912 movl(result, cnt2); 3913 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3914 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3915 3916 // fast path : compare first 2 8-char vectors. 
3917 bind(COMPARE_16_CHARS); 3918 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3919 movdqu(vec1, Address(str1, 0)); 3920 } else { 3921 pmovzxbw(vec1, Address(str1, 0)); 3922 } 3923 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3924 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3925 3926 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3927 movdqu(vec1, Address(str1, adr_stride)); 3928 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3929 } else { 3930 pmovzxbw(vec1, Address(str1, adr_stride1)); 3931 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3932 } 3933 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3934 addl(cnt1, stride); 3935 3936 // Compare the characters at index in cnt1 3937 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3938 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3939 subl(result, cnt2); 3940 jmp(POP_LABEL); 3941 3942 // Setup the registers to start vector comparison loop 3943 bind(COMPARE_WIDE_VECTORS); 3944 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3945 lea(str1, Address(str1, result, scale)); 3946 lea(str2, Address(str2, result, scale)); 3947 } else { 3948 lea(str1, Address(str1, result, scale1)); 3949 lea(str2, Address(str2, result, scale2)); 3950 } 3951 subl(result, stride2); 3952 subl(cnt2, stride2); 3953 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3954 negptr(result); 3955 3956 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3957 bind(COMPARE_WIDE_VECTORS_LOOP); 3958 3959 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3960 cmpl(cnt2, stride2x2); 3961 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3962 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3963 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3964 3965 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3966 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3967 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3968 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3969 } else { 3970 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3971 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3972 } 3973 kortestql(mask, mask); 3974 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3975 addptr(result, stride2x2); // update since we already compared at this addr 3976 subl(cnt2, stride2x2); // and sub the size too 3977 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3978 3979 vpxor(vec1, vec1); 3980 jmpb(COMPARE_WIDE_TAIL); 3981 }//if (VM_Version::supports_avx512vlbw()) 3982 3983 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3984 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3985 vmovdqu(vec1, Address(str1, result, scale)); 3986 vpxor(vec1, Address(str2, result, scale)); 3987 } else { 3988 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3989 vpxor(vec1, Address(str2, result, scale2)); 3990 } 3991 vptest(vec1, vec1); 3992 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3993 addptr(result, stride2); 3994 subl(cnt2, stride2); 3995 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3996 // clean upper bits of YMM registers 3997 vpxor(vec1, vec1); 3998 3999 // compare 
wide vectors tail 4000 bind(COMPARE_WIDE_TAIL); 4001 testptr(result, result); 4002 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4003 4004 movl(result, stride2); 4005 movl(cnt2, result); 4006 negptr(result); 4007 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4008 4009 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 4010 bind(VECTOR_NOT_EQUAL); 4011 // clean upper bits of YMM registers 4012 vpxor(vec1, vec1); 4013 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4014 lea(str1, Address(str1, result, scale)); 4015 lea(str2, Address(str2, result, scale)); 4016 } else { 4017 lea(str1, Address(str1, result, scale1)); 4018 lea(str2, Address(str2, result, scale2)); 4019 } 4020 jmp(COMPARE_16_CHARS); 4021 4022 // Compare tail chars, length between 1 to 15 chars 4023 bind(COMPARE_TAIL_LONG); 4024 movl(cnt2, result); 4025 cmpl(cnt2, stride); 4026 jcc(Assembler::less, COMPARE_SMALL_STR); 4027 4028 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4029 movdqu(vec1, Address(str1, 0)); 4030 } else { 4031 pmovzxbw(vec1, Address(str1, 0)); 4032 } 4033 pcmpestri(vec1, Address(str2, 0), pcmpmask); 4034 jcc(Assembler::below, COMPARE_INDEX_CHAR); 4035 subptr(cnt2, stride); 4036 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4037 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4038 lea(str1, Address(str1, result, scale)); 4039 lea(str2, Address(str2, result, scale)); 4040 } else { 4041 lea(str1, Address(str1, result, scale1)); 4042 lea(str2, Address(str2, result, scale2)); 4043 } 4044 negptr(cnt2); 4045 jmpb(WHILE_HEAD_LABEL); 4046 4047 bind(COMPARE_SMALL_STR); 4048 } else if (UseSSE42Intrinsics) { 4049 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 4050 int pcmpmask = 0x19; 4051 // Setup to compare 8-char (16-byte) vectors, 4052 // start from first character again because it has aligned address. 
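    // Below, 'result' keeps the original minimum length while cnt2 is rounded
    // down to a multiple of 'stride' to drive the vector loop; the remainder is
    // handled at COMPARE_TAIL.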
4053 movl(result, cnt2); 4054 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 4055 if (ae == StrIntrinsicNode::LL) { 4056 pcmpmask &= ~0x01; 4057 } 4058 jcc(Assembler::zero, COMPARE_TAIL); 4059 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4060 lea(str1, Address(str1, result, scale)); 4061 lea(str2, Address(str2, result, scale)); 4062 } else { 4063 lea(str1, Address(str1, result, scale1)); 4064 lea(str2, Address(str2, result, scale2)); 4065 } 4066 negptr(result); 4067 4068 // pcmpestri 4069 // inputs: 4070 // vec1- substring 4071 // rax - negative string length (elements count) 4072 // mem - scanned string 4073 // rdx - string length (elements count) 4074 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4075 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4076 // outputs: 4077 // rcx - first mismatched element index 4078 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4079 4080 bind(COMPARE_WIDE_VECTORS); 4081 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4082 movdqu(vec1, Address(str1, result, scale)); 4083 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4084 } else { 4085 pmovzxbw(vec1, Address(str1, result, scale1)); 4086 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4087 } 4088 // After pcmpestri cnt1(rcx) contains mismatched element index 4089 4090 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4091 addptr(result, stride); 4092 subptr(cnt2, stride); 4093 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4094 4095 // compare wide vectors tail 4096 testptr(result, result); 4097 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4098 4099 movl(cnt2, stride); 4100 movl(result, stride); 4101 negptr(result); 4102 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4103 movdqu(vec1, Address(str1, result, scale)); 4104 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4105 } else { 4106 pmovzxbw(vec1, Address(str1, result, scale1)); 4107 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4108 } 4109 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4110 4111 // Mismatched characters in the vectors 4112 bind(VECTOR_NOT_EQUAL); 4113 addptr(cnt1, result); 4114 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4115 subl(result, cnt2); 4116 jmpb(POP_LABEL); 4117 4118 bind(COMPARE_TAIL); // limit is zero 4119 movl(cnt2, result); 4120 // Fallthru to tail compare 4121 } 4122 // Shift str2 and str1 to the end of the arrays, negate min 4123 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4124 lea(str1, Address(str1, cnt2, scale)); 4125 lea(str2, Address(str2, cnt2, scale)); 4126 } else { 4127 lea(str1, Address(str1, cnt2, scale1)); 4128 lea(str2, Address(str2, cnt2, scale2)); 4129 } 4130 decrementl(cnt2); // first character was compared already 4131 negptr(cnt2); 4132 4133 // Compare the rest of the elements 4134 bind(WHILE_HEAD_LABEL); 4135 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4136 subl(result, cnt1); 4137 jccb(Assembler::notZero, POP_LABEL); 4138 increment(cnt2); 4139 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4140 4141 // Strings are equal up to min length. Return the length difference. 
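  // (If one string is a prefix of the other, the result is the difference of
  //  the char counts, e.g. "abc" vs "abcd" -> -1, matching String.compareTo.)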
4142 bind(LENGTH_DIFF_LABEL); 4143 pop(result); 4144 if (ae == StrIntrinsicNode::UU) { 4145 // Divide diff by 2 to get number of chars 4146 sarl(result, 1); 4147 } 4148 jmpb(DONE_LABEL); 4149 4150 if (VM_Version::supports_avx512vlbw()) { 4151 4152 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4153 4154 kmovql(cnt1, mask); 4155 notq(cnt1); 4156 bsfq(cnt2, cnt1); 4157 if (ae != StrIntrinsicNode::LL) { 4158 // Divide diff by 2 to get number of chars 4159 sarl(cnt2, 1); 4160 } 4161 addq(result, cnt2); 4162 if (ae == StrIntrinsicNode::LL) { 4163 load_unsigned_byte(cnt1, Address(str2, result)); 4164 load_unsigned_byte(result, Address(str1, result)); 4165 } else if (ae == StrIntrinsicNode::UU) { 4166 load_unsigned_short(cnt1, Address(str2, result, scale)); 4167 load_unsigned_short(result, Address(str1, result, scale)); 4168 } else { 4169 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4170 load_unsigned_byte(result, Address(str1, result, scale1)); 4171 } 4172 subl(result, cnt1); 4173 jmpb(POP_LABEL); 4174 }//if (VM_Version::supports_avx512vlbw()) 4175 4176 // Discard the stored length difference 4177 bind(POP_LABEL); 4178 pop(cnt1); 4179 4180 // That's it 4181 bind(DONE_LABEL); 4182 if(ae == StrIntrinsicNode::UL) { 4183 negl(result); 4184 } 4185 4186 } 4187 4188 // Search for Non-ASCII character (Negative byte value) in a byte array, 4189 // return the index of the first such character, otherwise the length 4190 // of the array segment searched. 4191 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4192 // @IntrinsicCandidate 4193 // public static int countPositives(byte[] ba, int off, int len) { 4194 // for (int i = off; i < off + len; i++) { 4195 // if (ba[i] < 0) { 4196 // return i - off; 4197 // } 4198 // } 4199 // return len; 4200 // } 4201 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4202 Register result, Register tmp1, 4203 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4204 // rsi: byte array 4205 // rcx: len 4206 // rax: result 4207 ShortBranchVerifier sbv(this); 4208 assert_different_registers(ary1, len, result, tmp1); 4209 assert_different_registers(vec1, vec2); 4210 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4211 4212 movl(result, len); // copy 4213 // len == 0 4214 testl(len, len); 4215 jcc(Assembler::zero, DONE); 4216 4217 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4218 VM_Version::supports_avx512vlbw() && 4219 VM_Version::supports_bmi2()) { 4220 4221 Label test_64_loop, test_tail, BREAK_LOOP; 4222 movl(tmp1, len); 4223 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4224 4225 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4226 andl(len, 0xffffffc0); // vector count (in chars) 4227 jccb(Assembler::zero, test_tail); 4228 4229 lea(ary1, Address(ary1, len, Address::times_1)); 4230 negptr(len); 4231 4232 bind(test_64_loop); 4233 // Check whether our 64 elements of size byte contain negatives 4234 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4235 kortestql(mask1, mask1); 4236 jcc(Assembler::notZero, BREAK_LOOP); 4237 4238 addptr(len, 64); 4239 jccb(Assembler::notZero, test_64_loop); 4240 4241 bind(test_tail); 4242 // bail out when there is nothing to be done 4243 testl(tmp1, -1); 4244 jcc(Assembler::zero, DONE); 4245 4246 4247 // check the tail for absense of negatives 4248 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4249 { 4250 Register tmp3_aliased = len; 4251 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4252 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4253 notq(tmp3_aliased); 4254 kmovql(mask2, tmp3_aliased); 4255 } 4256 4257 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4258 ktestq(mask1, mask2); 4259 jcc(Assembler::zero, DONE); 4260 4261 // do a full check for negative registers in the tail 4262 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4263 // ary1 already pointing to the right place 4264 jmpb(TAIL_START); 4265 4266 bind(BREAK_LOOP); 4267 // At least one byte in the last 64 byte block was negative. 4268 // Set up to look at the last 64 bytes as if they were a tail 4269 lea(ary1, Address(ary1, len, Address::times_1)); 4270 addptr(result, len); 4271 // Ignore the very last byte: if all others are positive, 4272 // it must be negative, so we can skip right to the 2+1 byte 4273 // end comparison at this point 4274 orl(result, 63); 4275 movl(len, 63); 4276 // Fallthru to tail compare 4277 } else { 4278 4279 if (UseAVX >= 2) { 4280 // With AVX2, use 32-byte vector compare 4281 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4282 4283 // Compare 32-byte vectors 4284 testl(len, 0xffffffe0); // vector count (in bytes) 4285 jccb(Assembler::zero, TAIL_START); 4286 4287 andl(len, 0xffffffe0); 4288 lea(ary1, Address(ary1, len, Address::times_1)); 4289 negptr(len); 4290 4291 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4292 movdl(vec2, tmp1); 4293 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4294 4295 bind(COMPARE_WIDE_VECTORS); 4296 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4297 vptest(vec1, vec2); 4298 jccb(Assembler::notZero, BREAK_LOOP); 4299 addptr(len, 32); 4300 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4301 4302 testl(result, 0x0000001f); // any bytes remaining? 4303 jcc(Assembler::zero, DONE); 4304 4305 // Quick test using the already prepared vector mask 4306 movl(len, result); 4307 andl(len, 0x0000001f); 4308 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4309 vptest(vec1, vec2); 4310 jcc(Assembler::zero, DONE); 4311 // There are zeros, jump to the tail to determine exactly where 4312 jmpb(TAIL_START); 4313 4314 bind(BREAK_LOOP); 4315 // At least one byte in the last 32-byte vector is negative. 4316 // Set up to look at the last 32 bytes as if they were a tail 4317 lea(ary1, Address(ary1, len, Address::times_1)); 4318 addptr(result, len); 4319 // Ignore the very last byte: if all others are positive, 4320 // it must be negative, so we can skip right to the 2+1 byte 4321 // end comparison at this point 4322 orl(result, 31); 4323 movl(len, 31); 4324 // Fallthru to tail compare 4325 } else if (UseSSE42Intrinsics) { 4326 // With SSE4.2, use double quad vector compare 4327 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4328 4329 // Compare 16-byte vectors 4330 testl(len, 0xfffffff0); // vector count (in bytes) 4331 jcc(Assembler::zero, TAIL_START); 4332 4333 andl(len, 0xfffffff0); 4334 lea(ary1, Address(ary1, len, Address::times_1)); 4335 negptr(len); 4336 4337 movl(tmp1, 0x80808080); 4338 movdl(vec2, tmp1); 4339 pshufd(vec2, vec2, 0); 4340 4341 bind(COMPARE_WIDE_VECTORS); 4342 movdqu(vec1, Address(ary1, len, Address::times_1)); 4343 ptest(vec1, vec2); 4344 jccb(Assembler::notZero, BREAK_LOOP); 4345 addptr(len, 16); 4346 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4347 4348 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4349 jcc(Assembler::zero, DONE); 4350 4351 // Quick test using the already prepared vector mask 4352 movl(len, result); 4353 andl(len, 0x0000000f); // tail count (in bytes) 4354 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4355 ptest(vec1, vec2); 4356 jcc(Assembler::zero, DONE); 4357 jmpb(TAIL_START); 4358 4359 bind(BREAK_LOOP); 4360 // At least one byte in the last 16-byte vector is negative. 4361 // Set up and look at the last 16 bytes as if they were a tail 4362 lea(ary1, Address(ary1, len, Address::times_1)); 4363 addptr(result, len); 4364 // Ignore the very last byte: if all others are positive, 4365 // it must be negative, so we can skip right to the 2+1 byte 4366 // end comparison at this point 4367 orl(result, 15); 4368 movl(len, 15); 4369 // Fallthru to tail compare 4370 } 4371 } 4372 4373 bind(TAIL_START); 4374 // Compare 4-byte vectors 4375 andl(len, 0xfffffffc); // vector count (in bytes) 4376 jccb(Assembler::zero, COMPARE_CHAR); 4377 4378 lea(ary1, Address(ary1, len, Address::times_1)); 4379 negptr(len); 4380 4381 bind(COMPARE_VECTORS); 4382 movl(tmp1, Address(ary1, len, Address::times_1)); 4383 andl(tmp1, 0x80808080); 4384 jccb(Assembler::notZero, TAIL_ADJUST); 4385 addptr(len, 4); 4386 jccb(Assembler::notZero, COMPARE_VECTORS); 4387 4388 // Compare trailing char (final 2-3 bytes), if any 4389 bind(COMPARE_CHAR); 4390 4391 testl(result, 0x2); // tail char 4392 jccb(Assembler::zero, COMPARE_BYTE); 4393 load_unsigned_short(tmp1, Address(ary1, 0)); 4394 andl(tmp1, 0x00008080); 4395 jccb(Assembler::notZero, CHAR_ADJUST); 4396 lea(ary1, Address(ary1, 2)); 4397 4398 bind(COMPARE_BYTE); 4399 testl(result, 0x1); // tail byte 4400 jccb(Assembler::zero, DONE); 4401 load_unsigned_byte(tmp1, Address(ary1, 0)); 4402 testl(tmp1, 0x00000080); 4403 jccb(Assembler::zero, DONE); 4404 subptr(result, 1); 4405 jmpb(DONE); 4406 4407 bind(TAIL_ADJUST); 4408 // there are negative bits in the last 4 byte block. 4409 // Adjust result and check the next three bytes 4410 addptr(result, len); 4411 orl(result, 3); 4412 lea(ary1, Address(ary1, len, Address::times_1)); 4413 jmpb(COMPARE_CHAR); 4414 4415 bind(CHAR_ADJUST); 4416 // We are looking at a char + optional byte tail, and found that one 4417 // of the bytes in the char is negative. Adjust the result, check the 4418 // first byte and readjust if needed. 4419 andl(result, 0xfffffffc); 4420 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4421 jccb(Assembler::notZero, DONE); 4422 addptr(result, 1); 4423 4424 // That's it 4425 bind(DONE); 4426 if (UseAVX >= 2) { 4427 // clean upper bits of YMM registers 4428 vpxor(vec1, vec1); 4429 vpxor(vec2, vec2); 4430 } 4431 } 4432 4433 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4434 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4435 Register limit, Register result, Register chr, 4436 XMMRegister vec1, XMMRegister vec2, bool is_char, 4437 KRegister mask, bool expand_ary2) { 4438 // for expand_ary2, limit is the (smaller) size of the second array. 4439 ShortBranchVerifier sbv(this); 4440 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4441 4442 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4443 "Expansion only implemented for AVX2"); 4444 4445 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4446 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4447 4448 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4449 int scaleIncr = expand_ary2 ? 8 : 16; 4450 4451 if (is_array_equ) { 4452 // Check the input args 4453 cmpoop(ary1, ary2); 4454 jcc(Assembler::equal, TRUE_LABEL); 4455 4456 // Need additional checks for arrays_equals. 4457 testptr(ary1, ary1); 4458 jcc(Assembler::zero, FALSE_LABEL); 4459 testptr(ary2, ary2); 4460 jcc(Assembler::zero, FALSE_LABEL); 4461 4462 // Check the lengths 4463 movl(limit, Address(ary1, length_offset)); 4464 cmpl(limit, Address(ary2, length_offset)); 4465 jcc(Assembler::notEqual, FALSE_LABEL); 4466 } 4467 4468 // count == 0 4469 testl(limit, limit); 4470 jcc(Assembler::zero, TRUE_LABEL); 4471 4472 if (is_array_equ) { 4473 // Load array address 4474 lea(ary1, Address(ary1, base_offset)); 4475 lea(ary2, Address(ary2, base_offset)); 4476 } 4477 4478 if (is_array_equ && is_char) { 4479 // arrays_equals when used for char[]. 4480 shll(limit, 1); // byte count != 0 4481 } 4482 movl(result, limit); // copy 4483 4484 if (UseAVX >= 2) { 4485 // With AVX2, use 32-byte vector compare 4486 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4487 4488 // Compare 32-byte vectors 4489 if (expand_ary2) { 4490 andl(result, 0x0000000f); // tail count (in bytes) 4491 andl(limit, 0xfffffff0); // vector count (in bytes) 4492 jcc(Assembler::zero, COMPARE_TAIL); 4493 } else { 4494 andl(result, 0x0000001f); // tail count (in bytes) 4495 andl(limit, 0xffffffe0); // vector count (in bytes) 4496 jcc(Assembler::zero, COMPARE_TAIL_16); 4497 } 4498 4499 lea(ary1, Address(ary1, limit, scaleFactor)); 4500 lea(ary2, Address(ary2, limit, Address::times_1)); 4501 negptr(limit); 4502 4503 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4504 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4505 4506 cmpl(limit, -64); 4507 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4508 4509 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4510 4511 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4512 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4513 kortestql(mask, mask); 4514 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4515 addptr(limit, 64); // update since we already compared at this addr 4516 cmpl(limit, -64); 4517 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4518 4519 // At this point we may still need to compare -limit+result bytes. 4520 // We could execute the next two instruction and just continue via non-wide path: 4521 // cmpl(limit, 0); 4522 // jcc(Assembler::equal, COMPARE_TAIL); // true 4523 // But since we stopped at the points ary{1,2}+limit which are 4524 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4525 // (|limit| <= 32 and result < 32), 4526 // we may just compare the last 64 bytes. 
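    // (Bytes in the overlap are simply compared twice, which is harmless for an
    //  equality check.)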
4527 // 4528 addptr(result, -64); // it is safe, bc we just came from this area 4529 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4530 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4531 kortestql(mask, mask); 4532 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4533 4534 jmp(TRUE_LABEL); 4535 4536 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4537 4538 }//if (VM_Version::supports_avx512vlbw()) 4539 4540 bind(COMPARE_WIDE_VECTORS); 4541 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4542 if (expand_ary2) { 4543 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4544 } else { 4545 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4546 } 4547 vpxor(vec1, vec2); 4548 4549 vptest(vec1, vec1); 4550 jcc(Assembler::notZero, FALSE_LABEL); 4551 addptr(limit, scaleIncr * 2); 4552 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4553 4554 testl(result, result); 4555 jcc(Assembler::zero, TRUE_LABEL); 4556 4557 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4558 if (expand_ary2) { 4559 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4560 } else { 4561 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4562 } 4563 vpxor(vec1, vec2); 4564 4565 vptest(vec1, vec1); 4566 jcc(Assembler::notZero, FALSE_LABEL); 4567 jmp(TRUE_LABEL); 4568 4569 bind(COMPARE_TAIL_16); // limit is zero 4570 movl(limit, result); 4571 4572 // Compare 16-byte chunks 4573 andl(result, 0x0000000f); // tail count (in bytes) 4574 andl(limit, 0xfffffff0); // vector count (in bytes) 4575 jcc(Assembler::zero, COMPARE_TAIL); 4576 4577 lea(ary1, Address(ary1, limit, scaleFactor)); 4578 lea(ary2, Address(ary2, limit, Address::times_1)); 4579 negptr(limit); 4580 4581 bind(COMPARE_WIDE_VECTORS_16); 4582 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4583 if (expand_ary2) { 4584 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4585 } else { 4586 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4587 } 4588 pxor(vec1, vec2); 4589 4590 ptest(vec1, vec1); 4591 jcc(Assembler::notZero, FALSE_LABEL); 4592 addptr(limit, scaleIncr); 4593 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4594 4595 bind(COMPARE_TAIL); // limit is zero 4596 movl(limit, result); 4597 // Fallthru to tail compare 4598 } else if (UseSSE42Intrinsics) { 4599 // With SSE4.2, use double quad vector compare 4600 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4601 4602 // Compare 16-byte vectors 4603 andl(result, 0x0000000f); // tail count (in bytes) 4604 andl(limit, 0xfffffff0); // vector count (in bytes) 4605 jcc(Assembler::zero, COMPARE_TAIL); 4606 4607 lea(ary1, Address(ary1, limit, Address::times_1)); 4608 lea(ary2, Address(ary2, limit, Address::times_1)); 4609 negptr(limit); 4610 4611 bind(COMPARE_WIDE_VECTORS); 4612 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4613 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4614 pxor(vec1, vec2); 4615 4616 ptest(vec1, vec1); 4617 jcc(Assembler::notZero, FALSE_LABEL); 4618 addptr(limit, 16); 4619 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4620 4621 testl(result, result); 4622 jcc(Assembler::zero, TRUE_LABEL); 4623 4624 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4625 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4626 pxor(vec1, vec2); 4627 4628 ptest(vec1, vec1); 4629 jccb(Assembler::notZero, FALSE_LABEL); 4630 jmpb(TRUE_LABEL); 4631 4632 bind(COMPARE_TAIL); // limit is zero 4633 movl(limit, 
result); 4634 // Fallthru to tail compare 4635 } 4636 4637 // Compare 4-byte vectors 4638 if (expand_ary2) { 4639 testl(result, result); 4640 jccb(Assembler::zero, TRUE_LABEL); 4641 } else { 4642 andl(limit, 0xfffffffc); // vector count (in bytes) 4643 jccb(Assembler::zero, COMPARE_CHAR); 4644 } 4645 4646 lea(ary1, Address(ary1, limit, scaleFactor)); 4647 lea(ary2, Address(ary2, limit, Address::times_1)); 4648 negptr(limit); 4649 4650 bind(COMPARE_VECTORS); 4651 if (expand_ary2) { 4652 // There are no "vector" operations for bytes to shorts 4653 movzbl(chr, Address(ary2, limit, Address::times_1)); 4654 cmpw(Address(ary1, limit, Address::times_2), chr); 4655 jccb(Assembler::notEqual, FALSE_LABEL); 4656 addptr(limit, 1); 4657 jcc(Assembler::notZero, COMPARE_VECTORS); 4658 jmp(TRUE_LABEL); 4659 } else { 4660 movl(chr, Address(ary1, limit, Address::times_1)); 4661 cmpl(chr, Address(ary2, limit, Address::times_1)); 4662 jccb(Assembler::notEqual, FALSE_LABEL); 4663 addptr(limit, 4); 4664 jcc(Assembler::notZero, COMPARE_VECTORS); 4665 } 4666 4667 // Compare trailing char (final 2 bytes), if any 4668 bind(COMPARE_CHAR); 4669 testl(result, 0x2); // tail char 4670 jccb(Assembler::zero, COMPARE_BYTE); 4671 load_unsigned_short(chr, Address(ary1, 0)); 4672 load_unsigned_short(limit, Address(ary2, 0)); 4673 cmpl(chr, limit); 4674 jccb(Assembler::notEqual, FALSE_LABEL); 4675 4676 if (is_array_equ && is_char) { 4677 bind(COMPARE_BYTE); 4678 } else { 4679 lea(ary1, Address(ary1, 2)); 4680 lea(ary2, Address(ary2, 2)); 4681 4682 bind(COMPARE_BYTE); 4683 testl(result, 0x1); // tail byte 4684 jccb(Assembler::zero, TRUE_LABEL); 4685 load_unsigned_byte(chr, Address(ary1, 0)); 4686 load_unsigned_byte(limit, Address(ary2, 0)); 4687 cmpl(chr, limit); 4688 jccb(Assembler::notEqual, FALSE_LABEL); 4689 } 4690 bind(TRUE_LABEL); 4691 movl(result, 1); // return true 4692 jmpb(DONE); 4693 4694 bind(FALSE_LABEL); 4695 xorl(result, result); // return false 4696 4697 // That's it 4698 bind(DONE); 4699 if (UseAVX >= 2) { 4700 // clean upper bits of YMM registers 4701 vpxor(vec1, vec1); 4702 vpxor(vec2, vec2); 4703 } 4704 } 4705 4706 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4707 #define __ masm. 4708 Register dst = stub.data<0>(); 4709 XMMRegister src = stub.data<1>(); 4710 address target = stub.data<2>(); 4711 __ bind(stub.entry()); 4712 __ subptr(rsp, 8); 4713 __ movdbl(Address(rsp), src); 4714 __ call(RuntimeAddress(target)); 4715 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte. 
4716 __ pop(dst); 4717 __ jmp(stub.continuation()); 4718 #undef __ 4719 } 4720 4721 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4722 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4723 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4724 4725 address slowpath_target; 4726 if (dst_bt == T_INT) { 4727 if (src_bt == T_FLOAT) { 4728 cvttss2sil(dst, src); 4729 cmpl(dst, 0x80000000); 4730 slowpath_target = StubRoutines::x86::f2i_fixup(); 4731 } else { 4732 cvttsd2sil(dst, src); 4733 cmpl(dst, 0x80000000); 4734 slowpath_target = StubRoutines::x86::d2i_fixup(); 4735 } 4736 } else { 4737 if (src_bt == T_FLOAT) { 4738 cvttss2siq(dst, src); 4739 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4740 slowpath_target = StubRoutines::x86::f2l_fixup(); 4741 } else { 4742 cvttsd2siq(dst, src); 4743 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4744 slowpath_target = StubRoutines::x86::d2l_fixup(); 4745 } 4746 } 4747 4748 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte. 4749 int max_size = 23 + (UseAPX ? 1 : 0); 4750 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4751 jcc(Assembler::equal, stub->entry()); 4752 bind(stub->continuation()); 4753 } 4754 4755 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4756 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4757 switch(ideal_opc) { 4758 case Op_LShiftVS: 4759 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4760 case Op_LShiftVI: 4761 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4762 case Op_LShiftVL: 4763 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4764 case Op_RShiftVS: 4765 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4766 case Op_RShiftVI: 4767 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4768 case Op_RShiftVL: 4769 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4770 case Op_URShiftVS: 4771 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4772 case Op_URShiftVI: 4773 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4774 case Op_URShiftVL: 4775 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4776 case Op_RotateRightV: 4777 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4778 case Op_RotateLeftV: 4779 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4780 default: 4781 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4782 break; 4783 } 4784 } 4785 4786 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4787 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4788 if (is_unsigned) { 4789 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4790 } else { 4791 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4792 } 4793 } 4794 4795 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4796 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4797 switch (elem_bt) { 4798 case T_BYTE: 4799 if (ideal_opc == Op_SaturatingAddV) { 4800 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4801 } else { 4802 
assert(ideal_opc == Op_SaturatingSubV, ""); 4803 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4804 } 4805 break; 4806 case T_SHORT: 4807 if (ideal_opc == Op_SaturatingAddV) { 4808 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4809 } else { 4810 assert(ideal_opc == Op_SaturatingSubV, ""); 4811 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4812 } 4813 break; 4814 default: 4815 fatal("Unsupported type %s", type2name(elem_bt)); 4816 break; 4817 } 4818 } 4819 4820 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4821 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4822 switch (elem_bt) { 4823 case T_BYTE: 4824 if (ideal_opc == Op_SaturatingAddV) { 4825 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4826 } else { 4827 assert(ideal_opc == Op_SaturatingSubV, ""); 4828 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4829 } 4830 break; 4831 case T_SHORT: 4832 if (ideal_opc == Op_SaturatingAddV) { 4833 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4834 } else { 4835 assert(ideal_opc == Op_SaturatingSubV, ""); 4836 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4837 } 4838 break; 4839 default: 4840 fatal("Unsupported type %s", type2name(elem_bt)); 4841 break; 4842 } 4843 } 4844 4845 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4846 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4847 if (is_unsigned) { 4848 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4849 } else { 4850 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4851 } 4852 } 4853 4854 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4855 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4856 switch (elem_bt) { 4857 case T_BYTE: 4858 if (ideal_opc == Op_SaturatingAddV) { 4859 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4860 } else { 4861 assert(ideal_opc == Op_SaturatingSubV, ""); 4862 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4863 } 4864 break; 4865 case T_SHORT: 4866 if (ideal_opc == Op_SaturatingAddV) { 4867 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4868 } else { 4869 assert(ideal_opc == Op_SaturatingSubV, ""); 4870 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4871 } 4872 break; 4873 default: 4874 fatal("Unsupported type %s", type2name(elem_bt)); 4875 break; 4876 } 4877 } 4878 4879 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4880 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4881 switch (elem_bt) { 4882 case T_BYTE: 4883 if (ideal_opc == Op_SaturatingAddV) { 4884 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4885 } else { 4886 assert(ideal_opc == Op_SaturatingSubV, ""); 4887 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4888 } 4889 break; 4890 case T_SHORT: 4891 if (ideal_opc == Op_SaturatingAddV) { 4892 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4893 } else { 4894 assert(ideal_opc == Op_SaturatingSubV, ""); 4895 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4896 } 4897 break; 4898 default: 4899 fatal("Unsupported type %s", type2name(elem_bt)); 4900 break; 4901 } 4902 } 4903 4904 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4905 XMMRegister src1, XMMRegister src2, 
bool merge, int vlen_enc, 4906 bool is_varshift) { 4907 switch (ideal_opc) { 4908 case Op_AddVB: 4909 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4910 case Op_AddVS: 4911 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_AddVI: 4913 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_AddVL: 4915 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4916 case Op_AddVF: 4917 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4918 case Op_AddVD: 4919 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4920 case Op_SubVB: 4921 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4922 case Op_SubVS: 4923 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4924 case Op_SubVI: 4925 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4926 case Op_SubVL: 4927 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4928 case Op_SubVF: 4929 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4930 case Op_SubVD: 4931 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4932 case Op_MulVS: 4933 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4934 case Op_MulVI: 4935 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4936 case Op_MulVL: 4937 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4938 case Op_MulVF: 4939 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4940 case Op_MulVD: 4941 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4942 case Op_DivVF: 4943 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4944 case Op_DivVD: 4945 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4946 case Op_SqrtVF: 4947 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4948 case Op_SqrtVD: 4949 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4950 case Op_AbsVB: 4951 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4952 case Op_AbsVS: 4953 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4954 case Op_AbsVI: 4955 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4956 case Op_AbsVL: 4957 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4958 case Op_FmaVF: 4959 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4960 case Op_FmaVD: 4961 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4962 case Op_VectorRearrange: 4963 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4964 case Op_LShiftVS: 4965 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4966 case Op_LShiftVI: 4967 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4968 case Op_LShiftVL: 4969 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4970 case Op_RShiftVS: 4971 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4972 case Op_RShiftVI: 4973 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4974 case Op_RShiftVL: 4975 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4976 case Op_URShiftVS: 4977 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4978 case Op_URShiftVI: 4979 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4980 case Op_URShiftVL: 4981 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4982 case Op_RotateLeftV: 4983 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4984 case Op_RotateRightV: 4985 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4986 case Op_MaxV: 4987 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4988 case Op_MinV: 4989 evpmins(eType, dst, mask, 
src1, src2, merge, vlen_enc); break; 4990 case Op_UMinV: 4991 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4992 case Op_UMaxV: 4993 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4994 case Op_XorV: 4995 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4996 case Op_OrV: 4997 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4998 case Op_AndV: 4999 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5000 default: 5001 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5002 break; 5003 } 5004 } 5005 5006 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 5007 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 5008 switch (ideal_opc) { 5009 case Op_AddVB: 5010 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 5011 case Op_AddVS: 5012 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 5013 case Op_AddVI: 5014 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 5015 case Op_AddVL: 5016 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 5017 case Op_AddVF: 5018 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 5019 case Op_AddVD: 5020 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 5021 case Op_SubVB: 5022 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 5023 case Op_SubVS: 5024 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 5025 case Op_SubVI: 5026 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 5027 case Op_SubVL: 5028 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 5029 case Op_SubVF: 5030 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 5031 case Op_SubVD: 5032 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 5033 case Op_MulVS: 5034 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 5035 case Op_MulVI: 5036 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 5037 case Op_MulVL: 5038 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 5039 case Op_MulVF: 5040 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 5041 case Op_MulVD: 5042 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 5043 case Op_DivVF: 5044 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 5045 case Op_DivVD: 5046 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 5047 case Op_FmaVF: 5048 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 5049 case Op_FmaVD: 5050 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 5051 case Op_MaxV: 5052 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5053 case Op_MinV: 5054 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5055 case Op_UMaxV: 5056 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5057 case Op_UMinV: 5058 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5059 case Op_XorV: 5060 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5061 case Op_OrV: 5062 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5063 case Op_AndV: 5064 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 5065 default: 5066 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 5067 break; 5068 } 5069 } 5070 5071 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 5072 KRegister src1, KRegister src2) { 5073 BasicType etype = T_ILLEGAL; 5074 switch(mask_len) { 5075 case 2: 5076 case 4: 5077 case 8: etype = T_BYTE; break; 5078 case 16: etype = T_SHORT; break; 5079 case 32: etype = T_INT; break; 5080 case 64: etype = T_LONG; break; 
5081 default: fatal("Unsupported type"); break; 5082 } 5083 assert(etype != T_ILLEGAL, ""); 5084 switch(ideal_opc) { 5085 case Op_AndVMask: 5086 kand(etype, dst, src1, src2); break; 5087 case Op_OrVMask: 5088 kor(etype, dst, src1, src2); break; 5089 case Op_XorVMask: 5090 kxor(etype, dst, src1, src2); break; 5091 default: 5092 fatal("Unsupported masked operation"); break; 5093 } 5094 } 5095 5096 /* 5097 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5098 * If src is NaN, the result is 0. 5099 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 5100 * the result is equal to the value of Integer.MIN_VALUE. 5101 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 5102 * the result is equal to the value of Integer.MAX_VALUE. 5103 */ 5104 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5105 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5106 Register rscratch, AddressLiteral float_sign_flip, 5107 int vec_enc) { 5108 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5109 Label done; 5110 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 5111 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5112 vptest(xtmp2, xtmp2, vec_enc); 5113 jccb(Assembler::equal, done); 5114 5115 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 5116 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 5117 5118 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5119 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 5120 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5121 5122 // Recompute the mask for remaining special value. 5123 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5124 // Extract SRC values corresponding to TRUE mask lanes. 5125 vpand(xtmp4, xtmp2, src, vec_enc); 5126 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5127 // values are set. 
5128 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5129 5130 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5131 bind(done); 5132 } 5133 5134 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5135 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5136 Register rscratch, AddressLiteral float_sign_flip, 5137 int vec_enc) { 5138 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5139 Label done; 5140 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5141 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5142 kortestwl(ktmp1, ktmp1); 5143 jccb(Assembler::equal, done); 5144 5145 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5146 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5147 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5148 5149 kxorwl(ktmp1, ktmp1, ktmp2); 5150 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5151 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5152 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5153 bind(done); 5154 } 5155 5156 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5157 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5158 Register rscratch, AddressLiteral double_sign_flip, 5159 int vec_enc) { 5160 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5161 5162 Label done; 5163 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5164 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5165 kortestwl(ktmp1, ktmp1); 5166 jccb(Assembler::equal, done); 5167 5168 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5169 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5170 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5171 5172 kxorwl(ktmp1, ktmp1, ktmp2); 5173 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5174 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5175 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5176 bind(done); 5177 } 5178 5179 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5180 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5181 Register rscratch, AddressLiteral float_sign_flip, 5182 int vec_enc) { 5183 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5184 Label done; 5185 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5186 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5187 kortestwl(ktmp1, ktmp1); 5188 jccb(Assembler::equal, done); 5189 5190 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5191 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5192 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5193 5194 kxorwl(ktmp1, ktmp1, ktmp2); 5195 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5196 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5197 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5198 bind(done); 5199 } 5200 5201 /* 5202 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5203 * If src is NaN, the result is 0. 5204 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5205 * the result is equal to the value of Long.MIN_VALUE. 5206 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5207 * the result is equal to the value of Long.MAX_VALUE. 
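 *
 * For example (same semantics as a scalar Java d2l cast): a lane holding NaN converts to 0,
 * a lane holding 9.3e18 (which exceeds Long.MAX_VALUE, about 9.22e18) converts to
 * Long.MAX_VALUE, and a lane holding -1.0e19 converts to Long.MIN_VALUE. The routine below
 * relies on the truncating vector conversion performed before this fixup returning the
 * "integer indefinite" value 0x8000000000000000 for all of these special inputs, which is
 * why the result lanes are first compared against double_sign_flip.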
5208 */
5209 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5210 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5211 Register rscratch, AddressLiteral double_sign_flip,
5212 int vec_enc) {
5213 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5214
5215 Label done;
5216 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5217 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5218 kortestwl(ktmp1, ktmp1);
5219 jccb(Assembler::equal, done);
5220
5221 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5222 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5223 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5224
5225 kxorwl(ktmp1, ktmp1, ktmp2);
5226 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5227 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5228 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5229 bind(done);
5230 }
5231
5232 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5233 XMMRegister xtmp, int index, int vec_enc) {
5234 assert(vec_enc < Assembler::AVX_512bit, "");
5235 if (vec_enc == Assembler::AVX_256bit) {
5236 vextractf128_high(xtmp, src);
5237 vshufps(dst, src, xtmp, index, vec_enc);
5238 } else {
5239 vshufps(dst, src, zero, index, vec_enc);
5240 }
5241 }
5242
5243 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5244 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5245 AddressLiteral float_sign_flip, int src_vec_enc) {
5246 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5247
5248 Label done;
5249 // Compare the destination lanes with float_sign_flip
5250 // value to get mask for all special values.
5251 movdqu(xtmp1, float_sign_flip, rscratch);
5252 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5253 ptest(xtmp2, xtmp2);
5254 jccb(Assembler::equal, done);
5255
5256 // Flip float_sign_flip to get max integer value.
5257 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5258 pxor(xtmp1, xtmp4);
5259
5260 // Set destination lanes corresponding to unordered source lanes to zero.
5261 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5262 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5263
5264 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5265 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5266 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5267
5268 // Recompute the mask for the remaining special values.
5269 pxor(xtmp2, xtmp3);
5270 // Extract mask corresponding to non-negative source lanes.
5271 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5272
5273 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5274 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5275 pand(xtmp3, xtmp2);
5276
5277 // Replace destination lanes holding the special value (0x80000000) with max int
5278 // if the corresponding source lane holds a +ve value.
5279 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5280 bind(done); 5281 } 5282 5283 5284 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5285 XMMRegister xtmp, Register rscratch, int vec_enc) { 5286 switch(to_elem_bt) { 5287 case T_SHORT: 5288 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5289 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5290 vpackusdw(dst, dst, zero, vec_enc); 5291 if (vec_enc == Assembler::AVX_256bit) { 5292 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5293 } 5294 break; 5295 case T_BYTE: 5296 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5297 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5298 vpackusdw(dst, dst, zero, vec_enc); 5299 if (vec_enc == Assembler::AVX_256bit) { 5300 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5301 } 5302 vpackuswb(dst, dst, zero, vec_enc); 5303 break; 5304 default: assert(false, "%s", type2name(to_elem_bt)); 5305 } 5306 } 5307 5308 /* 5309 * Algorithm for vector D2L and F2I conversions:- 5310 * a) Perform vector D2L/F2I cast. 5311 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5312 * It signifies that source value could be any of the special floating point 5313 * values(NaN,-Inf,Inf,Max,-Min). 5314 * c) Set destination to zero if source is NaN value. 5315 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5316 */ 5317 5318 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5319 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5320 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5321 int to_elem_sz = type2aelembytes(to_elem_bt); 5322 assert(to_elem_sz <= 4, ""); 5323 vcvttps2dq(dst, src, vec_enc); 5324 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5325 if (to_elem_sz < 4) { 5326 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5327 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5328 } 5329 } 5330 5331 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5332 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5333 Register rscratch, int vec_enc) { 5334 int to_elem_sz = type2aelembytes(to_elem_bt); 5335 assert(to_elem_sz <= 4, ""); 5336 vcvttps2dq(dst, src, vec_enc); 5337 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5338 switch(to_elem_bt) { 5339 case T_INT: 5340 break; 5341 case T_SHORT: 5342 evpmovdw(dst, dst, vec_enc); 5343 break; 5344 case T_BYTE: 5345 evpmovdb(dst, dst, vec_enc); 5346 break; 5347 default: assert(false, "%s", type2name(to_elem_bt)); 5348 } 5349 } 5350 5351 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5352 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5353 Register rscratch, int vec_enc) { 5354 evcvttps2qq(dst, src, vec_enc); 5355 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5356 } 5357 5358 // Handling for downcasting from double to integer or sub-word types on AVX2. 5359 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5360 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5361 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5362 int to_elem_sz = type2aelembytes(to_elem_bt); 5363 assert(to_elem_sz < 8, ""); 5364 vcvttpd2dq(dst, src, vec_enc); 5365 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5366 float_sign_flip, vec_enc); 5367 if (to_elem_sz < 4) { 5368 // xtmp4 holds all zero lanes. 5369 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5370 } 5371 } 5372 5373 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5374 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5375 KRegister ktmp2, AddressLiteral sign_flip, 5376 Register rscratch, int vec_enc) { 5377 if (VM_Version::supports_avx512dq()) { 5378 evcvttpd2qq(dst, src, vec_enc); 5379 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5380 switch(to_elem_bt) { 5381 case T_LONG: 5382 break; 5383 case T_INT: 5384 evpmovsqd(dst, dst, vec_enc); 5385 break; 5386 case T_SHORT: 5387 evpmovsqd(dst, dst, vec_enc); 5388 evpmovdw(dst, dst, vec_enc); 5389 break; 5390 case T_BYTE: 5391 evpmovsqd(dst, dst, vec_enc); 5392 evpmovdb(dst, dst, vec_enc); 5393 break; 5394 default: assert(false, "%s", type2name(to_elem_bt)); 5395 } 5396 } else { 5397 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5398 vcvttpd2dq(dst, src, vec_enc); 5399 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5400 switch(to_elem_bt) { 5401 case T_INT: 5402 break; 5403 case T_SHORT: 5404 evpmovdw(dst, dst, vec_enc); 5405 break; 5406 case T_BYTE: 5407 evpmovdb(dst, dst, vec_enc); 5408 break; 5409 default: assert(false, "%s", type2name(to_elem_bt)); 5410 } 5411 } 5412 } 5413 5414 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5415 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5416 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5417 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5418 // and re-instantiate original MXCSR.RC mode after that. 5419 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5420 5421 mov64(tmp, julong_cast(0.5L)); 5422 evpbroadcastq(xtmp1, tmp, vec_enc); 5423 vaddpd(xtmp1, src , xtmp1, vec_enc); 5424 evcvtpd2qq(dst, xtmp1, vec_enc); 5425 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5426 double_sign_flip, vec_enc);; 5427 5428 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5429 } 5430 5431 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5432 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5433 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5434 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5435 // and re-instantiate original MXCSR.RC mode after that. 
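// In effect this mimics Math.round: vcvtps2dq honors the MXCSR rounding mode, so with
// RC temporarily set to round-toward-negative-infinity the conversion acts as a floor,
// and floor(x + 0.5) rounds halfway cases up (e.g. 2.5 -> 3, -2.5 -> -2), matching
// java.lang.Math.round semantics for in-range values. Special values are patched
// afterwards by the *_special_cases_evex fixup below.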
5436 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5437 5438 movl(tmp, jint_cast(0.5)); 5439 movq(xtmp1, tmp); 5440 vbroadcastss(xtmp1, xtmp1, vec_enc); 5441 vaddps(xtmp1, src , xtmp1, vec_enc); 5442 vcvtps2dq(dst, xtmp1, vec_enc); 5443 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5444 float_sign_flip, vec_enc); 5445 5446 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5447 } 5448 5449 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5450 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5451 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5452 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5453 // and re-instantiate original MXCSR.RC mode after that. 5454 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5455 5456 movl(tmp, jint_cast(0.5)); 5457 movq(xtmp1, tmp); 5458 vbroadcastss(xtmp1, xtmp1, vec_enc); 5459 vaddps(xtmp1, src , xtmp1, vec_enc); 5460 vcvtps2dq(dst, xtmp1, vec_enc); 5461 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5462 5463 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5464 } 5465 5466 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5467 BasicType from_elem_bt, BasicType to_elem_bt) { 5468 switch (from_elem_bt) { 5469 case T_BYTE: 5470 switch (to_elem_bt) { 5471 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5472 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5473 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5474 default: ShouldNotReachHere(); 5475 } 5476 break; 5477 case T_SHORT: 5478 switch (to_elem_bt) { 5479 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5480 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5481 default: ShouldNotReachHere(); 5482 } 5483 break; 5484 case T_INT: 5485 assert(to_elem_bt == T_LONG, ""); 5486 vpmovzxdq(dst, src, vlen_enc); 5487 break; 5488 default: 5489 ShouldNotReachHere(); 5490 } 5491 } 5492 5493 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5494 BasicType from_elem_bt, BasicType to_elem_bt) { 5495 switch (from_elem_bt) { 5496 case T_BYTE: 5497 switch (to_elem_bt) { 5498 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5499 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5500 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5501 default: ShouldNotReachHere(); 5502 } 5503 break; 5504 case T_SHORT: 5505 switch (to_elem_bt) { 5506 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5507 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5508 default: ShouldNotReachHere(); 5509 } 5510 break; 5511 case T_INT: 5512 assert(to_elem_bt == T_LONG, ""); 5513 vpmovsxdq(dst, src, vlen_enc); 5514 break; 5515 default: 5516 ShouldNotReachHere(); 5517 } 5518 } 5519 5520 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5521 BasicType dst_bt, BasicType src_bt, int vlen) { 5522 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5523 assert(vlen_enc != AVX_512bit, ""); 5524 5525 int dst_bt_size = type2aelembytes(dst_bt); 5526 int src_bt_size = type2aelembytes(src_bt); 5527 if (dst_bt_size > src_bt_size) { 5528 switch (dst_bt_size / src_bt_size) { 5529 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5530 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5531 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5532 default: ShouldNotReachHere(); 5533 } 5534 } else { 5535 assert(dst_bt_size < src_bt_size, ""); 5536 switch (src_bt_size / dst_bt_size) { 5537 case 2: { 5538 if (vlen_enc == AVX_128bit) { 5539 vpacksswb(dst, src, src, vlen_enc); 5540 } else { 5541 vpacksswb(dst, src, src, vlen_enc); 5542 vpermq(dst, dst, 0x08, vlen_enc); 5543 } 5544 break; 5545 } 5546 case 4: { 5547 if (vlen_enc == AVX_128bit) { 5548 vpackssdw(dst, src, src, vlen_enc); 5549 vpacksswb(dst, dst, dst, vlen_enc); 5550 } else { 5551 vpackssdw(dst, src, src, vlen_enc); 5552 vpermq(dst, dst, 0x08, vlen_enc); 5553 vpacksswb(dst, dst, dst, AVX_128bit); 5554 } 5555 break; 5556 } 5557 case 8: { 5558 if (vlen_enc == AVX_128bit) { 5559 vpshufd(dst, src, 0x08, vlen_enc); 5560 vpackssdw(dst, dst, dst, vlen_enc); 5561 vpacksswb(dst, dst, dst, vlen_enc); 5562 } else { 5563 vpshufd(dst, src, 0x08, vlen_enc); 5564 vpermq(dst, dst, 0x08, vlen_enc); 5565 vpackssdw(dst, dst, dst, AVX_128bit); 5566 vpacksswb(dst, dst, dst, AVX_128bit); 5567 } 5568 break; 5569 } 5570 default: ShouldNotReachHere(); 5571 } 5572 } 5573 } 5574 5575 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5576 bool merge, BasicType bt, int vlen_enc) { 5577 if (bt == T_INT) { 5578 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5579 } else { 5580 assert(bt == T_LONG, ""); 5581 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5582 } 5583 } 5584 5585 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5586 bool merge, BasicType bt, int vlen_enc) { 5587 if (bt == T_INT) { 5588 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5589 } else { 5590 assert(bt == T_LONG, ""); 5591 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5592 } 5593 } 5594 5595 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5596 Register rtmp2, XMMRegister xtmp, int mask_len, 5597 int vec_enc) { 5598 int index = 0; 5599 int vindex = 0; 5600 mov64(rtmp1, 0x0101010101010101L); 5601 pdepq(rtmp1, src, rtmp1); 5602 if (mask_len > 8) { 5603 movq(rtmp2, src); 5604 vpxor(xtmp, xtmp, xtmp, vec_enc); 5605 movq(xtmp, rtmp1); 5606 } 5607 movq(dst, rtmp1); 5608 5609 mask_len -= 8; 5610 while (mask_len > 0) { 5611 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5612 index++; 5613 if ((index % 2) == 0) { 5614 pxor(xtmp, xtmp); 5615 } 5616 mov64(rtmp1, 0x0101010101010101L); 5617 shrq(rtmp2, 8); 5618 pdepq(rtmp1, rtmp2, rtmp1); 5619 pinsrq(xtmp, rtmp1, index % 2); 5620 vindex = index / 2; 5621 if (vindex) { 5622 // Write entire 16 byte vector when both 64 bit 5623 // lanes are update to save redundant instructions. 
5624 if (index % 2) { 5625 vinsertf128(dst, dst, xtmp, vindex); 5626 } 5627 } else { 5628 vmovdqu(dst, xtmp); 5629 } 5630 mask_len -= 8; 5631 } 5632 } 5633 5634 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5635 switch(opc) { 5636 case Op_VectorMaskTrueCount: 5637 popcntq(dst, tmp); 5638 break; 5639 case Op_VectorMaskLastTrue: 5640 if (VM_Version::supports_lzcnt()) { 5641 lzcntq(tmp, tmp); 5642 movl(dst, 63); 5643 subl(dst, tmp); 5644 } else { 5645 movl(dst, -1); 5646 bsrq(tmp, tmp); 5647 cmov32(Assembler::notZero, dst, tmp); 5648 } 5649 break; 5650 case Op_VectorMaskFirstTrue: 5651 if (VM_Version::supports_bmi1()) { 5652 if (masklen < 32) { 5653 orl(tmp, 1 << masklen); 5654 tzcntl(dst, tmp); 5655 } else if (masklen == 32) { 5656 tzcntl(dst, tmp); 5657 } else { 5658 assert(masklen == 64, ""); 5659 tzcntq(dst, tmp); 5660 } 5661 } else { 5662 if (masklen < 32) { 5663 orl(tmp, 1 << masklen); 5664 bsfl(dst, tmp); 5665 } else { 5666 assert(masklen == 32 || masklen == 64, ""); 5667 movl(dst, masklen); 5668 if (masklen == 32) { 5669 bsfl(tmp, tmp); 5670 } else { 5671 bsfq(tmp, tmp); 5672 } 5673 cmov32(Assembler::notZero, dst, tmp); 5674 } 5675 } 5676 break; 5677 case Op_VectorMaskToLong: 5678 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5679 break; 5680 default: assert(false, "Unhandled mask operation"); 5681 } 5682 } 5683 5684 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5685 int masklen, int masksize, int vec_enc) { 5686 assert(VM_Version::supports_popcnt(), ""); 5687 5688 if(VM_Version::supports_avx512bw()) { 5689 kmovql(tmp, mask); 5690 } else { 5691 assert(masklen <= 16, ""); 5692 kmovwl(tmp, mask); 5693 } 5694 5695 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5696 // operations needs to be clipped. 5697 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5698 andq(tmp, (1 << masklen) - 1); 5699 } 5700 5701 vector_mask_operation_helper(opc, dst, tmp, masklen); 5702 } 5703 5704 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5705 Register tmp, int masklen, BasicType bt, int vec_enc) { 5706 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5707 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5708 assert(VM_Version::supports_popcnt(), ""); 5709 5710 bool need_clip = false; 5711 switch(bt) { 5712 case T_BOOLEAN: 5713 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5714 vpxor(xtmp, xtmp, xtmp, vec_enc); 5715 vpsubb(xtmp, xtmp, mask, vec_enc); 5716 vpmovmskb(tmp, xtmp, vec_enc); 5717 need_clip = masklen < 16; 5718 break; 5719 case T_BYTE: 5720 vpmovmskb(tmp, mask, vec_enc); 5721 need_clip = masklen < 16; 5722 break; 5723 case T_SHORT: 5724 vpacksswb(xtmp, mask, mask, vec_enc); 5725 if (masklen >= 16) { 5726 vpermpd(xtmp, xtmp, 8, vec_enc); 5727 } 5728 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5729 need_clip = masklen < 16; 5730 break; 5731 case T_INT: 5732 case T_FLOAT: 5733 vmovmskps(tmp, mask, vec_enc); 5734 need_clip = masklen < 4; 5735 break; 5736 case T_LONG: 5737 case T_DOUBLE: 5738 vmovmskpd(tmp, mask, vec_enc); 5739 need_clip = masklen < 2; 5740 break; 5741 default: assert(false, "Unhandled type, %s", type2name(bt)); 5742 } 5743 5744 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5745 // operations needs to be clipped. 
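// For example, with masklen == 4 only the low four bits of tmp are meaningful, so the
// clip below reduces tmp with (1 << 4) - 1 = 0xF before the bits are counted or scanned.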
5746 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5747 // need_clip implies masklen < 32 5748 andq(tmp, (1 << masklen) - 1); 5749 } 5750 5751 vector_mask_operation_helper(opc, dst, tmp, masklen); 5752 } 5753 5754 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5755 Register rtmp2, int mask_len) { 5756 kmov(rtmp1, src); 5757 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5758 mov64(rtmp2, -1L); 5759 pextq(rtmp2, rtmp2, rtmp1); 5760 kmov(dst, rtmp2); 5761 } 5762 5763 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5764 XMMRegister mask, Register rtmp, Register rscratch, 5765 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5766 int vec_enc) { 5767 assert(type2aelembytes(bt) >= 4, ""); 5768 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5769 address compress_perm_table = nullptr; 5770 address expand_perm_table = nullptr; 5771 if (type2aelembytes(bt) == 8) { 5772 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5773 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5774 vmovmskpd(rtmp, mask, vec_enc); 5775 } else { 5776 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5777 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5778 vmovmskps(rtmp, mask, vec_enc); 5779 } 5780 shlq(rtmp, 5); // for 32 byte permute row. 5781 if (opcode == Op_CompressV) { 5782 lea(rscratch, ExternalAddress(compress_perm_table)); 5783 } else { 5784 lea(rscratch, ExternalAddress(expand_perm_table)); 5785 } 5786 addptr(rtmp, rscratch); 5787 vmovdqu(permv, Address(rtmp)); 5788 vpermps(dst, permv, src, Assembler::AVX_256bit); 5789 vpxor(xtmp, xtmp, xtmp, vec_enc); 5790 // Blend the result with zero vector using permute mask, each column entry 5791 // in a permute table row contains either a valid permute index or a -1 (default) 5792 // value, this can potentially be used as a blending mask after 5793 // compressing/expanding the source vector lanes. 
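// For illustration, assuming the table layout described above: compressing an 8-lane int
// vector whose mask selects lanes 0 and 2 uses the permute row {0, 2, -1, -1, -1, -1, -1, -1};
// lanes 0 and 2 are gathered to the front by vpermps and the -1 columns pick up zeros via
// the blend below.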
5794 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5795 } 5796 5797 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5798 bool merge, BasicType bt, int vec_enc) { 5799 if (opcode == Op_CompressV) { 5800 switch(bt) { 5801 case T_BYTE: 5802 evpcompressb(dst, mask, src, merge, vec_enc); 5803 break; 5804 case T_CHAR: 5805 case T_SHORT: 5806 evpcompressw(dst, mask, src, merge, vec_enc); 5807 break; 5808 case T_INT: 5809 evpcompressd(dst, mask, src, merge, vec_enc); 5810 break; 5811 case T_FLOAT: 5812 evcompressps(dst, mask, src, merge, vec_enc); 5813 break; 5814 case T_LONG: 5815 evpcompressq(dst, mask, src, merge, vec_enc); 5816 break; 5817 case T_DOUBLE: 5818 evcompresspd(dst, mask, src, merge, vec_enc); 5819 break; 5820 default: 5821 fatal("Unsupported type %s", type2name(bt)); 5822 break; 5823 } 5824 } else { 5825 assert(opcode == Op_ExpandV, ""); 5826 switch(bt) { 5827 case T_BYTE: 5828 evpexpandb(dst, mask, src, merge, vec_enc); 5829 break; 5830 case T_CHAR: 5831 case T_SHORT: 5832 evpexpandw(dst, mask, src, merge, vec_enc); 5833 break; 5834 case T_INT: 5835 evpexpandd(dst, mask, src, merge, vec_enc); 5836 break; 5837 case T_FLOAT: 5838 evexpandps(dst, mask, src, merge, vec_enc); 5839 break; 5840 case T_LONG: 5841 evpexpandq(dst, mask, src, merge, vec_enc); 5842 break; 5843 case T_DOUBLE: 5844 evexpandpd(dst, mask, src, merge, vec_enc); 5845 break; 5846 default: 5847 fatal("Unsupported type %s", type2name(bt)); 5848 break; 5849 } 5850 } 5851 } 5852 5853 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5854 KRegister ktmp1, int vec_enc) { 5855 if (opcode == Op_SignumVD) { 5856 vsubpd(dst, zero, one, vec_enc); 5857 // if src < 0 ? -1 : 1 5858 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5859 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5860 // if src == NaN, -0.0 or 0.0 return src. 5861 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5862 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5863 } else { 5864 assert(opcode == Op_SignumVF, ""); 5865 vsubps(dst, zero, one, vec_enc); 5866 // if src < 0 ? -1 : 1 5867 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5868 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5869 // if src == NaN, -0.0 or 0.0 return src. 5870 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5871 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5872 } 5873 } 5874 5875 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5876 XMMRegister xtmp1, int vec_enc) { 5877 if (opcode == Op_SignumVD) { 5878 vsubpd(dst, zero, one, vec_enc); 5879 // if src < 0 ? -1 : 1 5880 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5881 // if src == NaN, -0.0 or 0.0 return src. 5882 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5883 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5884 } else { 5885 assert(opcode == Op_SignumVF, ""); 5886 vsubps(dst, zero, one, vec_enc); 5887 // if src < 0 ? -1 : 1 5888 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5889 // if src == NaN, -0.0 or 0.0 return src. 
5890 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5891 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5892 }
5893 }
5894
5895 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5896 if (VM_Version::supports_avx512bw()) {
5897 if (mask_len > 32) {
5898 kmovql(dst, src);
5899 } else {
5900 kmovdl(dst, src);
5901 if (mask_len != 32) {
5902 kshiftrdl(dst, dst, 32 - mask_len);
5903 }
5904 }
5905 } else {
5906 assert(mask_len <= 16, "");
5907 kmovwl(dst, src);
5908 if (mask_len != 16) {
5909 kshiftrwl(dst, dst, 16 - mask_len);
5910 }
5911 }
5912 }
5913
5914 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5915 int lane_size = type2aelembytes(bt);
5916 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5917 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5918 movptr(rtmp, imm32);
5919 switch(lane_size) {
5920 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5921 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5922 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5923 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5924 default : fatal("Unsupported lane size %d", lane_size);
5925 break;
5926 }
5927 } else {
5928 movptr(rtmp, imm32);
5929 movq(dst, rtmp);
5930 switch(lane_size) {
5931 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5932 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5933 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5934 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5935 default : fatal("Unsupported lane size %d", lane_size);
5936 break;
5937 }
5938 }
5939 }
5940
5941 //
5942 // Following is the lookup table based popcount computation algorithm:
5943 // Index Bit set count
5944 // [ 0000 -> 0,
5945 // 0001 -> 1,
5946 // 0010 -> 1,
5947 // 0011 -> 2,
5948 // 0100 -> 1,
5949 // 0101 -> 2,
5950 // 0110 -> 2,
5951 // 0111 -> 3,
5952 // 1000 -> 1,
5953 // 1001 -> 2,
5954 // 1010 -> 2,
5955 // 1011 -> 3,
5956 // 1100 -> 2,
5957 // 1101 -> 3, 1110 -> 3,
5958 // 1111 -> 4 ]
5959 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5960 // shuffle indices for lookup table access.
5961 // b. Right shift each byte of vector lane by 4 positions.
5962 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5963 // shuffle indices for lookup table access.
5964 // d. Add the bitset count of upper and lower 4 bits of each byte.
5965 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5966 // count of all the bytes of a quadword.
5967 // f. Perform step e. for upper 128bit vector lane.
5968 // g. Pack the bitset count of quadwords back to double word.
5969 // h. Unpacking and packing operations are not needed for 64bit vector lane.
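//
// As a minimal scalar sketch of steps a-d (illustration only, not part of the generated
// code), the same 4-bit lookup table yields the popcount of a single byte:
//
//   static const uint8_t popcount_lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   static int popcount_byte_scalar(uint8_t b) {
//     return popcount_lut[b & 0x0F] + popcount_lut[b >> 4];  // low nibble + high nibble
//   }
//
// The vector code below performs these two table lookups for every byte lane at once with
// vpshufb, then widens the per-byte counts (steps e-h) using vpsadbw/vpackuswb where needed.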
5970 5971 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5972 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5973 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5974 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5975 vpsrlw(dst, src, 4, vec_enc); 5976 vpand(dst, dst, xtmp1, vec_enc); 5977 vpand(xtmp1, src, xtmp1, vec_enc); 5978 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5979 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5980 vpshufb(dst, xtmp2, dst, vec_enc); 5981 vpaddb(dst, dst, xtmp1, vec_enc); 5982 } 5983 5984 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5985 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5986 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5987 // Following code is as per steps e,f,g and h of above algorithm. 5988 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5989 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5990 vpsadbw(dst, dst, xtmp2, vec_enc); 5991 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5992 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5993 vpackuswb(dst, xtmp1, dst, vec_enc); 5994 } 5995 5996 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5997 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5998 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5999 // Add the popcount of upper and lower bytes of word. 6000 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 6001 vpsrlw(dst, xtmp1, 8, vec_enc); 6002 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 6003 vpaddw(dst, dst, xtmp1, vec_enc); 6004 } 6005 6006 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6007 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6008 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 6009 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6010 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 6011 } 6012 6013 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6014 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6015 switch(bt) { 6016 case T_LONG: 6017 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 6018 break; 6019 case T_INT: 6020 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 6021 break; 6022 case T_CHAR: 6023 case T_SHORT: 6024 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 6025 break; 6026 case T_BYTE: 6027 case T_BOOLEAN: 6028 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 6029 break; 6030 default: 6031 fatal("Unsupported type %s", type2name(bt)); 6032 break; 6033 } 6034 } 6035 6036 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6037 KRegister mask, bool merge, int vec_enc) { 6038 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6039 switch(bt) { 6040 case T_LONG: 6041 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6042 evpopcntq(dst, mask, src, merge, vec_enc); 6043 break; 6044 case T_INT: 6045 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 6046 evpopcntd(dst, mask, src, merge, vec_enc); 6047 break; 6048 case T_CHAR: 6049 case T_SHORT: 6050 assert(VM_Version::supports_avx512_bitalg(), ""); 6051 evpopcntw(dst, mask, src, merge, vec_enc); 6052 break; 6053 case T_BYTE: 6054 case T_BOOLEAN: 6055 assert(VM_Version::supports_avx512_bitalg(), ""); 6056 evpopcntb(dst, mask, 
src, merge, vec_enc); 6057 break; 6058 default: 6059 fatal("Unsupported type %s", type2name(bt)); 6060 break; 6061 } 6062 } 6063 6064 // Bit reversal algorithm first reverses the bits of each byte followed by 6065 // a byte level reversal for multi-byte primitive types (short/int/long). 6066 // Algorithm performs a lookup table access to get reverse bit sequence 6067 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 6068 // is obtained by swapping the reverse bit sequences of upper and lower 6069 // nibble of a byte. 6070 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6071 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6072 if (VM_Version::supports_avx512vlbw()) { 6073 6074 // Get the reverse bit sequence of lower nibble of each byte. 6075 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 6076 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6077 evpandq(dst, xtmp2, src, vec_enc); 6078 vpshufb(dst, xtmp1, dst, vec_enc); 6079 vpsllq(dst, dst, 4, vec_enc); 6080 6081 // Get the reverse bit sequence of upper nibble of each byte. 6082 vpandn(xtmp2, xtmp2, src, vec_enc); 6083 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6084 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6085 6086 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6087 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6088 evporq(xtmp2, dst, xtmp2, vec_enc); 6089 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6090 6091 } else if(vec_enc == Assembler::AVX_512bit) { 6092 // Shift based bit reversal. 6093 assert(bt == T_LONG || bt == T_INT, ""); 6094 6095 // Swap lower and upper nibble of each byte. 6096 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 6097 6098 // Swap two least and most significant bits of each nibble. 6099 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 6100 6101 // Swap adjacent pair of bits. 6102 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6103 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 6104 6105 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6106 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6107 } else { 6108 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6109 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6110 6111 // Get the reverse bit sequence of lower nibble of each byte. 6112 vpand(dst, xtmp2, src, vec_enc); 6113 vpshufb(dst, xtmp1, dst, vec_enc); 6114 vpsllq(dst, dst, 4, vec_enc); 6115 6116 // Get the reverse bit sequence of upper nibble of each byte. 6117 vpandn(xtmp2, xtmp2, src, vec_enc); 6118 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6119 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6120 6121 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6122 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 6123 vpor(xtmp2, dst, xtmp2, vec_enc); 6124 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6125 } 6126 } 6127 6128 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6129 XMMRegister xtmp, Register rscratch) { 6130 assert(VM_Version::supports_gfni(), ""); 6131 assert(rscratch != noreg || always_reachable(mask), "missing"); 6132 6133 // Galois field instruction based bit reversal based on following algorithm. 
6134 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6135 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6136 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6137 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6138 } 6139 6140 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6141 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6142 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6143 evpandq(dst, xtmp1, src, vec_enc); 6144 vpsllq(dst, dst, nbits, vec_enc); 6145 vpandn(xtmp1, xtmp1, src, vec_enc); 6146 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6147 evporq(dst, dst, xtmp1, vec_enc); 6148 } 6149 6150 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6151 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6152 // Shift based bit reversal. 6153 assert(VM_Version::supports_evex(), ""); 6154 switch(bt) { 6155 case T_LONG: 6156 // Swap upper and lower double word of each quad word. 6157 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6158 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6159 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6160 break; 6161 case T_INT: 6162 // Swap upper and lower word of each double word. 6163 evprord(xtmp1, k0, src, 16, true, vec_enc); 6164 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6165 break; 6166 case T_CHAR: 6167 case T_SHORT: 6168 // Swap upper and lower byte of each word. 6169 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6170 break; 6171 case T_BYTE: 6172 evmovdquq(dst, k0, src, true, vec_enc); 6173 break; 6174 default: 6175 fatal("Unsupported type %s", type2name(bt)); 6176 break; 6177 } 6178 } 6179 6180 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6181 if (bt == T_BYTE) { 6182 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6183 evmovdquq(dst, k0, src, true, vec_enc); 6184 } else { 6185 vmovdqu(dst, src); 6186 } 6187 return; 6188 } 6189 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6190 // pre-computed shuffle indices. 
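// As a point of reference, the per-element transformation performed here is the ordinary
// byte swap; for T_INT a scalar sketch (illustration only) would be:
//
//   static uint32_t reverse_bytes_int_scalar(uint32_t x) {
//     return ((x & 0x000000FFu) << 24) | ((x & 0x0000FF00u) << 8) |
//            ((x & 0x00FF0000u) >> 8)  | ((x & 0xFF000000u) >> 24);
//   }
//
// The pre-computed shuffle masks loaded below apply the analogous per-element byte
// permutation to every lane of the vector with a single vpshufb.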
6191 switch(bt) { 6192 case T_LONG: 6193 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6194 break; 6195 case T_INT: 6196 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6197 break; 6198 case T_CHAR: 6199 case T_SHORT: 6200 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6201 break; 6202 default: 6203 fatal("Unsupported type %s", type2name(bt)); 6204 break; 6205 } 6206 vpshufb(dst, src, dst, vec_enc); 6207 } 6208 6209 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6210 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6211 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6212 assert(is_integral_type(bt), ""); 6213 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6214 assert(VM_Version::supports_avx512cd(), ""); 6215 switch(bt) { 6216 case T_LONG: 6217 evplzcntq(dst, ktmp, src, merge, vec_enc); 6218 break; 6219 case T_INT: 6220 evplzcntd(dst, ktmp, src, merge, vec_enc); 6221 break; 6222 case T_SHORT: 6223 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6224 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6225 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6226 vpunpckhwd(dst, xtmp1, src, vec_enc); 6227 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6228 vpackusdw(dst, xtmp2, dst, vec_enc); 6229 break; 6230 case T_BYTE: 6231 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6232 // accessing the lookup table. 6233 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6234 // accessing the lookup table. 6235 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6236 assert(VM_Version::supports_avx512bw(), ""); 6237 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6238 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6239 vpand(xtmp2, dst, src, vec_enc); 6240 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6241 vpsrlw(xtmp3, src, 4, vec_enc); 6242 vpand(xtmp3, dst, xtmp3, vec_enc); 6243 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6244 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6245 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6246 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6247 break; 6248 default: 6249 fatal("Unsupported type %s", type2name(bt)); 6250 break; 6251 } 6252 } 6253 6254 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6255 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6256 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6257 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6258 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6259 // accessing the lookup table. 6260 vpand(dst, xtmp2, src, vec_enc); 6261 vpshufb(dst, xtmp1, dst, vec_enc); 6262 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6263 // accessing the lookup table. 6264 vpsrlw(xtmp3, src, 4, vec_enc); 6265 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6266 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6267 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
6268 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6269 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6270 vpaddb(dst, dst, xtmp2, vec_enc); 6271 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6272 } 6273 6274 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6275 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6276 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6277 // Add zero counts of lower byte and upper byte of a word if 6278 // upper byte holds a zero value. 6279 vpsrlw(xtmp3, src, 8, vec_enc); 6280 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6281 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6282 vpsllw(xtmp2, dst, 8, vec_enc); 6283 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6284 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6285 vpsrlw(dst, dst, 8, vec_enc); 6286 } 6287 6288 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6289 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6290 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6291 // hence biased exponent can be used to compute leading zero count as per 6292 // following formula:- 6293 // LZCNT = 31 - (biased_exp - 127) 6294 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6295 6296 // Broadcast 0xFF 6297 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6298 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6299 6300 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6301 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6302 // contributes to the leading number of zeros. 6303 vpsrld(xtmp2, src, 1, vec_enc); 6304 vpandn(xtmp3, xtmp2, src, vec_enc); 6305 6306 // Extract biased exponent. 6307 vcvtdq2ps(dst, xtmp3, vec_enc); 6308 vpsrld(dst, dst, 23, vec_enc); 6309 vpand(dst, dst, xtmp1, vec_enc); 6310 6311 // Broadcast 127. 6312 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6313 // Exponent = biased_exp - 127 6314 vpsubd(dst, dst, xtmp1, vec_enc); 6315 6316 // Exponent_plus_one = Exponent + 1 6317 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6318 vpaddd(dst, dst, xtmp3, vec_enc); 6319 6320 // Replace -ve exponent with zero, exponent is -ve when src 6321 // lane contains a zero value. 6322 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6323 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6324 6325 // Rematerialize broadcast 32. 6326 vpslld(xtmp1, xtmp3, 5, vec_enc); 6327 // Exponent is 32 if corresponding source lane contains max_int value. 6328 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6329 // LZCNT = 32 - exponent_plus_one 6330 vpsubd(dst, xtmp1, dst, vec_enc); 6331 6332 // Replace LZCNT with a value 1 if corresponding source lane 6333 // contains max_int value. 6334 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6335 6336 // Replace biased_exp with 0 if source lane value is less than zero. 6337 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6338 vblendvps(dst, dst, xtmp2, src, vec_enc); 6339 } 6340 6341 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6342 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6343 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6344 // Add zero counts of lower word and upper word of a double word if 6345 // upper word holds a zero value. 
6346 vpsrld(xtmp3, src, 16, vec_enc); 6347 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6348 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6349 vpslld(xtmp2, dst, 16, vec_enc); 6350 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6351 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6352 vpsrld(dst, dst, 16, vec_enc); 6353 // Add zero counts of lower doubleword and upper doubleword of a 6354 // quadword if upper doubleword holds a zero value. 6355 vpsrlq(xtmp3, src, 32, vec_enc); 6356 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6357 vpsllq(xtmp2, dst, 32, vec_enc); 6358 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6359 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6360 vpsrlq(dst, dst, 32, vec_enc); 6361 } 6362 6363 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6364 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6365 Register rtmp, int vec_enc) { 6366 assert(is_integral_type(bt), "unexpected type"); 6367 assert(vec_enc < Assembler::AVX_512bit, ""); 6368 switch(bt) { 6369 case T_LONG: 6370 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6371 break; 6372 case T_INT: 6373 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6374 break; 6375 case T_SHORT: 6376 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6377 break; 6378 case T_BYTE: 6379 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6380 break; 6381 default: 6382 fatal("Unsupported type %s", type2name(bt)); 6383 break; 6384 } 6385 } 6386 6387 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6388 switch(bt) { 6389 case T_BYTE: 6390 vpsubb(dst, src1, src2, vec_enc); 6391 break; 6392 case T_SHORT: 6393 vpsubw(dst, src1, src2, vec_enc); 6394 break; 6395 case T_INT: 6396 vpsubd(dst, src1, src2, vec_enc); 6397 break; 6398 case T_LONG: 6399 vpsubq(dst, src1, src2, vec_enc); 6400 break; 6401 default: 6402 fatal("Unsupported type %s", type2name(bt)); 6403 break; 6404 } 6405 } 6406 6407 // Trailing zero count computation is based on leading zero count operation as per 6408 // following equation. All AVX3 targets support AVX512CD feature which offers 6409 // direct vector instruction to compute leading zero count. 
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
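// Reference note: a minimal scalar C++ sketch (illustrative only, not part of the
// emitted code) of the divisor < 0 fastpath used by udivI/umodI above. When the
// divisor, viewed as unsigned, has its top bit set, the quotient can only be 0 or 1,
// so division reduces to one branch-free compare as in the Hacker's Delight formulas
// quoted in the comments. Assumes 32-bit wrap-around unsigned arithmetic.
#if 0  // illustrative sketch only, never compiled
static uint32_t udiv_by_neg_divisor(uint32_t dividend, uint32_t divisor) {
  // quotient = (dividend & ~(dividend - divisor)) >>> 31, i.e. 1 iff dividend >= divisor
  return (dividend & ~(dividend - divisor)) >> 31;
}

static uint32_t umod_by_neg_divisor(uint32_t dividend, uint32_t divisor) {
  // remainder = dividend - (quotient ? divisor : 0)
  uint32_t q = udiv_by_neg_divisor(dividend, divisor);
  return dividend - (q ? divisor : 0);
}
#endif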
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient  = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field affine instruction based bit reversal, based on the following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field affine instruction based bit reversal, based on the following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}
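// Reference note: the non-GFNI path of reverseI/reverseL above mirrors the classic
// swap-based bit reversal. A scalar C++ sketch of the 32-bit variant (illustrative
// only, not used by the assembler):
#if 0  // illustrative sketch only, never compiled
static uint32_t reverse_bits32(uint32_t x) {
  x = ((x & 0x55555555u) << 1) | ((x & 0xAAAAAAAAu) >> 1);  // swap even/odd bits
  x = ((x & 0x33333333u) << 2) | ((x & 0xCCCCCCCCu) >> 2);  // swap bit pairs in each nibble
  x = ((x & 0x0F0F0F0Fu) << 4) | ((x & 0xF0F0F0F0u) >> 4);  // swap nibbles in each byte
  // byte swap, corresponding to the trailing bswapl
  return (x << 24) | ((x & 0x0000FF00u) << 8) | ((x >> 8) & 0x0000FF00u) | (x >> 24);
}
#endif
// The GFNI path instead multiplies each byte by the bit-reversal matrix
// 0x8040201008040201 via gf2p8affineqb and then byte-swaps, as described in the
// linked article.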
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient  = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which
  // differ by a multiple of 16 select the same relative position within a
  // 128 bit lane, e.g. shuffle indices 16, 32 and 48 all map to the same
  // in-lane byte position of their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
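// Reference note: a scalar model (illustrative only) of what rearrange_bytes above
// computes for a 64-byte vector, and why normalizing indices to their low 4 bits is
// sufficient once the matching 128-bit lane of src has been broadcast:
#if 0  // illustrative sketch only, never compiled
static void rearrange_bytes_ref(uint8_t dst[64], const uint8_t shuffle[64], const uint8_t src[64]) {
  for (int i = 0; i < 64; i++) {
    int idx  = shuffle[i] & 63;    // full selector, 0..63
    int lane = idx >> 4;           // which 128-bit lane of src is selected
    int pos  = idx & 15;           // in-lane position, the low 4 bits used by vpshufb
    dst[i] = src[lane * 16 + pos]; // == src[idx]
  }
}
#endif
// The emitted code handles one value of 'lane' per masked evpshufb pass, selecting
// the pass with the INDEX range comparisons shown in the comments.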
void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
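// Reference note: scalar semantics (illustrative only) of the signed and unsigned
// saturating byte additions selected above and below (vpaddsb / vpaddusb);
// subtraction clamps analogously at the opposite bound:
#if 0  // illustrative sketch only, never compiled
static int8_t sat_add_s8(int8_t a, int8_t b) {
  int r = int(a) + int(b);
  return (int8_t)(r > 127 ? 127 : (r < -128 ? -128 : r));  // clamp to [-128, 127]
}
static uint8_t sat_add_u8(uint8_t a, uint8_t b) {
  int r = int(a) + int(b);
  return (uint8_t)(r > 255 ? 255 : r);                     // clamp to [0, 255]
}
#endif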
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges contain only non-negative numbers, thus only upper bound saturation exists.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
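// Reference note: scalar forms (illustrative only) of the unsigned saturation checks
// used above for int/long lanes, for which no saturating vector instructions exist:
#if 0  // illustrative sketch only, never compiled
static uint32_t sat_add_u32(uint32_t a, uint32_t b) {
  uint32_t sum = a + b;                        // wraps on overflow
  return (sum < (a | b)) ? 0xFFFFFFFFu : sum;  // overflow_mask = (a + b) <u (a | b)
}
static uint32_t sat_sub_u32(uint32_t a, uint32_t b) {
  return (a < b) ? 0u : a - b;                 // overflow_mask = a <u b
}
#endif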
//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation.
// overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp1 = all ones (unsigned max), xtmp2 = Minimum signed value.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Res = overflow_mask ? Max_Unsigned : Res
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
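// Reference note (illustrative only): vpgenmax_value/vpgenmin_value derive the
// saturation constants from an all-ones register, which is cheap to materialize by
// comparing a register for equality with itself:
#if 0  // illustrative sketch only, never compiled
static void saturation_constants_example() {
  uint32_t all_ones = 0xFFFFFFFFu;               // vpcmpeqq / vpternlogd 0xff
  int32_t  max_int  = (int32_t)(all_ones >> 1);  // 0x7FFFFFFF
  int32_t  min_int  = (int32_t)(all_ones << 31); // 0x80000000 bit pattern
  (void)max_int; (void)min_int;
  // The AVX paths also use the minimum value to emulate unsigned compares with
  // signed ones:  (a <u b)  ==  ((int32_t)(a + 0x80000000u) < (int32_t)(b + 0x80000000u))
}
#endif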
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the inputs have the same polarity and the result polarity differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and
    // the result polarity does not match the first input's polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
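// Reference note: a scalar C++ sketch (illustrative only) of the overflow detection and
// saturation-value selection implemented by the routines immediately around this point,
// following the Hacker's Delight section 2-13 checks quoted in the comments. INT32_MIN
// and INT32_MAX are the usual <cstdint> constants.
#if 0  // illustrative sketch only, never compiled
static int32_t sat_add_s32(int32_t a, int32_t b) {
  int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);  // wrapping add
  int32_t ovf = (res ^ a) & (res ^ b);                 // sign bit set iff overflow
  if (ovf < 0) {
    res = (a < 0) ? INT32_MIN : INT32_MAX;             // saturate by first input polarity
  }
  return res;
}
static int32_t sat_sub_s32(int32_t a, int32_t b) {
  int32_t res = (int32_t)((uint32_t)a - (uint32_t)b);  // wrapping subtract
  int32_t ovf = (a ^ b) & (res ^ a);                   // sign bit set iff overflow
  if (ovf < 0) {
    res = (a < 0) ? INT32_MIN : INT32_MAX;
  }
  return res;
}
#endif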
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the inputs have the same polarity and the result polarity differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite polarity and
    // the result polarity does not match the first input's polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
7067 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 7068 } 7069 7070 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7071 switch(elem_bt) { 7072 case T_BYTE: 7073 if (ideal_opc == Op_SaturatingAddV) { 7074 vpaddsb(dst, src1, src2, vlen_enc); 7075 } else { 7076 assert(ideal_opc == Op_SaturatingSubV, ""); 7077 vpsubsb(dst, src1, src2, vlen_enc); 7078 } 7079 break; 7080 case T_SHORT: 7081 if (ideal_opc == Op_SaturatingAddV) { 7082 vpaddsw(dst, src1, src2, vlen_enc); 7083 } else { 7084 assert(ideal_opc == Op_SaturatingSubV, ""); 7085 vpsubsw(dst, src1, src2, vlen_enc); 7086 } 7087 break; 7088 default: 7089 fatal("Unsupported type %s", type2name(elem_bt)); 7090 break; 7091 } 7092 } 7093 7094 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7095 switch(elem_bt) { 7096 case T_BYTE: 7097 if (ideal_opc == Op_SaturatingAddV) { 7098 vpaddusb(dst, src1, src2, vlen_enc); 7099 } else { 7100 assert(ideal_opc == Op_SaturatingSubV, ""); 7101 vpsubusb(dst, src1, src2, vlen_enc); 7102 } 7103 break; 7104 case T_SHORT: 7105 if (ideal_opc == Op_SaturatingAddV) { 7106 vpaddusw(dst, src1, src2, vlen_enc); 7107 } else { 7108 assert(ideal_opc == Op_SaturatingSubV, ""); 7109 vpsubusw(dst, src1, src2, vlen_enc); 7110 } 7111 break; 7112 default: 7113 fatal("Unsupported type %s", type2name(elem_bt)); 7114 break; 7115 } 7116 } 7117 7118 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 7119 XMMRegister src2, int vlen_enc) { 7120 switch(elem_bt) { 7121 case T_BYTE: 7122 evpermi2b(dst, src1, src2, vlen_enc); 7123 break; 7124 case T_SHORT: 7125 evpermi2w(dst, src1, src2, vlen_enc); 7126 break; 7127 case T_INT: 7128 evpermi2d(dst, src1, src2, vlen_enc); 7129 break; 7130 case T_LONG: 7131 evpermi2q(dst, src1, src2, vlen_enc); 7132 break; 7133 case T_FLOAT: 7134 evpermi2ps(dst, src1, src2, vlen_enc); 7135 break; 7136 case T_DOUBLE: 7137 evpermi2pd(dst, src1, src2, vlen_enc); 7138 break; 7139 default: 7140 fatal("Unsupported type %s", type2name(elem_bt)); 7141 break; 7142 } 7143 } 7144 7145 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7146 if (is_unsigned) { 7147 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7148 } else { 7149 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7150 } 7151 } 7152 7153 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7154 if (is_unsigned) { 7155 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7156 } else { 7157 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7158 } 7159 } 7160 7161 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 7162 switch(opcode) { 7163 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7164 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7165 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7166 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7167 default: assert(false, "%s", NodeClassNames[opcode]); break; 7168 } 7169 } 7170 7171 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
                                 XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a +ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // if the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a -ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // if the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
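// Reference note: the masked-blend sequences above arrange for Java Math.max/min
// semantics per half-precision lane. A scalar float model (illustrative only) of the
// max case:
#if 0  // illustrative sketch only, never compiled
#include <cmath>
static float java_max_semantics(float a, float b) {
  if (std::isnan(a)) return a;                 // a NaN in either input produces NaN
  if (std::isnan(b)) return b;
  if (a == b) return std::signbit(a) ? b : a;  // max(-0.0, +0.0) == +0.0
  return a > b ? a : b;
}
#endif
// The pre-swap of the inputs plus the final merge under the unordered-compare mask is
// what makes the vmaxph/vminph instruction behaviour (second operand wins for equal
// zeros or a single NaN) line up with these rules.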