1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "gc/shared/barrierSet.hpp" 28 #include "gc/shared/barrierSetAssembler.hpp" 29 #include "oops/methodData.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/opcodes.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/globals.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 #include "utilities/globalDefinitions.hpp" 40 #include "utilities/powerOfTwo.hpp" 41 #include "utilities/sizes.hpp" 42 43 #ifdef PRODUCT 44 #define BLOCK_COMMENT(str) /* nothing */ 45 #define STOP(error) stop(error) 46 #else 47 #define BLOCK_COMMENT(str) block_comment(str) 48 #define STOP(error) block_comment(error); stop(error) 49 #endif 50 51 // C2 compiled method's prolog code. 52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 53 if (C->clinit_barrier_on_entry()) { 54 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 56 57 Label L_skip_barrier; 58 Register klass = rscratch1; 59 60 mov_metadata(klass, C->method()->holder()->constant_encoding()); 61 clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/); 62 63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 64 65 bind(L_skip_barrier); 66 } 67 68 int framesize = C->output()->frame_size_in_bytes(); 69 int bangsize = C->output()->bang_size_in_bytes(); 70 bool fp_mode_24b = false; 71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 72 73 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 74 75 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 76 // Remove word for return addr 77 framesize -= wordSize; 78 stack_bang_size -= wordSize; 79 80 // Calls to C2R adapters often do not accept exceptional returns. 81 // We require that their callers must bang for them. But be careful, because 82 // some VM calls (such as call site linkage) can use several kilobytes of 83 // stack. But the stack safety zone should account for that. 84 // See bugs 4446381, 4468289, 4497237. 
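  // Note (a sketch derived from the two branches below, not an additional invariant):
  // both paths end with the same frame shape -- the return address on top, the saved
  // rbp immediately below it, and rsp lowered to the bottom of the frame. They differ
  // only in whether the stack is banged first and whether rbp is saved with a push or
  // with an explicit store into the freshly allocated frame.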
85 if (stack_bang_size > 0) { 86 generate_stack_overflow_check(stack_bang_size); 87 88 // We always push rbp, so that on return to interpreter rbp, will be 89 // restored correctly and we can correct the stack. 90 push(rbp); 91 // Save caller's stack pointer into RBP if the frame pointer is preserved. 92 if (PreserveFramePointer) { 93 mov(rbp, rsp); 94 } 95 // Remove word for ebp 96 framesize -= wordSize; 97 98 // Create frame 99 if (framesize) { 100 subptr(rsp, framesize); 101 } 102 } else { 103 subptr(rsp, framesize); 104 105 // Save RBP register now. 106 framesize -= wordSize; 107 movptr(Address(rsp, framesize), rbp); 108 // Save caller's stack pointer into RBP if the frame pointer is preserved. 109 if (PreserveFramePointer) { 110 movptr(rbp, rsp); 111 if (framesize > 0) { 112 addptr(rbp, framesize); 113 } 114 } 115 } 116 117 if (C->needs_stack_repair()) { 118 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 119 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 120 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 121 } 122 123 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 124 framesize -= wordSize; 125 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 126 } 127 128 #ifdef ASSERT 129 if (VerifyStackAtCalls) { 130 Label L; 131 push(rax); 132 mov(rax, rsp); 133 andptr(rax, StackAlignmentInBytes-1); 134 cmpptr(rax, StackAlignmentInBytes-wordSize); 135 pop(rax); 136 jcc(Assembler::equal, L); 137 STOP("Stack is not properly aligned!"); 138 bind(L); 139 } 140 #endif 141 } 142 143 void C2_MacroAssembler::entry_barrier() { 144 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 145 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 146 Label dummy_slow_path; 147 Label dummy_continuation; 148 Label* slow_path = &dummy_slow_path; 149 Label* continuation = &dummy_continuation; 150 if (!Compile::current()->output()->in_scratch_emit_size()) { 151 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 152 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 153 Compile::current()->output()->add_stub(stub); 154 slow_path = &stub->entry(); 155 continuation = &stub->continuation(); 156 } 157 bs->nmethod_entry_barrier(this, slow_path, continuation); 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. 
// These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternatively, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
243 // In the case of failure, the node will branch directly to the 244 // FailureLabel 245 246 247 // obj: object to lock 248 // box: on-stack box address -- KILLED 249 // rax: tmp -- KILLED 250 // t : tmp -- KILLED 251 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 252 Register t, Register thread) { 253 assert(rax_reg == rax, "Used for CAS"); 254 assert_different_registers(obj, box, rax_reg, t, thread); 255 256 // Handle inflated monitor. 257 Label inflated; 258 // Finish fast lock successfully. ZF value is irrelevant. 259 Label locked; 260 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 261 Label slow_path; 262 263 if (UseObjectMonitorTable) { 264 // Clear cache in case fast locking succeeds or we need to take the slow-path. 265 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 266 } 267 268 if (DiagnoseSyncOnValueBasedClasses != 0) { 269 load_klass(rax_reg, obj, t); 270 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 271 jcc(Assembler::notZero, slow_path); 272 } 273 274 const Register mark = t; 275 276 { // Lightweight Lock 277 278 Label push; 279 280 const Register top = UseObjectMonitorTable ? rax_reg : box; 281 282 // Load the mark. 283 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 284 285 // Prefetch top. 286 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 287 288 // Check for monitor (0b10). 289 testptr(mark, markWord::monitor_value); 290 jcc(Assembler::notZero, inflated); 291 292 // Check if lock-stack is full. 293 cmpl(top, LockStack::end_offset() - 1); 294 jcc(Assembler::greater, slow_path); 295 296 // Check if recursive. 297 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 298 jccb(Assembler::equal, push); 299 300 // Try to lock. Transition lock bits 0b01 => 0b00 301 movptr(rax_reg, mark); 302 orptr(rax_reg, markWord::unlocked_value); 303 andptr(mark, ~(int32_t)markWord::unlocked_value); 304 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 305 jcc(Assembler::notEqual, slow_path); 306 307 if (UseObjectMonitorTable) { 308 // Need to reload top, clobbered by CAS. 309 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 310 } 311 bind(push); 312 // After successful lock, push object on lock-stack. 313 movptr(Address(thread, top), obj); 314 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 315 jmpb(locked); 316 } 317 318 { // Handle inflated monitor. 319 bind(inflated); 320 321 const Register monitor = t; 322 323 if (!UseObjectMonitorTable) { 324 assert(mark == monitor, "should be the same here"); 325 } else { 326 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 327 // Fetch ObjectMonitor* from the cache or take the slow-path. 328 Label monitor_found; 329 330 // Load cache address 331 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 332 333 const int num_unrolled = 2; 334 for (int i = 0; i < num_unrolled; i++) { 335 cmpptr(obj, Address(t)); 336 jccb(Assembler::equal, monitor_found); 337 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 338 } 339 340 Label loop; 341 342 // Search for obj in cache. 343 bind(loop); 344 345 // Check for match. 346 cmpptr(obj, Address(t)); 347 jccb(Assembler::equal, monitor_found); 348 349 // Search until null encountered, guaranteed _null_sentinel at end. 
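      // Note: comparing the cached entry against 1 (below) folds the null check and the
      // required flag state into one instruction: only a null entry is unsigned-below 1
      // (CF set), and because a non-null entry is an aligned oop it can never equal 1,
      // so ZF stays clear, which is the state slow_path expects.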
350 cmpptr(Address(t), 1); 351 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 352 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 353 jmpb(loop); 354 355 // Cache hit. 356 bind(monitor_found); 357 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 358 } 359 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 360 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 361 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 362 363 Label monitor_locked; 364 // Lock the monitor. 365 366 if (UseObjectMonitorTable) { 367 // Cache the monitor for unlock before trashing box. On failure to acquire 368 // the lock, the slow path will reset the entry accordingly (see CacheSetter). 369 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 370 } 371 372 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 373 xorptr(rax_reg, rax_reg); 374 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset())); 375 lock(); cmpxchgptr(box, owner_address); 376 jccb(Assembler::equal, monitor_locked); 377 378 // Check if recursive. 379 cmpptr(box, rax_reg); 380 jccb(Assembler::notEqual, slow_path); 381 382 // Recursive. 383 increment(recursions_address); 384 385 bind(monitor_locked); 386 } 387 388 bind(locked); 389 // Set ZF = 1 390 xorl(rax_reg, rax_reg); 391 392 #ifdef ASSERT 393 // Check that locked label is reached with ZF set. 394 Label zf_correct; 395 Label zf_bad_zero; 396 jcc(Assembler::zero, zf_correct); 397 jmp(zf_bad_zero); 398 #endif 399 400 bind(slow_path); 401 #ifdef ASSERT 402 // Check that slow_path label is reached with ZF not set. 403 jcc(Assembler::notZero, zf_correct); 404 stop("Fast Lock ZF != 0"); 405 bind(zf_bad_zero); 406 stop("Fast Lock ZF != 1"); 407 bind(zf_correct); 408 #endif 409 // C2 uses the value of ZF to determine the continuation. 410 } 411 412 // obj: object to lock 413 // rax: tmp -- KILLED 414 // t : tmp - cannot be obj nor rax -- KILLED 415 // 416 // Some commentary on balanced locking: 417 // 418 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 419 // Methods that don't have provably balanced locking are forced to run in the 420 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 421 // The interpreter provides two properties: 422 // I1: At return-time the interpreter automatically and quietly unlocks any 423 // objects acquired in the current activation (frame). Recall that the 424 // interpreter maintains an on-stack list of locks currently held by 425 // a frame. 426 // I2: If a method attempts to unlock an object that is not held by the 427 // frame the interpreter throws IMSX. 428 // 429 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 430 // B() doesn't have provably balanced locking so it runs in the interpreter. 431 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 432 // is still locked by A(). 433 // 434 // The only other source of unbalanced locking would be JNI. The "Java Native Interface 435 // Specification" states that an object locked by JNI's MonitorEnter should not be 436 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't 437 // specify what will occur if a program engages in such mixed-mode locking, however. 
438 // Arguably given that the spec legislates the JNI case as undefined our implementation 439 // could reasonably *avoid* checking owner in fast_unlock(). 440 // In the interest of performance we elide m->Owner==Self check in unlock. 441 // A perfectly viable alternative is to elide the owner check except when 442 // Xcheck:jni is enabled. 443 444 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 445 assert(reg_rax == rax, "Used for CAS"); 446 assert_different_registers(obj, reg_rax, t); 447 448 // Handle inflated monitor. 449 Label inflated, inflated_check_lock_stack; 450 // Finish fast unlock successfully. MUST jump with ZF == 1 451 Label unlocked, slow_path; 452 453 const Register mark = t; 454 const Register monitor = t; 455 const Register top = UseObjectMonitorTable ? t : reg_rax; 456 const Register box = reg_rax; 457 458 Label dummy; 459 C2FastUnlockLightweightStub* stub = nullptr; 460 461 if (!Compile::current()->output()->in_scratch_emit_size()) { 462 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 463 Compile::current()->output()->add_stub(stub); 464 } 465 466 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 467 468 { // Lightweight Unlock 469 470 // Load top. 471 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 472 473 if (!UseObjectMonitorTable) { 474 // Prefetch mark. 475 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 476 } 477 478 // Check if obj is top of lock-stack. 479 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 480 // Top of lock stack was not obj. Must be monitor. 481 jcc(Assembler::notEqual, inflated_check_lock_stack); 482 483 // Pop lock-stack. 484 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 485 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 486 487 // Check if recursive. 488 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 489 jcc(Assembler::equal, unlocked); 490 491 // We elide the monitor check, let the CAS fail instead. 492 493 if (UseObjectMonitorTable) { 494 // Load mark. 495 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 496 } 497 498 // Try to unlock. Transition lock bits 0b00 => 0b01 499 movptr(reg_rax, mark); 500 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 501 orptr(mark, markWord::unlocked_value); 502 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 503 jcc(Assembler::notEqual, push_and_slow_path); 504 jmp(unlocked); 505 } 506 507 508 { // Handle inflated monitor. 509 bind(inflated_check_lock_stack); 510 #ifdef ASSERT 511 Label check_done; 512 subl(top, oopSize); 513 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 514 jcc(Assembler::below, check_done); 515 cmpptr(obj, Address(thread, top)); 516 jccb(Assembler::notEqual, inflated_check_lock_stack); 517 stop("Fast Unlock lock on stack"); 518 bind(check_done); 519 if (UseObjectMonitorTable) { 520 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 521 } 522 testptr(mark, markWord::monitor_value); 523 jccb(Assembler::notZero, inflated); 524 stop("Fast Unlock not monitor"); 525 #endif 526 527 bind(inflated); 528 529 if (!UseObjectMonitorTable) { 530 assert(mark == monitor, "should be the same here"); 531 } else { 532 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 
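      // The cache slot in the BasicLock was populated either by fast_lock_lightweight()
      // above or by the runtime slow path (see the CacheSetter note there); a value
      // below alignof(ObjectMonitor*) means no usable monitor was cached, so the check
      // below takes the slow path with ZF clear.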
533 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 534 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 535 cmpptr(monitor, alignof(ObjectMonitor*)); 536 jcc(Assembler::below, slow_path); 537 } 538 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 539 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 540 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 541 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag}; 542 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 543 544 Label recursive; 545 546 // Check if recursive. 547 cmpptr(recursions_address, 0); 548 jccb(Assembler::notZero, recursive); 549 550 // Set owner to null. 551 // Release to satisfy the JMM 552 movptr(owner_address, NULL_WORD); 553 // We need a full fence after clearing owner to avoid stranding. 554 // StoreLoad achieves this. 555 membar(StoreLoad); 556 557 // Check if the entry_list is empty. 558 cmpptr(entry_list_address, NULL_WORD); 559 jccb(Assembler::zero, unlocked); // If so we are done. 560 561 // Check if there is a successor. 562 cmpptr(succ_address, NULL_WORD); 563 jccb(Assembler::notZero, unlocked); // If so we are done. 564 565 // Save the monitor pointer in the current thread, so we can try to 566 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 567 if (!UseObjectMonitorTable) { 568 andptr(monitor, ~(int32_t)markWord::monitor_value); 569 } 570 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 571 572 orl(t, 1); // Fast Unlock ZF = 0 573 jmpb(slow_path); 574 575 // Recursive unlock. 576 bind(recursive); 577 decrement(recursions_address); 578 } 579 580 bind(unlocked); 581 xorl(t, t); // Fast Unlock ZF = 1 582 583 #ifdef ASSERT 584 // Check that unlocked label is reached with ZF set. 585 Label zf_correct; 586 Label zf_bad_zero; 587 jcc(Assembler::zero, zf_correct); 588 jmp(zf_bad_zero); 589 #endif 590 591 bind(slow_path); 592 if (stub != nullptr) { 593 bind(stub->slow_path_continuation()); 594 } 595 #ifdef ASSERT 596 // Check that stub->continuation() label is reached with ZF not set. 597 jcc(Assembler::notZero, zf_correct); 598 stop("Fast Unlock ZF != 0"); 599 bind(zf_bad_zero); 600 stop("Fast Unlock ZF != 1"); 601 bind(zf_correct); 602 #endif 603 // C2 uses the value of ZF to determine the continuation. 604 } 605 606 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 607 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 608 } 609 610 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) { 611 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 612 masm->movptr(dst, rsp); 613 if (framesize > 2 * wordSize) { 614 masm->addptr(dst, framesize - 2 * wordSize); 615 } 616 } 617 618 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 619 if (PreserveFramePointer) { 620 // frame pointer is valid 621 #ifdef ASSERT 622 // Verify frame pointer value in rbp. 
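    // Sketch of the check: the prolog in verified_entry() leaves the saved rbp two
    // words below the top of the frame (return address + saved rbp), so the helper's
    // rsp + framesize - 2*wordSize must equal the rbp value that PreserveFramePointer
    // kept live; recompute it into rtmp and compare.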
623 reconstruct_frame_pointer_helper(this, rtmp); 624 Label L_success; 625 cmpq(rbp, rtmp); 626 jccb(Assembler::equal, L_success); 627 STOP("frame pointer mismatch"); 628 bind(L_success); 629 #endif // ASSERT 630 } else { 631 reconstruct_frame_pointer_helper(this, rbp); 632 } 633 } 634 635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) { 636 jint lo = t->_lo; 637 jint hi = t->_hi; 638 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi); 639 if (t == TypeInt::INT) { 640 return; 641 } 642 643 BLOCK_COMMENT("CastII {"); 644 Label fail; 645 Label succeed; 646 if (hi == max_jint) { 647 cmpl(val, lo); 648 jccb(Assembler::greaterEqual, succeed); 649 } else { 650 if (lo != min_jint) { 651 cmpl(val, lo); 652 jccb(Assembler::less, fail); 653 } 654 cmpl(val, hi); 655 jccb(Assembler::lessEqual, succeed); 656 } 657 658 bind(fail); 659 movl(c_rarg0, idx); 660 movl(c_rarg1, val); 661 movl(c_rarg2, lo); 662 movl(c_rarg3, hi); 663 reconstruct_frame_pointer(rscratch1); 664 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range))); 665 hlt(); 666 bind(succeed); 667 BLOCK_COMMENT("} // CastII"); 668 } 669 670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 671 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 672 } 673 674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) { 675 jlong lo = t->_lo; 676 jlong hi = t->_hi; 677 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi); 678 if (t == TypeLong::LONG) { 679 return; 680 } 681 682 BLOCK_COMMENT("CastLL {"); 683 Label fail; 684 Label succeed; 685 686 auto cmp_val = [&](jlong bound) { 687 if (is_simm32(bound)) { 688 cmpq(val, checked_cast<int>(bound)); 689 } else { 690 mov64(tmp, bound); 691 cmpq(val, tmp); 692 } 693 }; 694 695 if (hi == max_jlong) { 696 cmp_val(lo); 697 jccb(Assembler::greaterEqual, succeed); 698 } else { 699 if (lo != min_jlong) { 700 cmp_val(lo); 701 jccb(Assembler::less, fail); 702 } 703 cmp_val(hi); 704 jccb(Assembler::lessEqual, succeed); 705 } 706 707 bind(fail); 708 movl(c_rarg0, idx); 709 movq(c_rarg1, val); 710 mov64(c_rarg2, lo); 711 mov64(c_rarg3, hi); 712 reconstruct_frame_pointer(rscratch1); 713 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range))); 714 hlt(); 715 bind(succeed); 716 BLOCK_COMMENT("} // CastLL"); 717 } 718 719 //------------------------------------------------------------------------------------------- 720 // Generic instructions support for use in .ad files C2 code generation 721 722 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 723 if (dst != src) { 724 movdqu(dst, src); 725 } 726 if (opcode == Op_AbsVD) { 727 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 728 } else { 729 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 730 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 731 } 732 } 733 734 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 735 if (opcode == Op_AbsVD) { 736 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 737 } else { 738 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 739 vxorpd(dst, src, 
ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 740 } 741 } 742 743 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 744 if (dst != src) { 745 movdqu(dst, src); 746 } 747 if (opcode == Op_AbsVF) { 748 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 749 } else { 750 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 751 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 752 } 753 } 754 755 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 756 if (opcode == Op_AbsVF) { 757 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 758 } else { 759 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 760 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 761 } 762 } 763 764 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 765 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 766 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 767 768 if (opcode == Op_MinV) { 769 if (elem_bt == T_BYTE) { 770 pminsb(dst, src); 771 } else if (elem_bt == T_SHORT) { 772 pminsw(dst, src); 773 } else if (elem_bt == T_INT) { 774 pminsd(dst, src); 775 } else { 776 assert(elem_bt == T_LONG, "required"); 777 assert(tmp == xmm0, "required"); 778 assert_different_registers(dst, src, tmp); 779 movdqu(xmm0, dst); 780 pcmpgtq(xmm0, src); 781 blendvpd(dst, src); // xmm0 as mask 782 } 783 } else { // opcode == Op_MaxV 784 if (elem_bt == T_BYTE) { 785 pmaxsb(dst, src); 786 } else if (elem_bt == T_SHORT) { 787 pmaxsw(dst, src); 788 } else if (elem_bt == T_INT) { 789 pmaxsd(dst, src); 790 } else { 791 assert(elem_bt == T_LONG, "required"); 792 assert(tmp == xmm0, "required"); 793 assert_different_registers(dst, src, tmp); 794 movdqu(xmm0, src); 795 pcmpgtq(xmm0, dst); 796 blendvpd(dst, src); // xmm0 as mask 797 } 798 } 799 } 800 801 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 802 XMMRegister src1, Address src2, int vlen_enc) { 803 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 804 if (opcode == Op_UMinV) { 805 switch(elem_bt) { 806 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 807 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 808 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 809 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 810 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 811 } 812 } else { 813 assert(opcode == Op_UMaxV, "required"); 814 switch(elem_bt) { 815 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 816 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 817 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 818 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 819 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 820 } 821 } 822 } 823 824 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 825 // For optimality, leverage a full vector width of 512 bits 826 // for operations over smaller vector sizes on AVX512 targets. 
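  // Note on the else branch below: adding 1 << 63 to both operands flips their sign
  // bits (the carry out of bit 63 is discarded), which maps unsigned order onto signed
  // order so vpcmpgtq can produce the unsigned mask. For example, 0x0000000000000000
  // (unsigned min) becomes 0x8000000000000000 (signed min) and 0xFFFFFFFFFFFFFFFF
  // (unsigned max) becomes 0x7FFFFFFFFFFFFFFF (signed max).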
827 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) { 828 if (opcode == Op_UMaxV) { 829 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 830 } else { 831 assert(opcode == Op_UMinV, "required"); 832 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit); 833 } 834 } else { 835 // T1 = -1 836 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc); 837 // T1 = -1 << 63 838 vpsllq(xtmp1, xtmp1, 63, vlen_enc); 839 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2 840 vpaddq(xtmp2, xtmp1, src2, vlen_enc); 841 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1 842 vpaddq(xtmp1, xtmp1, src1, vlen_enc); 843 // Mask = T2 > T1 844 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc); 845 if (opcode == Op_UMaxV) { 846 // Res = Mask ? Src2 : Src1 847 vpblendvb(dst, src1, src2, xtmp1, vlen_enc); 848 } else { 849 // Res = Mask ? Src1 : Src2 850 vpblendvb(dst, src2, src1, xtmp1, vlen_enc); 851 } 852 } 853 } 854 855 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst, 856 XMMRegister src1, XMMRegister src2, int vlen_enc) { 857 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity"); 858 if (opcode == Op_UMinV) { 859 switch(elem_bt) { 860 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break; 861 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break; 862 case T_INT: vpminud(dst, src1, src2, vlen_enc); break; 863 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break; 864 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 865 } 866 } else { 867 assert(opcode == Op_UMaxV, "required"); 868 switch(elem_bt) { 869 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break; 870 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break; 871 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break; 872 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break; 873 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 874 } 875 } 876 } 877 878 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 879 XMMRegister dst, XMMRegister src1, XMMRegister src2, 880 int vlen_enc) { 881 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 882 883 if (opcode == Op_MinV) { 884 if (elem_bt == T_BYTE) { 885 vpminsb(dst, src1, src2, vlen_enc); 886 } else if (elem_bt == T_SHORT) { 887 vpminsw(dst, src1, src2, vlen_enc); 888 } else if (elem_bt == T_INT) { 889 vpminsd(dst, src1, src2, vlen_enc); 890 } else { 891 assert(elem_bt == T_LONG, "required"); 892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 893 vpminsq(dst, src1, src2, vlen_enc); 894 } else { 895 assert_different_registers(dst, src1, src2); 896 vpcmpgtq(dst, src1, src2, vlen_enc); 897 vblendvpd(dst, src1, src2, dst, vlen_enc); 898 } 899 } 900 } else { // opcode == Op_MaxV 901 if (elem_bt == T_BYTE) { 902 vpmaxsb(dst, src1, src2, vlen_enc); 903 } else if (elem_bt == T_SHORT) { 904 vpmaxsw(dst, src1, src2, vlen_enc); 905 } else if (elem_bt == T_INT) { 906 vpmaxsd(dst, src1, src2, vlen_enc); 907 } else { 908 assert(elem_bt == T_LONG, "required"); 909 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 910 vpmaxsq(dst, src1, src2, vlen_enc); 911 } else { 912 assert_different_registers(dst, src1, src2); 913 vpcmpgtq(dst, src1, src2, vlen_enc); 914 vblendvpd(dst, src2, src1, dst, vlen_enc); 915 } 916 } 917 } 918 } 919 920 // Float/Double min max 921 922 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 923 XMMRegister dst, XMMRegister a, XMMRegister b, 924 XMMRegister tmp, 
                                   XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, no-op otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst,
XMMRegister a, XMMRegister b, 1012 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1013 int vlen_enc) { 1014 assert(UseAVX > 2, "required"); 1015 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1016 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1017 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1018 assert_different_registers(dst, a, atmp, btmp); 1019 assert_different_registers(dst, b, atmp, btmp); 1020 1021 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1022 bool is_double_word = is_double_word_type(elem_bt); 1023 bool merge = true; 1024 1025 if (!is_double_word && is_min) { 1026 evpmovd2m(ktmp, a, vlen_enc); 1027 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1028 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1029 vminps(dst, atmp, btmp, vlen_enc); 1030 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1031 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1032 } else if (!is_double_word && !is_min) { 1033 evpmovd2m(ktmp, b, vlen_enc); 1034 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1035 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1036 vmaxps(dst, atmp, btmp, vlen_enc); 1037 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1038 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1039 } else if (is_double_word && is_min) { 1040 evpmovq2m(ktmp, a, vlen_enc); 1041 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1042 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1043 vminpd(dst, atmp, btmp, vlen_enc); 1044 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1045 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1046 } else { 1047 assert(is_double_word && !is_min, "sanity"); 1048 evpmovq2m(ktmp, b, vlen_enc); 1049 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1050 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1051 vmaxpd(dst, atmp, btmp, vlen_enc); 1052 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1053 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1054 } 1055 } 1056 1057 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask, 1058 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1059 assert(opc == Op_MinV || opc == Op_MinReductionV || 1060 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity"); 1061 1062 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? 
AVX10_MINMAX_MIN_COMPARE_SIGN 1063 : AVX10_MINMAX_MAX_COMPARE_SIGN; 1064 if (elem_bt == T_FLOAT) { 1065 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc); 1066 } else { 1067 assert(elem_bt == T_DOUBLE, ""); 1068 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc); 1069 } 1070 } 1071 1072 // Float/Double signum 1073 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1074 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1075 1076 Label DONE_LABEL; 1077 1078 if (opcode == Op_SignumF) { 1079 ucomiss(dst, zero); 1080 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1081 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1082 movflt(dst, one); 1083 jcc(Assembler::above, DONE_LABEL); 1084 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1085 } else if (opcode == Op_SignumD) { 1086 ucomisd(dst, zero); 1087 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1088 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1089 movdbl(dst, one); 1090 jcc(Assembler::above, DONE_LABEL); 1091 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1092 } 1093 1094 bind(DONE_LABEL); 1095 } 1096 1097 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1098 if (sign) { 1099 pmovsxbw(dst, src); 1100 } else { 1101 pmovzxbw(dst, src); 1102 } 1103 } 1104 1105 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1106 if (sign) { 1107 vpmovsxbw(dst, src, vector_len); 1108 } else { 1109 vpmovzxbw(dst, src, vector_len); 1110 } 1111 } 1112 1113 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1114 if (sign) { 1115 vpmovsxbd(dst, src, vector_len); 1116 } else { 1117 vpmovzxbd(dst, src, vector_len); 1118 } 1119 } 1120 1121 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1122 if (sign) { 1123 vpmovsxwd(dst, src, vector_len); 1124 } else { 1125 vpmovzxwd(dst, src, vector_len); 1126 } 1127 } 1128 1129 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1130 int shift, int vector_len) { 1131 if (opcode == Op_RotateLeftV) { 1132 if (etype == T_INT) { 1133 evprold(dst, src, shift, vector_len); 1134 } else { 1135 assert(etype == T_LONG, "expected type T_LONG"); 1136 evprolq(dst, src, shift, vector_len); 1137 } 1138 } else { 1139 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1140 if (etype == T_INT) { 1141 evprord(dst, src, shift, vector_len); 1142 } else { 1143 assert(etype == T_LONG, "expected type T_LONG"); 1144 evprorq(dst, src, shift, vector_len); 1145 } 1146 } 1147 } 1148 1149 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1150 XMMRegister shift, int vector_len) { 1151 if (opcode == Op_RotateLeftV) { 1152 if (etype == T_INT) { 1153 evprolvd(dst, src, shift, vector_len); 1154 } else { 1155 assert(etype == T_LONG, "expected type T_LONG"); 1156 evprolvq(dst, src, shift, vector_len); 1157 } 1158 } else { 1159 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1160 if (etype == T_INT) { 1161 evprorvd(dst, src, shift, vector_len); 1162 } else { 1163 assert(etype == T_LONG, "expected type 
T_LONG"); 1164 evprorvq(dst, src, shift, vector_len); 1165 } 1166 } 1167 } 1168 1169 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1170 if (opcode == Op_RShiftVI) { 1171 psrad(dst, shift); 1172 } else if (opcode == Op_LShiftVI) { 1173 pslld(dst, shift); 1174 } else { 1175 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1176 psrld(dst, shift); 1177 } 1178 } 1179 1180 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1181 switch (opcode) { 1182 case Op_RShiftVI: psrad(dst, shift); break; 1183 case Op_LShiftVI: pslld(dst, shift); break; 1184 case Op_URShiftVI: psrld(dst, shift); break; 1185 1186 default: assert(false, "%s", NodeClassNames[opcode]); 1187 } 1188 } 1189 1190 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1191 if (opcode == Op_RShiftVI) { 1192 vpsrad(dst, nds, shift, vector_len); 1193 } else if (opcode == Op_LShiftVI) { 1194 vpslld(dst, nds, shift, vector_len); 1195 } else { 1196 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1197 vpsrld(dst, nds, shift, vector_len); 1198 } 1199 } 1200 1201 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1202 switch (opcode) { 1203 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1204 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1205 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1206 1207 default: assert(false, "%s", NodeClassNames[opcode]); 1208 } 1209 } 1210 1211 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1212 switch (opcode) { 1213 case Op_RShiftVB: // fall-through 1214 case Op_RShiftVS: psraw(dst, shift); break; 1215 1216 case Op_LShiftVB: // fall-through 1217 case Op_LShiftVS: psllw(dst, shift); break; 1218 1219 case Op_URShiftVS: // fall-through 1220 case Op_URShiftVB: psrlw(dst, shift); break; 1221 1222 default: assert(false, "%s", NodeClassNames[opcode]); 1223 } 1224 } 1225 1226 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1227 switch (opcode) { 1228 case Op_RShiftVB: // fall-through 1229 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1230 1231 case Op_LShiftVB: // fall-through 1232 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1233 1234 case Op_URShiftVS: // fall-through 1235 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1236 1237 default: assert(false, "%s", NodeClassNames[opcode]); 1238 } 1239 } 1240 1241 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1242 switch (opcode) { 1243 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1244 case Op_LShiftVL: psllq(dst, shift); break; 1245 case Op_URShiftVL: psrlq(dst, shift); break; 1246 1247 default: assert(false, "%s", NodeClassNames[opcode]); 1248 } 1249 } 1250 1251 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1252 if (opcode == Op_RShiftVL) { 1253 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1254 } else if (opcode == Op_LShiftVL) { 1255 psllq(dst, shift); 1256 } else { 1257 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1258 psrlq(dst, shift); 1259 } 1260 } 1261 1262 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1263 switch (opcode) { 1264 case 
Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1265 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1266 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1267 1268 default: assert(false, "%s", NodeClassNames[opcode]); 1269 } 1270 } 1271 1272 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1273 if (opcode == Op_RShiftVL) { 1274 evpsraq(dst, nds, shift, vector_len); 1275 } else if (opcode == Op_LShiftVL) { 1276 vpsllq(dst, nds, shift, vector_len); 1277 } else { 1278 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1279 vpsrlq(dst, nds, shift, vector_len); 1280 } 1281 } 1282 1283 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1284 switch (opcode) { 1285 case Op_RShiftVB: // fall-through 1286 case Op_RShiftVS: // fall-through 1287 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1288 1289 case Op_LShiftVB: // fall-through 1290 case Op_LShiftVS: // fall-through 1291 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1292 1293 case Op_URShiftVB: // fall-through 1294 case Op_URShiftVS: // fall-through 1295 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1296 1297 default: assert(false, "%s", NodeClassNames[opcode]); 1298 } 1299 } 1300 1301 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1302 switch (opcode) { 1303 case Op_RShiftVB: // fall-through 1304 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1305 1306 case Op_LShiftVB: // fall-through 1307 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1308 1309 case Op_URShiftVB: // fall-through 1310 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1311 1312 default: assert(false, "%s", NodeClassNames[opcode]); 1313 } 1314 } 1315 1316 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1317 assert(UseAVX >= 2, "required"); 1318 switch (opcode) { 1319 case Op_RShiftVL: { 1320 if (UseAVX > 2) { 1321 assert(tmp == xnoreg, "not used"); 1322 if (!VM_Version::supports_avx512vl()) { 1323 vlen_enc = Assembler::AVX_512bit; 1324 } 1325 evpsravq(dst, src, shift, vlen_enc); 1326 } else { 1327 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1328 vpsrlvq(dst, src, shift, vlen_enc); 1329 vpsrlvq(tmp, tmp, shift, vlen_enc); 1330 vpxor(dst, dst, tmp, vlen_enc); 1331 vpsubq(dst, dst, tmp, vlen_enc); 1332 } 1333 break; 1334 } 1335 case Op_LShiftVL: { 1336 assert(tmp == xnoreg, "not used"); 1337 vpsllvq(dst, src, shift, vlen_enc); 1338 break; 1339 } 1340 case Op_URShiftVL: { 1341 assert(tmp == xnoreg, "not used"); 1342 vpsrlvq(dst, src, shift, vlen_enc); 1343 break; 1344 } 1345 default: assert(false, "%s", NodeClassNames[opcode]); 1346 } 1347 } 1348 1349 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1350 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1351 assert(opcode == Op_LShiftVB || 1352 opcode == Op_RShiftVB || 1353 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1354 bool sign = (opcode != Op_URShiftVB); 1355 assert(vector_len == 0, "required"); 1356 vextendbd(sign, dst, src, 1); 1357 vpmovzxbd(vtmp, shift, 1); 1358 varshiftd(opcode, dst, dst, vtmp, 1); 1359 vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1360 vextracti128_high(vtmp, dst); 1361 vpackusdw(dst, dst, vtmp, 0); 1362 } 1363 1364 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1365 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1366 assert(opcode == Op_LShiftVB || 1367 opcode == Op_RShiftVB || 1368 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1369 bool sign = (opcode != Op_URShiftVB); 1370 int ext_vector_len = vector_len + 1; 1371 vextendbw(sign, dst, src, ext_vector_len); 1372 vpmovzxbw(vtmp, shift, ext_vector_len); 1373 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1374 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1375 if (vector_len == 0) { 1376 vextracti128_high(vtmp, dst); 1377 vpackuswb(dst, dst, vtmp, vector_len); 1378 } else { 1379 vextracti64x4_high(vtmp, dst); 1380 vpackuswb(dst, dst, vtmp, vector_len); 1381 vpermq(dst, dst, 0xD8, vector_len); 1382 } 1383 } 1384 1385 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1386 switch(typ) { 1387 case T_BYTE: 1388 pinsrb(dst, val, idx); 1389 break; 1390 case T_SHORT: 1391 pinsrw(dst, val, idx); 1392 break; 1393 case T_INT: 1394 pinsrd(dst, val, idx); 1395 break; 1396 case T_LONG: 1397 pinsrq(dst, val, idx); 1398 break; 1399 default: 1400 assert(false,"Should not reach here."); 1401 break; 1402 } 1403 } 1404 1405 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1406 switch(typ) { 1407 case T_BYTE: 1408 vpinsrb(dst, src, val, idx); 1409 break; 1410 case T_SHORT: 1411 vpinsrw(dst, src, val, idx); 1412 break; 1413 case T_INT: 1414 vpinsrd(dst, src, val, idx); 1415 break; 1416 case T_LONG: 1417 vpinsrq(dst, src, val, idx); 1418 break; 1419 default: 1420 assert(false,"Should not reach here."); 1421 break; 1422 } 1423 } 1424 1425 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst, 1426 Register base, Register idx_base, 1427 Register mask, Register mask_idx, 1428 Register rtmp, int vlen_enc) { 1429 vpxor(dst, dst, dst, vlen_enc); 1430 if (elem_bt == T_SHORT) { 1431 for (int i = 0; i < 4; i++) { 1432 // dst[i] = mask[i] ? src[idx_base[i]] : 0 1433 Label skip_load; 1434 btq(mask, mask_idx); 1435 jccb(Assembler::carryClear, skip_load); 1436 movl(rtmp, Address(idx_base, i * 4)); 1437 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1438 bind(skip_load); 1439 incq(mask_idx); 1440 } 1441 } else { 1442 assert(elem_bt == T_BYTE, ""); 1443 for (int i = 0; i < 8; i++) { 1444 // dst[i] = mask[i] ? 
src[idx_base[i]] : 0 1445 Label skip_load; 1446 btq(mask, mask_idx); 1447 jccb(Assembler::carryClear, skip_load); 1448 movl(rtmp, Address(idx_base, i * 4)); 1449 pinsrb(dst, Address(base, rtmp), i); 1450 bind(skip_load); 1451 incq(mask_idx); 1452 } 1453 } 1454 } 1455 1456 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst, 1457 Register base, Register idx_base, 1458 Register rtmp, int vlen_enc) { 1459 vpxor(dst, dst, dst, vlen_enc); 1460 if (elem_bt == T_SHORT) { 1461 for (int i = 0; i < 4; i++) { 1462 // dst[i] = src[idx_base[i]] 1463 movl(rtmp, Address(idx_base, i * 4)); 1464 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1465 } 1466 } else { 1467 assert(elem_bt == T_BYTE, ""); 1468 for (int i = 0; i < 8; i++) { 1469 // dst[i] = src[idx_base[i]] 1470 movl(rtmp, Address(idx_base, i * 4)); 1471 pinsrb(dst, Address(base, rtmp), i); 1472 } 1473 } 1474 } 1475 1476 /* 1477 * Gather using hybrid algorithm, first partially unroll scalar loop 1478 * to accumulate values from gather indices into a quad-word(64bit) slice. 1479 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1480 * permutation to place the slice into appropriate vector lane 1481 * locations in destination vector. Following pseudo code describes the 1482 * algorithm in detail: 1483 * 1484 * DST_VEC = ZERO_VEC 1485 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1486 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1487 * FOREACH_ITER: 1488 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1489 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1490 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1491 * PERM_INDEX = PERM_INDEX - TWO_VEC 1492 * 1493 * With each iteration, doubleword permute indices (0,1) corresponding 1494 * to gathered quadword gets right shifted by two lane positions. 1495 * 1496 */ 1497 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1498 Register base, Register idx_base, 1499 Register mask, XMMRegister xtmp1, 1500 XMMRegister xtmp2, XMMRegister temp_dst, 1501 Register rtmp, Register mask_idx, 1502 Register length, int vector_len, int vlen_enc) { 1503 Label GATHER8_LOOP; 1504 assert(is_subword_type(elem_ty), ""); 1505 movl(length, vector_len); 1506 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1507 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1508 vallones(xtmp2, vlen_enc); 1509 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1510 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1511 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1512 1513 bind(GATHER8_LOOP); 1514 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1515 if (mask == noreg) { 1516 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc); 1517 } else { 1518 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc); 1519 } 1520 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1521 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1522 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1523 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1524 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1525 vpor(dst, dst, temp_dst, vlen_enc); 1526 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1527 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1528 jcc(Assembler::notEqual, GATHER8_LOOP); 1529 } 1530 1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1532 switch(typ) { 1533 case T_INT: 1534 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1535 break; 1536 case T_FLOAT: 1537 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1538 break; 1539 case T_LONG: 1540 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1541 break; 1542 case T_DOUBLE: 1543 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1544 break; 1545 default: 1546 assert(false,"Should not reach here."); 1547 break; 1548 } 1549 } 1550 1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1552 switch(typ) { 1553 case T_INT: 1554 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1555 break; 1556 case T_FLOAT: 1557 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1558 break; 1559 case T_LONG: 1560 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1561 break; 1562 case T_DOUBLE: 1563 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1564 break; 1565 default: 1566 assert(false,"Should not reach here."); 1567 break; 1568 } 1569 } 1570 1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1572 switch(typ) { 1573 case T_INT: 1574 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1575 break; 1576 case T_FLOAT: 1577 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1578 break; 1579 case T_LONG: 1580 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1581 break; 1582 case T_DOUBLE: 1583 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1584 break; 1585 default: 1586 assert(false,"Should not reach here."); 1587 break; 1588 } 1589 } 1590 1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1592 if (vlen_in_bytes <= 16) { 1593 pxor (dst, dst); 1594 psubb(dst, src); 1595 switch (elem_bt) { 1596 case T_BYTE: /* nothing to do */ break; 1597 case T_SHORT: pmovsxbw(dst, dst); break; 1598 case T_INT: pmovsxbd(dst, dst); break; 1599 case T_FLOAT: pmovsxbd(dst, dst); break; 1600 case T_LONG: pmovsxbq(dst, dst); break; 1601 case T_DOUBLE: pmovsxbq(dst, dst); break; 1602 1603 default: assert(false, "%s", type2name(elem_bt)); 1604 } 1605 } else { 1606 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1607 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1608 1609 vpxor (dst, dst, dst, vlen_enc); 1610 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1611 1612 switch (elem_bt) { 1613 case T_BYTE: /* nothing to do */ break; 1614 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1615 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1616 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1617 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1618 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1619 1620 default: assert(false, "%s", type2name(elem_bt)); 1621 } 1622 } 1623 } 1624 1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1626 if (novlbwdq) { 1627 vpmovsxbd(xtmp, src, vlen_enc); 1628 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1629 Assembler::eq, true, vlen_enc, noreg); 1630 } else { 1631 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1632 vpsubb(xtmp, xtmp, src, vlen_enc); 1633 evpmovb2m(dst, xtmp, vlen_enc); 1634 } 1635 } 1636 1637 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1638 if (is_integral_type(bt)) { 1639 switch (vlen_in_bytes) { 1640 case 4: movdl(dst, src); break; 1641 case 8: movq(dst, src); break; 1642 case 16: movdqu(dst, src); break; 1643 case 32: vmovdqu(dst, src); break; 1644 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1645 default: ShouldNotReachHere(); 1646 } 1647 } else { 1648 switch (vlen_in_bytes) { 1649 case 4: movflt(dst, src); break; 1650 case 8: movdbl(dst, src); break; 1651 case 16: movups(dst, src); break; 1652 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1653 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1654 default: ShouldNotReachHere(); 1655 } 1656 } 1657 } 1658 1659 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1660 assert(rscratch != noreg || always_reachable(src), "missing"); 1661 1662 if (reachable(src)) { 1663 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1664 } else { 1665 lea(rscratch, src); 1666 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1667 } 1668 } 1669 1670 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1671 int vlen_enc = vector_length_encoding(vlen); 1672 if (VM_Version::supports_avx()) { 1673 if (bt == T_LONG) { 1674 if (VM_Version::supports_avx2()) { 1675 vpbroadcastq(dst, src, vlen_enc); 1676 } else { 1677 vmovddup(dst, src, vlen_enc); 1678 } 1679 } else if (bt == T_DOUBLE) { 1680 if (vlen_enc != Assembler::AVX_128bit) { 1681 vbroadcastsd(dst, src, vlen_enc, noreg); 1682 } else { 1683 vmovddup(dst, src, vlen_enc); 1684 } 1685 } else { 1686 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1687 vpbroadcastd(dst, src, vlen_enc); 1688 } else { 1689 vbroadcastss(dst, src, vlen_enc); 1690 } 1691 } 1692 } else if (VM_Version::supports_sse3()) { 1693 movddup(dst, src); 1694 } else { 1695 load_vector(bt, dst, src, vlen); 1696 } 1697 } 1698 1699 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1700 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1701 int offset = exact_log2(type2aelembytes(bt)) << 6; 1702 if (is_floating_point_type(bt)) { 1703 offset += 128; 1704 } 1705 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1706 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1707 } 1708 1709 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
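// The integer reductions below fold the vector in halves (log2(vlen) steps)
// and finally combine the surviving lane with the incoming scalar (src1).
// The ordered reduceF/reduceD paths instead accumulate lane by lane to keep
// strict left-to-right rounding, while the "unordered" FP variants are free
// to fold in halves as well. A minimal scalar sketch of the folding strategy,
// for illustration only ('op', 'lanes' and 'acc' are placeholder names, not
// part of this file):
//
//   int reduce(int acc, int* lanes, int vlen) {      // vlen is a power of two
//     for (int half = vlen / 2; half >= 1; half /= 2) {
//       for (int i = 0; i < half; i++) {
//         lanes[i] = op(lanes[i], lanes[i + half]);  // fold upper half into lower
//       }
//     }
//     return op(acc, lanes[0]);                      // combine with scalar input
//   }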
1710 1711 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1712 int vector_len = Assembler::AVX_128bit; 1713 1714 switch (opcode) { 1715 case Op_AndReductionV: pand(dst, src); break; 1716 case Op_OrReductionV: por (dst, src); break; 1717 case Op_XorReductionV: pxor(dst, src); break; 1718 case Op_MinReductionV: 1719 switch (typ) { 1720 case T_BYTE: pminsb(dst, src); break; 1721 case T_SHORT: pminsw(dst, src); break; 1722 case T_INT: pminsd(dst, src); break; 1723 case T_LONG: assert(UseAVX > 2, "required"); 1724 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1725 default: assert(false, "wrong type"); 1726 } 1727 break; 1728 case Op_MaxReductionV: 1729 switch (typ) { 1730 case T_BYTE: pmaxsb(dst, src); break; 1731 case T_SHORT: pmaxsw(dst, src); break; 1732 case T_INT: pmaxsd(dst, src); break; 1733 case T_LONG: assert(UseAVX > 2, "required"); 1734 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1735 default: assert(false, "wrong type"); 1736 } 1737 break; 1738 case Op_AddReductionVF: addss(dst, src); break; 1739 case Op_AddReductionVD: addsd(dst, src); break; 1740 case Op_AddReductionVI: 1741 switch (typ) { 1742 case T_BYTE: paddb(dst, src); break; 1743 case T_SHORT: paddw(dst, src); break; 1744 case T_INT: paddd(dst, src); break; 1745 default: assert(false, "wrong type"); 1746 } 1747 break; 1748 case Op_AddReductionVL: paddq(dst, src); break; 1749 case Op_MulReductionVF: mulss(dst, src); break; 1750 case Op_MulReductionVD: mulsd(dst, src); break; 1751 case Op_MulReductionVI: 1752 switch (typ) { 1753 case T_SHORT: pmullw(dst, src); break; 1754 case T_INT: pmulld(dst, src); break; 1755 default: assert(false, "wrong type"); 1756 } 1757 break; 1758 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1759 evpmullq(dst, dst, src, vector_len); break; 1760 default: assert(false, "wrong opcode"); 1761 } 1762 } 1763 1764 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1765 switch (opcode) { 1766 case Op_AddReductionVF: addps(dst, src); break; 1767 case Op_AddReductionVD: addpd(dst, src); break; 1768 case Op_MulReductionVF: mulps(dst, src); break; 1769 case Op_MulReductionVD: mulpd(dst, src); break; 1770 default: assert(false, "%s", NodeClassNames[opcode]); 1771 } 1772 } 1773 1774 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1775 int vector_len = Assembler::AVX_256bit; 1776 1777 switch (opcode) { 1778 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1779 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1780 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1781 case Op_MinReductionV: 1782 switch (typ) { 1783 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1784 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1785 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1786 case T_LONG: assert(UseAVX > 2, "required"); 1787 vpminsq(dst, src1, src2, vector_len); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MaxReductionV: 1792 switch (typ) { 1793 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1794 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1795 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1796 case T_LONG: assert(UseAVX > 2, "required"); 1797 vpmaxsq(dst, src1, src2, vector_len); break; 1798 default: assert(false, "wrong type"); 1799 } 
1800 break; 1801 case Op_AddReductionVI: 1802 switch (typ) { 1803 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1804 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1805 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1806 default: assert(false, "wrong type"); 1807 } 1808 break; 1809 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1810 case Op_MulReductionVI: 1811 switch (typ) { 1812 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1813 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1814 default: assert(false, "wrong type"); 1815 } 1816 break; 1817 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1818 default: assert(false, "wrong opcode"); 1819 } 1820 } 1821 1822 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1823 int vector_len = Assembler::AVX_256bit; 1824 1825 switch (opcode) { 1826 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1827 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1828 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1829 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1830 default: assert(false, "%s", NodeClassNames[opcode]); 1831 } 1832 } 1833 1834 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1835 XMMRegister dst, XMMRegister src, 1836 XMMRegister vtmp1, XMMRegister vtmp2) { 1837 switch (opcode) { 1838 case Op_AddReductionVF: 1839 case Op_MulReductionVF: 1840 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1841 break; 1842 1843 case Op_AddReductionVD: 1844 case Op_MulReductionVD: 1845 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1846 break; 1847 1848 default: assert(false, "wrong opcode"); 1849 } 1850 } 1851 1852 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1853 XMMRegister dst, XMMRegister src, 1854 XMMRegister vtmp1, XMMRegister vtmp2) { 1855 switch (opcode) { 1856 case Op_AddReductionVF: 1857 case Op_MulReductionVF: 1858 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1859 break; 1860 1861 case Op_AddReductionVD: 1862 case Op_MulReductionVD: 1863 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1864 break; 1865 1866 default: assert(false, "%s", NodeClassNames[opcode]); 1867 } 1868 } 1869 1870 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1871 Register dst, Register src1, XMMRegister src2, 1872 XMMRegister vtmp1, XMMRegister vtmp2) { 1873 switch (vlen) { 1874 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 1879 default: assert(false, "wrong vector length"); 1880 } 1881 } 1882 1883 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1884 Register dst, Register src1, XMMRegister src2, 1885 XMMRegister vtmp1, XMMRegister vtmp2) { 1886 switch (vlen) { 1887 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1888 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1891 1892 default: assert(false, "wrong vector length"); 1893 } 1894 } 1895 1896 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1897 Register dst, Register src1, XMMRegister src2, 
1898 XMMRegister vtmp1, XMMRegister vtmp2) { 1899 switch (vlen) { 1900 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1901 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1902 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1903 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1904 1905 default: assert(false, "wrong vector length"); 1906 } 1907 } 1908 1909 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1910 Register dst, Register src1, XMMRegister src2, 1911 XMMRegister vtmp1, XMMRegister vtmp2) { 1912 switch (vlen) { 1913 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1914 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1915 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1916 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1917 1918 default: assert(false, "wrong vector length"); 1919 } 1920 } 1921 1922 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1923 Register dst, Register src1, XMMRegister src2, 1924 XMMRegister vtmp1, XMMRegister vtmp2) { 1925 switch (vlen) { 1926 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1927 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1928 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1929 1930 default: assert(false, "wrong vector length"); 1931 } 1932 } 1933 1934 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1935 switch (vlen) { 1936 case 2: 1937 assert(vtmp2 == xnoreg, ""); 1938 reduce2F(opcode, dst, src, vtmp1); 1939 break; 1940 case 4: 1941 assert(vtmp2 == xnoreg, ""); 1942 reduce4F(opcode, dst, src, vtmp1); 1943 break; 1944 case 8: 1945 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1946 break; 1947 case 16: 1948 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1949 break; 1950 default: assert(false, "wrong vector length"); 1951 } 1952 } 1953 1954 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1955 switch (vlen) { 1956 case 2: 1957 assert(vtmp2 == xnoreg, ""); 1958 reduce2D(opcode, dst, src, vtmp1); 1959 break; 1960 case 4: 1961 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1962 break; 1963 case 8: 1964 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1965 break; 1966 default: assert(false, "wrong vector length"); 1967 } 1968 } 1969 1970 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1971 switch (vlen) { 1972 case 2: 1973 assert(vtmp1 == xnoreg, ""); 1974 assert(vtmp2 == xnoreg, ""); 1975 unorderedReduce2F(opcode, dst, src); 1976 break; 1977 case 4: 1978 assert(vtmp2 == xnoreg, ""); 1979 unorderedReduce4F(opcode, dst, src, vtmp1); 1980 break; 1981 case 8: 1982 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 1983 break; 1984 case 16: 1985 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 1986 break; 1987 default: assert(false, "wrong vector length"); 1988 } 1989 } 1990 1991 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1992 switch (vlen) { 1993 case 2: 1994 assert(vtmp1 == xnoreg, ""); 1995 assert(vtmp2 == xnoreg, ""); 1996 unorderedReduce2D(opcode, dst, src); 1997 break; 1998 case 4: 1999 assert(vtmp2 == xnoreg, ""); 2000 unorderedReduce4D(opcode, dst, src, vtmp1); 2001 break; 2002 case 8: 2003 unorderedReduce8D(opcode, dst, 
src, vtmp1, vtmp2); 2004 break; 2005 default: assert(false, "wrong vector length"); 2006 } 2007 } 2008 2009 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2010 if (opcode == Op_AddReductionVI) { 2011 if (vtmp1 != src2) { 2012 movdqu(vtmp1, src2); 2013 } 2014 phaddd(vtmp1, vtmp1); 2015 } else { 2016 pshufd(vtmp1, src2, 0x1); 2017 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2018 } 2019 movdl(vtmp2, src1); 2020 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2021 movdl(dst, vtmp1); 2022 } 2023 2024 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2025 if (opcode == Op_AddReductionVI) { 2026 if (vtmp1 != src2) { 2027 movdqu(vtmp1, src2); 2028 } 2029 phaddd(vtmp1, src2); 2030 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2031 } else { 2032 pshufd(vtmp2, src2, 0xE); 2033 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2034 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2035 } 2036 } 2037 2038 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2039 if (opcode == Op_AddReductionVI) { 2040 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2041 vextracti128_high(vtmp2, vtmp1); 2042 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2043 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2044 } else { 2045 vextracti128_high(vtmp1, src2); 2046 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2047 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2048 } 2049 } 2050 2051 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2052 vextracti64x4_high(vtmp2, src2); 2053 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2054 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2055 } 2056 2057 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2058 pshufd(vtmp2, src2, 0x1); 2059 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2060 movdqu(vtmp1, vtmp2); 2061 psrldq(vtmp1, 2); 2062 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2063 movdqu(vtmp2, vtmp1); 2064 psrldq(vtmp2, 1); 2065 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2066 movdl(vtmp2, src1); 2067 pmovsxbd(vtmp1, vtmp1); 2068 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2069 pextrb(dst, vtmp1, 0x0); 2070 movsbl(dst, dst); 2071 } 2072 2073 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2074 pshufd(vtmp1, src2, 0xE); 2075 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2076 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2077 } 2078 2079 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2080 vextracti128_high(vtmp2, src2); 2081 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2082 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2083 } 2084 2085 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2086 vextracti64x4_high(vtmp1, src2); 2087 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2088 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2089 } 2090 2091 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister 
src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2092 pmovsxbw(vtmp2, src2); 2093 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2094 } 2095 2096 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2097 if (UseAVX > 1) { 2098 int vector_len = Assembler::AVX_256bit; 2099 vpmovsxbw(vtmp1, src2, vector_len); 2100 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2101 } else { 2102 pmovsxbw(vtmp2, src2); 2103 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2104 pshufd(vtmp2, src2, 0x1); 2105 pmovsxbw(vtmp2, src2); 2106 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2107 } 2108 } 2109 2110 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2112 int vector_len = Assembler::AVX_512bit; 2113 vpmovsxbw(vtmp1, src2, vector_len); 2114 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2115 } else { 2116 assert(UseAVX >= 2,"Should not reach here."); 2117 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2118 vextracti128_high(vtmp2, src2); 2119 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2120 } 2121 } 2122 2123 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2124 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2125 vextracti64x4_high(vtmp2, src2); 2126 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2127 } 2128 2129 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2130 if (opcode == Op_AddReductionVI) { 2131 if (vtmp1 != src2) { 2132 movdqu(vtmp1, src2); 2133 } 2134 phaddw(vtmp1, vtmp1); 2135 phaddw(vtmp1, vtmp1); 2136 } else { 2137 pshufd(vtmp2, src2, 0x1); 2138 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2139 movdqu(vtmp1, vtmp2); 2140 psrldq(vtmp1, 2); 2141 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2142 } 2143 movdl(vtmp2, src1); 2144 pmovsxwd(vtmp1, vtmp1); 2145 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2146 pextrw(dst, vtmp1, 0x0); 2147 movswl(dst, dst); 2148 } 2149 2150 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2151 if (opcode == Op_AddReductionVI) { 2152 if (vtmp1 != src2) { 2153 movdqu(vtmp1, src2); 2154 } 2155 phaddw(vtmp1, src2); 2156 } else { 2157 pshufd(vtmp1, src2, 0xE); 2158 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2159 } 2160 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2161 } 2162 2163 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2164 if (opcode == Op_AddReductionVI) { 2165 int vector_len = Assembler::AVX_256bit; 2166 vphaddw(vtmp2, src2, src2, vector_len); 2167 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2168 } else { 2169 vextracti128_high(vtmp2, src2); 2170 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2171 } 2172 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2173 } 2174 2175 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2176 int vector_len = Assembler::AVX_256bit; 2177 vextracti64x4_high(vtmp1, src2); 2178 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2179 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2180 } 2181 2182 void 
C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2183 pshufd(vtmp2, src2, 0xE); 2184 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2185 movdq(vtmp1, src1); 2186 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2187 movdq(dst, vtmp1); 2188 } 2189 2190 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2191 vextracti128_high(vtmp1, src2); 2192 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2193 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2194 } 2195 2196 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2197 vextracti64x4_high(vtmp2, src2); 2198 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2199 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2200 } 2201 2202 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2203 mov64(temp, -1L); 2204 bzhiq(temp, temp, len); 2205 kmovql(dst, temp); 2206 } 2207 2208 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2209 reduce_operation_128(T_FLOAT, opcode, dst, src); 2210 pshufd(vtmp, src, 0x1); 2211 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2212 } 2213 2214 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2215 reduce2F(opcode, dst, src, vtmp); 2216 pshufd(vtmp, src, 0x2); 2217 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2218 pshufd(vtmp, src, 0x3); 2219 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2220 } 2221 2222 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2223 reduce4F(opcode, dst, src, vtmp2); 2224 vextractf128_high(vtmp2, src); 2225 reduce4F(opcode, dst, vtmp2, vtmp1); 2226 } 2227 2228 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2229 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2230 vextracti64x4_high(vtmp1, src); 2231 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2232 } 2233 2234 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2235 pshufd(dst, src, 0x1); 2236 reduce_operation_128(T_FLOAT, opcode, dst, src); 2237 } 2238 2239 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2240 pshufd(vtmp, src, 0xE); 2241 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2242 unorderedReduce2F(opcode, dst, vtmp); 2243 } 2244 2245 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2246 vextractf128_high(vtmp1, src); 2247 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2248 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2249 } 2250 2251 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 vextractf64x4_high(vtmp2, src); 2253 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2254 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2255 } 2256 2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2258 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2259 pshufd(vtmp, src, 0xE); 2260 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2261 } 2262 2263 void 
C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2264 reduce2D(opcode, dst, src, vtmp2); 2265 vextractf128_high(vtmp2, src); 2266 reduce2D(opcode, dst, vtmp2, vtmp1); 2267 } 2268 2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2270 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2271 vextracti64x4_high(vtmp1, src); 2272 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2273 } 2274 2275 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2276 pshufd(dst, src, 0xE); 2277 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2278 } 2279 2280 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2281 vextractf128_high(vtmp, src); 2282 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2283 unorderedReduce2D(opcode, dst, vtmp); 2284 } 2285 2286 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2287 vextractf64x4_high(vtmp2, src); 2288 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2289 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2290 } 2291 2292 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2293 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2294 } 2295 2296 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2297 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2298 } 2299 2300 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2301 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2302 } 2303 2304 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2305 int vec_enc) { 2306 switch(elem_bt) { 2307 case T_INT: 2308 case T_FLOAT: 2309 vmaskmovps(dst, src, mask, vec_enc); 2310 break; 2311 case T_LONG: 2312 case T_DOUBLE: 2313 vmaskmovpd(dst, src, mask, vec_enc); 2314 break; 2315 default: 2316 fatal("Unsupported type %s", type2name(elem_bt)); 2317 break; 2318 } 2319 } 2320 2321 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2322 int vec_enc) { 2323 switch(elem_bt) { 2324 case T_INT: 2325 case T_FLOAT: 2326 vmaskmovps(dst, src, mask, vec_enc); 2327 break; 2328 case T_LONG: 2329 case T_DOUBLE: 2330 vmaskmovpd(dst, src, mask, vec_enc); 2331 break; 2332 default: 2333 fatal("Unsupported type %s", type2name(elem_bt)); 2334 break; 2335 } 2336 } 2337 2338 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2339 XMMRegister dst, XMMRegister src, 2340 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2341 XMMRegister xmm_0, XMMRegister xmm_1) { 2342 const int permconst[] = {1, 14}; 2343 XMMRegister wsrc = src; 2344 XMMRegister wdst = xmm_0; 2345 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2346 2347 int vlen_enc = Assembler::AVX_128bit; 2348 if (vlen == 16) { 2349 vlen_enc = Assembler::AVX_256bit; 2350 } 2351 2352 for (int i = log2(vlen) - 1; i >=0; i--) { 2353 if (i == 0 && !is_dst_valid) { 2354 wdst = dst; 2355 } 2356 if (i == 3) { 2357 vextracti64x4_high(wtmp, wsrc); 2358 } else if (i == 2) { 2359 vextracti128_high(wtmp, wsrc); 2360 } else { // i = [0,1] 2361 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2362 } 2363 2364 if (VM_Version::supports_avx10_2()) { 2365 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2366 } else { 2367 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2368 } 2369 wsrc = wdst; 2370 vlen_enc = Assembler::AVX_128bit; 2371 } 2372 if (is_dst_valid) { 2373 if (VM_Version::supports_avx10_2()) { 2374 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2375 } else { 2376 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2377 } 2378 } 2379 } 2380 2381 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2382 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2383 XMMRegister xmm_0, XMMRegister xmm_1) { 2384 XMMRegister wsrc = src; 2385 XMMRegister wdst = xmm_0; 2386 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2387 int vlen_enc = Assembler::AVX_128bit; 2388 if (vlen == 8) { 2389 vlen_enc = Assembler::AVX_256bit; 2390 } 2391 for (int i = log2(vlen) - 1; i >=0; i--) { 2392 if (i == 0 && !is_dst_valid) { 2393 wdst = dst; 2394 } 2395 if (i == 1) { 2396 vextracti128_high(wtmp, wsrc); 2397 } else if (i == 2) { 2398 vextracti64x4_high(wtmp, wsrc); 2399 } else { 2400 assert(i == 0, "%d", i); 2401 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2402 } 2403 2404 if (VM_Version::supports_avx10_2()) { 2405 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2406 } else { 2407 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2408 } 2409 2410 wsrc = wdst; 2411 vlen_enc = Assembler::AVX_128bit; 2412 } 2413 2414 if (is_dst_valid) { 2415 if (VM_Version::supports_avx10_2()) { 2416 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2417 } else { 2418 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2419 } 2420 } 2421 } 2422 2423 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2424 switch (bt) { 2425 case T_BYTE: pextrb(dst, src, idx); break; 2426 case T_SHORT: pextrw(dst, src, idx); break; 2427 case T_INT: pextrd(dst, src, idx); break; 2428 case T_LONG: pextrq(dst, src, idx); break; 2429 2430 default: 2431 assert(false,"Should not reach here."); 2432 break; 2433 } 2434 } 2435 2436 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2437 int esize = type2aelembytes(typ); 2438 int elem_per_lane = 16/esize; 2439 int lane = elemindex / elem_per_lane; 2440 int eindex = elemindex % elem_per_lane; 2441 2442 if (lane >= 2) { 2443 assert(UseAVX > 2, "required"); 2444 vextractf32x4(dst, src, lane & 3); 2445 return dst; 2446 } else if (lane > 0) { 2447 assert(UseAVX > 0, "required"); 2448 vextractf128(dst, src, lane); 2449 return dst; 2450 } else { 2451 return src; 2452 } 2453 } 2454 2455 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2456 if (typ == T_BYTE) { 2457 movsbl(dst, dst); 2458 } else if (typ == T_SHORT) { 2459 movswl(dst, dst); 2460 } 2461 } 2462 2463 void C2_MacroAssembler::get_elem(BasicType typ, Register 
dst, XMMRegister src, int elemindex) { 2464 int esize = type2aelembytes(typ); 2465 int elem_per_lane = 16/esize; 2466 int eindex = elemindex % elem_per_lane; 2467 assert(is_integral_type(typ),"required"); 2468 2469 if (eindex == 0) { 2470 if (typ == T_LONG) { 2471 movq(dst, src); 2472 } else { 2473 movdl(dst, src); 2474 movsxl(typ, dst); 2475 } 2476 } else { 2477 extract(typ, dst, src, eindex); 2478 movsxl(typ, dst); 2479 } 2480 } 2481 2482 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2483 int esize = type2aelembytes(typ); 2484 int elem_per_lane = 16/esize; 2485 int eindex = elemindex % elem_per_lane; 2486 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2487 2488 if (eindex == 0) { 2489 movq(dst, src); 2490 } else { 2491 if (typ == T_FLOAT) { 2492 if (UseAVX == 0) { 2493 movdqu(dst, src); 2494 shufps(dst, dst, eindex); 2495 } else { 2496 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2497 } 2498 } else { 2499 if (UseAVX == 0) { 2500 movdqu(dst, src); 2501 psrldq(dst, eindex*esize); 2502 } else { 2503 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2504 } 2505 movq(dst, dst); 2506 } 2507 } 2508 // Zero upper bits 2509 if (typ == T_FLOAT) { 2510 if (UseAVX == 0) { 2511 assert(vtmp != xnoreg, "required."); 2512 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2513 pand(dst, vtmp); 2514 } else { 2515 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2516 } 2517 } 2518 } 2519 2520 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2521 switch(typ) { 2522 case T_BYTE: 2523 case T_BOOLEAN: 2524 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2525 break; 2526 case T_SHORT: 2527 case T_CHAR: 2528 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2529 break; 2530 case T_INT: 2531 case T_FLOAT: 2532 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2533 break; 2534 case T_LONG: 2535 case T_DOUBLE: 2536 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2537 break; 2538 default: 2539 assert(false,"Should not reach here."); 2540 break; 2541 } 2542 } 2543 2544 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2545 assert(rscratch != noreg || always_reachable(src2), "missing"); 2546 2547 switch(typ) { 2548 case T_BOOLEAN: 2549 case T_BYTE: 2550 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2551 break; 2552 case T_CHAR: 2553 case T_SHORT: 2554 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2555 break; 2556 case T_INT: 2557 case T_FLOAT: 2558 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2559 break; 2560 case T_LONG: 2561 case T_DOUBLE: 2562 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2563 break; 2564 default: 2565 assert(false,"Should not reach here."); 2566 break; 2567 } 2568 } 2569 2570 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2571 switch(typ) { 2572 case T_BYTE: 2573 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 
2574 break; 2575 case T_SHORT: 2576 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2577 break; 2578 case T_INT: 2579 case T_FLOAT: 2580 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2581 break; 2582 case T_LONG: 2583 case T_DOUBLE: 2584 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2585 break; 2586 default: 2587 assert(false,"Should not reach here."); 2588 break; 2589 } 2590 } 2591 2592 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2593 assert(vlen_in_bytes <= 32, ""); 2594 int esize = type2aelembytes(bt); 2595 if (vlen_in_bytes == 32) { 2596 assert(vtmp == xnoreg, "required."); 2597 if (esize >= 4) { 2598 vtestps(src1, src2, AVX_256bit); 2599 } else { 2600 vptest(src1, src2, AVX_256bit); 2601 } 2602 return; 2603 } 2604 if (vlen_in_bytes < 16) { 2605 // Duplicate the lower part to fill the whole register, 2606 // Don't need to do so for src2 2607 assert(vtmp != xnoreg, "required"); 2608 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2609 pshufd(vtmp, src1, shuffle_imm); 2610 } else { 2611 assert(vtmp == xnoreg, "required"); 2612 vtmp = src1; 2613 } 2614 if (esize >= 4 && VM_Version::supports_avx()) { 2615 vtestps(vtmp, src2, AVX_128bit); 2616 } else { 2617 ptest(vtmp, src2); 2618 } 2619 } 2620 2621 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2622 #ifdef ASSERT 2623 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2624 bool is_bw_supported = VM_Version::supports_avx512bw(); 2625 if (is_bw && !is_bw_supported) { 2626 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2627 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2628 "XMM register should be 0-15"); 2629 } 2630 #endif // ASSERT 2631 switch (elem_bt) { 2632 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2633 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2634 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2635 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2636 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2637 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2638 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2639 } 2640 } 2641 2642 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2643 assert(UseAVX >= 2, "required"); 2644 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2645 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2646 if ((UseAVX > 2) && 2647 (!is_bw || VM_Version::supports_avx512bw()) && 2648 (!is_vl || VM_Version::supports_avx512vl())) { 2649 switch (elem_bt) { 2650 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2651 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2652 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2653 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2654 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2655 } 2656 } else { 2657 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2658 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2659 switch (elem_bt) { 2660 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2661 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2662 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2663 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, 
vlen_enc); return; 2664 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2665 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2666 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2667 } 2668 } 2669 } 2670 2671 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2672 switch (to_elem_bt) { 2673 case T_SHORT: 2674 vpmovsxbw(dst, src, vlen_enc); 2675 break; 2676 case T_INT: 2677 vpmovsxbd(dst, src, vlen_enc); 2678 break; 2679 case T_FLOAT: 2680 vpmovsxbd(dst, src, vlen_enc); 2681 vcvtdq2ps(dst, dst, vlen_enc); 2682 break; 2683 case T_LONG: 2684 vpmovsxbq(dst, src, vlen_enc); 2685 break; 2686 case T_DOUBLE: { 2687 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2688 vpmovsxbd(dst, src, mid_vlen_enc); 2689 vcvtdq2pd(dst, dst, vlen_enc); 2690 break; 2691 } 2692 default: 2693 fatal("Unsupported type %s", type2name(to_elem_bt)); 2694 break; 2695 } 2696 } 2697 2698 //------------------------------------------------------------------------------------------- 2699 2700 // IndexOf for constant substrings with size >= 8 chars 2701 // which don't need to be loaded through stack. 2702 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2703 Register cnt1, Register cnt2, 2704 int int_cnt2, Register result, 2705 XMMRegister vec, Register tmp, 2706 int ae) { 2707 ShortBranchVerifier sbv(this); 2708 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2709 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2710 2711 // This method uses the pcmpestri instruction with bound registers 2712 // inputs: 2713 // xmm - substring 2714 // rax - substring length (elements count) 2715 // mem - scanned string 2716 // rdx - string length (elements count) 2717 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2718 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2719 // outputs: 2720 // rcx - matched index in string 2721 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2722 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2723 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2724 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2725 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2726 2727 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2728 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2729 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2730 2731 // Note, inline_string_indexOf() generates checks: 2732 // if (substr.count > string.count) return -1; 2733 // if (substr.count == 0) return 0; 2734 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2735 2736 // Load substring. 
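// For the UL case the substring (str2) is Latin-1 while the scanned string is
// UTF-16, so its first 8 bytes are zero-extended to 8 chars with pmovzxbw;
// in the LL/UU cases the first 16 bytes are loaded as-is.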
2737 if (ae == StrIntrinsicNode::UL) { 2738 pmovzxbw(vec, Address(str2, 0)); 2739 } else { 2740 movdqu(vec, Address(str2, 0)); 2741 } 2742 movl(cnt2, int_cnt2); 2743 movptr(result, str1); // string addr 2744 2745 if (int_cnt2 > stride) { 2746 jmpb(SCAN_TO_SUBSTR); 2747 2748 // Reload substr for rescan, this code 2749 // is executed only for large substrings (> 8 chars) 2750 bind(RELOAD_SUBSTR); 2751 if (ae == StrIntrinsicNode::UL) { 2752 pmovzxbw(vec, Address(str2, 0)); 2753 } else { 2754 movdqu(vec, Address(str2, 0)); 2755 } 2756 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2757 2758 bind(RELOAD_STR); 2759 // We came here after the beginning of the substring was 2760 // matched but the rest of it was not so we need to search 2761 // again. Start from the next element after the previous match. 2762 2763 // cnt2 is number of substring reminding elements and 2764 // cnt1 is number of string reminding elements when cmp failed. 2765 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2766 subl(cnt1, cnt2); 2767 addl(cnt1, int_cnt2); 2768 movl(cnt2, int_cnt2); // Now restore cnt2 2769 2770 decrementl(cnt1); // Shift to next element 2771 cmpl(cnt1, cnt2); 2772 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2773 2774 addptr(result, (1<<scale1)); 2775 2776 } // (int_cnt2 > 8) 2777 2778 // Scan string for start of substr in 16-byte vectors 2779 bind(SCAN_TO_SUBSTR); 2780 pcmpestri(vec, Address(result, 0), mode); 2781 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2782 subl(cnt1, stride); 2783 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2784 cmpl(cnt1, cnt2); 2785 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2786 addptr(result, 16); 2787 jmpb(SCAN_TO_SUBSTR); 2788 2789 // Found a potential substr 2790 bind(FOUND_CANDIDATE); 2791 // Matched whole vector if first element matched (tmp(rcx) == 0). 2792 if (int_cnt2 == stride) { 2793 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2794 } else { // int_cnt2 > 8 2795 jccb(Assembler::overflow, FOUND_SUBSTR); 2796 } 2797 // After pcmpestri tmp(rcx) contains matched element index 2798 // Compute start addr of substr 2799 lea(result, Address(result, tmp, scale1)); 2800 2801 // Make sure string is still long enough 2802 subl(cnt1, tmp); 2803 cmpl(cnt1, cnt2); 2804 if (int_cnt2 == stride) { 2805 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2806 } else { // int_cnt2 > 8 2807 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2808 } 2809 // Left less then substring. 2810 2811 bind(RET_NOT_FOUND); 2812 movl(result, -1); 2813 jmp(EXIT); 2814 2815 if (int_cnt2 > stride) { 2816 // This code is optimized for the case when whole substring 2817 // is matched if its head is matched. 2818 bind(MATCH_SUBSTR_HEAD); 2819 pcmpestri(vec, Address(result, 0), mode); 2820 // Reload only string if does not match 2821 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2822 2823 Label CONT_SCAN_SUBSTR; 2824 // Compare the rest of substring (> 8 chars). 2825 bind(FOUND_SUBSTR); 2826 // First 8 chars are already matched. 
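// Turn cnt2 into a negative offset (stride - substring length) so the scan
// below can address the substring tail relative to its end and simply count
// cnt2 up toward zero, stride elements at a time.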
2827 negptr(cnt2); 2828 addptr(cnt2, stride); 2829 2830 bind(SCAN_SUBSTR); 2831 subl(cnt1, stride); 2832 cmpl(cnt2, -stride); // Do not read beyond substring 2833 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2834 // Back-up strings to avoid reading beyond substring: 2835 // cnt1 = cnt1 - cnt2 + 8 2836 addl(cnt1, cnt2); // cnt2 is negative 2837 addl(cnt1, stride); 2838 movl(cnt2, stride); negptr(cnt2); 2839 bind(CONT_SCAN_SUBSTR); 2840 if (int_cnt2 < (int)G) { 2841 int tail_off1 = int_cnt2<<scale1; 2842 int tail_off2 = int_cnt2<<scale2; 2843 if (ae == StrIntrinsicNode::UL) { 2844 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2845 } else { 2846 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2847 } 2848 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2849 } else { 2850 // calculate index in register to avoid integer overflow (int_cnt2*2) 2851 movl(tmp, int_cnt2); 2852 addptr(tmp, cnt2); 2853 if (ae == StrIntrinsicNode::UL) { 2854 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2855 } else { 2856 movdqu(vec, Address(str2, tmp, scale2, 0)); 2857 } 2858 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2859 } 2860 // Need to reload strings pointers if not matched whole vector 2861 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2862 addptr(cnt2, stride); 2863 jcc(Assembler::negative, SCAN_SUBSTR); 2864 // Fall through if found full substring 2865 2866 } // (int_cnt2 > 8) 2867 2868 bind(RET_FOUND); 2869 // Found result if we matched full small substring. 2870 // Compute substr offset 2871 subptr(result, str1); 2872 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2873 shrl(result, 1); // index 2874 } 2875 bind(EXIT); 2876 2877 } // string_indexofC8 2878 2879 // Small strings are loaded through stack if they cross page boundary. 2880 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2881 Register cnt1, Register cnt2, 2882 int int_cnt2, Register result, 2883 XMMRegister vec, Register tmp, 2884 int ae) { 2885 ShortBranchVerifier sbv(this); 2886 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2887 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2888 2889 // 2890 // int_cnt2 is length of small (< 8 chars) constant substring 2891 // or (-1) for non constant substring in which case its length 2892 // is in cnt2 register. 2893 // 2894 // Note, inline_string_indexOf() generates checks: 2895 // if (substr.count > string.count) return -1; 2896 // if (substr.count == 0) return 0; 2897 // 2898 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2899 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2900 // This method uses the pcmpestri instruction with bound registers 2901 // inputs: 2902 // xmm - substring 2903 // rax - substring length (elements count) 2904 // mem - scanned string 2905 // rdx - string length (elements count) 2906 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2907 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2908 // outputs: 2909 // rcx - matched index in string 2910 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2911 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2912 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2913 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2914 2915 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2916 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2917 FOUND_CANDIDATE; 2918 2919 { //======================================================== 2920 // We don't know where these strings are located 2921 // and we can't read beyond them. Load them through stack. 2922 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2923 2924 movptr(tmp, rsp); // save old SP 2925 2926 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2927 if (int_cnt2 == (1>>scale2)) { // One byte 2928 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2929 load_unsigned_byte(result, Address(str2, 0)); 2930 movdl(vec, result); // move 32 bits 2931 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2932 // Not enough header space in 32-bit VM: 12+3 = 15. 2933 movl(result, Address(str2, -1)); 2934 shrl(result, 8); 2935 movdl(vec, result); // move 32 bits 2936 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2937 load_unsigned_short(result, Address(str2, 0)); 2938 movdl(vec, result); // move 32 bits 2939 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2940 movdl(vec, Address(str2, 0)); // move 32 bits 2941 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2942 movq(vec, Address(str2, 0)); // move 64 bits 2943 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2944 // Array header size is 12 bytes in 32-bit VM 2945 // + 6 bytes for 3 chars == 18 bytes, 2946 // enough space to load vec and shift. 2947 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2948 if (ae == StrIntrinsicNode::UL) { 2949 int tail_off = int_cnt2-8; 2950 pmovzxbw(vec, Address(str2, tail_off)); 2951 psrldq(vec, -2*tail_off); 2952 } 2953 else { 2954 int tail_off = int_cnt2*(1<<scale2); 2955 movdqu(vec, Address(str2, tail_off-16)); 2956 psrldq(vec, 16-tail_off); 2957 } 2958 } 2959 } else { // not constant substring 2960 cmpl(cnt2, stride); 2961 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2962 2963 // We can read beyond string if srt+16 does not cross page boundary 2964 // since heaps are aligned and mapped by pages. 2965 assert(os::vm_page_size() < (int)G, "default page should be small"); 2966 movl(result, str2); // We need only low 32 bits 2967 andl(result, ((int)os::vm_page_size()-1)); 2968 cmpl(result, ((int)os::vm_page_size()-16)); 2969 jccb(Assembler::belowEqual, CHECK_STR); 2970 2971 // Move small strings to stack to allow load 16 bytes into vec. 2972 subptr(rsp, 16); 2973 int stk_offset = wordSize-(1<<scale2); 2974 push(cnt2); 2975 2976 bind(COPY_SUBSTR); 2977 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2978 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2979 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2980 } else if (ae == StrIntrinsicNode::UU) { 2981 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2982 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2983 } 2984 decrement(cnt2); 2985 jccb(Assembler::notZero, COPY_SUBSTR); 2986 2987 pop(cnt2); 2988 movptr(str2, rsp); // New substring address 2989 } // non constant 2990 2991 bind(CHECK_STR); 2992 cmpl(cnt1, stride); 2993 jccb(Assembler::aboveEqual, BIG_STRINGS); 2994 2995 // Check cross page boundary. 
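// A 16-byte load starting at str1 stays inside its page as long as the page
// offset (str1 & (page_size - 1)) is <= page_size - 16; e.g. with 4K pages an
// offset of 0xff0 is still safe while 0xff1 is not. Only in the unsafe case
// is the (short) string copied to the stack below.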
2996 movl(result, str1); // We need only low 32 bits 2997 andl(result, ((int)os::vm_page_size()-1)); 2998 cmpl(result, ((int)os::vm_page_size()-16)); 2999 jccb(Assembler::belowEqual, BIG_STRINGS); 3000 3001 subptr(rsp, 16); 3002 int stk_offset = -(1<<scale1); 3003 if (int_cnt2 < 0) { // not constant 3004 push(cnt2); 3005 stk_offset += wordSize; 3006 } 3007 movl(cnt2, cnt1); 3008 3009 bind(COPY_STR); 3010 if (ae == StrIntrinsicNode::LL) { 3011 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3012 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3013 } else { 3014 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3015 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3016 } 3017 decrement(cnt2); 3018 jccb(Assembler::notZero, COPY_STR); 3019 3020 if (int_cnt2 < 0) { // not constant 3021 pop(cnt2); 3022 } 3023 movptr(str1, rsp); // New string address 3024 3025 bind(BIG_STRINGS); 3026 // Load substring. 3027 if (int_cnt2 < 0) { // -1 3028 if (ae == StrIntrinsicNode::UL) { 3029 pmovzxbw(vec, Address(str2, 0)); 3030 } else { 3031 movdqu(vec, Address(str2, 0)); 3032 } 3033 push(cnt2); // substr count 3034 push(str2); // substr addr 3035 push(str1); // string addr 3036 } else { 3037 // Small (< 8 chars) constant substrings are loaded already. 3038 movl(cnt2, int_cnt2); 3039 } 3040 push(tmp); // original SP 3041 3042 } // Finished loading 3043 3044 //======================================================== 3045 // Start search 3046 // 3047 3048 movptr(result, str1); // string addr 3049 3050 if (int_cnt2 < 0) { // Only for non constant substring 3051 jmpb(SCAN_TO_SUBSTR); 3052 3053 // SP saved at sp+0 3054 // String saved at sp+1*wordSize 3055 // Substr saved at sp+2*wordSize 3056 // Substr count saved at sp+3*wordSize 3057 3058 // Reload substr for rescan, this code 3059 // is executed only for large substrings (> 8 chars) 3060 bind(RELOAD_SUBSTR); 3061 movptr(str2, Address(rsp, 2*wordSize)); 3062 movl(cnt2, Address(rsp, 3*wordSize)); 3063 if (ae == StrIntrinsicNode::UL) { 3064 pmovzxbw(vec, Address(str2, 0)); 3065 } else { 3066 movdqu(vec, Address(str2, 0)); 3067 } 3068 // We came here after the beginning of the substring was 3069 // matched but the rest of it was not so we need to search 3070 // again. Start from the next element after the previous match. 3071 subptr(str1, result); // Restore counter 3072 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3073 shrl(str1, 1); 3074 } 3075 addl(cnt1, str1); 3076 decrementl(cnt1); // Shift to next element 3077 cmpl(cnt1, cnt2); 3078 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3079 3080 addptr(result, (1<<scale1)); 3081 } // non constant 3082 3083 // Scan string for start of substr in 16-byte vectors 3084 bind(SCAN_TO_SUBSTR); 3085 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3086 pcmpestri(vec, Address(result, 0), mode); 3087 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3088 subl(cnt1, stride); 3089 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3090 cmpl(cnt1, cnt2); 3091 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3092 addptr(result, 16); 3093 3094 bind(ADJUST_STR); 3095 cmpl(cnt1, stride); // Do not read beyond string 3096 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3097 // Back-up string to avoid reading beyond string. 
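// Fewer than 'stride' elements are left: move 'result' back so the final
// 16-byte chunk ends exactly at the end of the string and rescan it with a
// full stride count; re-examining the overlapping, already scanned positions
// is safe.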
3098 lea(result, Address(result, cnt1, scale1, -16)); 3099 movl(cnt1, stride); 3100 jmpb(SCAN_TO_SUBSTR); 3101 3102 // Found a potential substr 3103 bind(FOUND_CANDIDATE); 3104 // After pcmpestri tmp(rcx) contains matched element index 3105 3106 // Make sure string is still long enough 3107 subl(cnt1, tmp); 3108 cmpl(cnt1, cnt2); 3109 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3110 // Left less then substring. 3111 3112 bind(RET_NOT_FOUND); 3113 movl(result, -1); 3114 jmp(CLEANUP); 3115 3116 bind(FOUND_SUBSTR); 3117 // Compute start addr of substr 3118 lea(result, Address(result, tmp, scale1)); 3119 if (int_cnt2 > 0) { // Constant substring 3120 // Repeat search for small substring (< 8 chars) 3121 // from new point without reloading substring. 3122 // Have to check that we don't read beyond string. 3123 cmpl(tmp, stride-int_cnt2); 3124 jccb(Assembler::greater, ADJUST_STR); 3125 // Fall through if matched whole substring. 3126 } else { // non constant 3127 assert(int_cnt2 == -1, "should be != 0"); 3128 3129 addl(tmp, cnt2); 3130 // Found result if we matched whole substring. 3131 cmpl(tmp, stride); 3132 jcc(Assembler::lessEqual, RET_FOUND); 3133 3134 // Repeat search for small substring (<= 8 chars) 3135 // from new point 'str1' without reloading substring. 3136 cmpl(cnt2, stride); 3137 // Have to check that we don't read beyond string. 3138 jccb(Assembler::lessEqual, ADJUST_STR); 3139 3140 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3141 // Compare the rest of substring (> 8 chars). 3142 movptr(str1, result); 3143 3144 cmpl(tmp, cnt2); 3145 // First 8 chars are already matched. 3146 jccb(Assembler::equal, CHECK_NEXT); 3147 3148 bind(SCAN_SUBSTR); 3149 pcmpestri(vec, Address(str1, 0), mode); 3150 // Need to reload strings pointers if not matched whole vector 3151 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3152 3153 bind(CHECK_NEXT); 3154 subl(cnt2, stride); 3155 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3156 addptr(str1, 16); 3157 if (ae == StrIntrinsicNode::UL) { 3158 addptr(str2, 8); 3159 } else { 3160 addptr(str2, 16); 3161 } 3162 subl(cnt1, stride); 3163 cmpl(cnt2, stride); // Do not read beyond substring 3164 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3165 // Back-up strings to avoid reading beyond substring. 
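// Fewer than 'stride' substring elements remain: move str1 and str2 back so
// the last compare (8-byte substring load for UL, 16 bytes otherwise) ends
// exactly at the end of the substring, and fix up the counters to match
// (cnt1 = cnt1 - cnt2 + stride, cnt2 = stride).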
3166 3167 if (ae == StrIntrinsicNode::UL) { 3168 lea(str2, Address(str2, cnt2, scale2, -8)); 3169 lea(str1, Address(str1, cnt2, scale1, -16)); 3170 } else { 3171 lea(str2, Address(str2, cnt2, scale2, -16)); 3172 lea(str1, Address(str1, cnt2, scale1, -16)); 3173 } 3174 subl(cnt1, cnt2); 3175 movl(cnt2, stride); 3176 addl(cnt1, stride); 3177 bind(CONT_SCAN_SUBSTR); 3178 if (ae == StrIntrinsicNode::UL) { 3179 pmovzxbw(vec, Address(str2, 0)); 3180 } else { 3181 movdqu(vec, Address(str2, 0)); 3182 } 3183 jmp(SCAN_SUBSTR); 3184 3185 bind(RET_FOUND_LONG); 3186 movptr(str1, Address(rsp, wordSize)); 3187 } // non constant 3188 3189 bind(RET_FOUND); 3190 // Compute substr offset 3191 subptr(result, str1); 3192 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3193 shrl(result, 1); // index 3194 } 3195 bind(CLEANUP); 3196 pop(rsp); // restore SP 3197 3198 } // string_indexof 3199 3200 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3201 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3202 ShortBranchVerifier sbv(this); 3203 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3204 3205 int stride = 8; 3206 3207 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3208 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3209 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3210 FOUND_SEQ_CHAR, DONE_LABEL; 3211 3212 movptr(result, str1); 3213 if (UseAVX >= 2) { 3214 cmpl(cnt1, stride); 3215 jcc(Assembler::less, SCAN_TO_CHAR); 3216 cmpl(cnt1, 2*stride); 3217 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3218 movdl(vec1, ch); 3219 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3220 vpxor(vec2, vec2); 3221 movl(tmp, cnt1); 3222 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3223 andl(cnt1,0x0000000F); //tail count (in chars) 3224 3225 bind(SCAN_TO_16_CHAR_LOOP); 3226 vmovdqu(vec3, Address(result, 0)); 3227 vpcmpeqw(vec3, vec3, vec1, 1); 3228 vptest(vec2, vec3); 3229 jcc(Assembler::carryClear, FOUND_CHAR); 3230 addptr(result, 32); 3231 subl(tmp, 2*stride); 3232 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3233 jmp(SCAN_TO_8_CHAR); 3234 bind(SCAN_TO_8_CHAR_INIT); 3235 movdl(vec1, ch); 3236 pshuflw(vec1, vec1, 0x00); 3237 pshufd(vec1, vec1, 0); 3238 pxor(vec2, vec2); 3239 } 3240 bind(SCAN_TO_8_CHAR); 3241 cmpl(cnt1, stride); 3242 jcc(Assembler::less, SCAN_TO_CHAR); 3243 if (UseAVX < 2) { 3244 movdl(vec1, ch); 3245 pshuflw(vec1, vec1, 0x00); 3246 pshufd(vec1, vec1, 0); 3247 pxor(vec2, vec2); 3248 } 3249 movl(tmp, cnt1); 3250 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3251 andl(cnt1,0x00000007); //tail count (in chars) 3252 3253 bind(SCAN_TO_8_CHAR_LOOP); 3254 movdqu(vec3, Address(result, 0)); 3255 pcmpeqw(vec3, vec1); 3256 ptest(vec2, vec3); 3257 jcc(Assembler::carryClear, FOUND_CHAR); 3258 addptr(result, 16); 3259 subl(tmp, stride); 3260 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3261 bind(SCAN_TO_CHAR); 3262 testl(cnt1, cnt1); 3263 jcc(Assembler::zero, RET_NOT_FOUND); 3264 bind(SCAN_TO_CHAR_LOOP); 3265 load_unsigned_short(tmp, Address(result, 0)); 3266 cmpl(ch, tmp); 3267 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3268 addptr(result, 2); 3269 subl(cnt1, 1); 3270 jccb(Assembler::zero, RET_NOT_FOUND); 3271 jmp(SCAN_TO_CHAR_LOOP); 3272 3273 bind(RET_NOT_FOUND); 3274 movl(result, -1); 3275 jmpb(DONE_LABEL); 3276 3277 bind(FOUND_CHAR); 3278 if (UseAVX >= 2) { 3279 vpmovmskb(tmp, vec3); 3280 } else { 3281 pmovmskb(tmp, vec3); 3282 } 3283 bsfl(ch, tmp); 3284 addptr(result, ch); 3285 3286 bind(FOUND_SEQ_CHAR); 3287 
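// Convert the match address into a char index: 'result' points at the matching
// character, so subtract the string base and (just below) halve the byte offset.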
subptr(result, str1); 3288 shrl(result, 1); 3289 3290 bind(DONE_LABEL); 3291 } // string_indexof_char 3292 3293 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3294 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3295 ShortBranchVerifier sbv(this); 3296 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3297 3298 int stride = 16; 3299 3300 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3301 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3302 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3303 FOUND_SEQ_CHAR, DONE_LABEL; 3304 3305 movptr(result, str1); 3306 if (UseAVX >= 2) { 3307 cmpl(cnt1, stride); 3308 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3309 cmpl(cnt1, stride*2); 3310 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3311 movdl(vec1, ch); 3312 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3313 vpxor(vec2, vec2); 3314 movl(tmp, cnt1); 3315 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3316 andl(cnt1,0x0000001F); //tail count (in chars) 3317 3318 bind(SCAN_TO_32_CHAR_LOOP); 3319 vmovdqu(vec3, Address(result, 0)); 3320 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3321 vptest(vec2, vec3); 3322 jcc(Assembler::carryClear, FOUND_CHAR); 3323 addptr(result, 32); 3324 subl(tmp, stride*2); 3325 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3326 jmp(SCAN_TO_16_CHAR); 3327 3328 bind(SCAN_TO_16_CHAR_INIT); 3329 movdl(vec1, ch); 3330 pxor(vec2, vec2); 3331 pshufb(vec1, vec2); 3332 } 3333 3334 bind(SCAN_TO_16_CHAR); 3335 cmpl(cnt1, stride); 3336 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3337 if (UseAVX < 2) { 3338 movdl(vec1, ch); 3339 pxor(vec2, vec2); 3340 pshufb(vec1, vec2); 3341 } 3342 movl(tmp, cnt1); 3343 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3344 andl(cnt1,0x0000000F); //tail count (in bytes) 3345 3346 bind(SCAN_TO_16_CHAR_LOOP); 3347 movdqu(vec3, Address(result, 0)); 3348 pcmpeqb(vec3, vec1); 3349 ptest(vec2, vec3); 3350 jcc(Assembler::carryClear, FOUND_CHAR); 3351 addptr(result, 16); 3352 subl(tmp, stride); 3353 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
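// Scalar tail of the Latin1 scan: compare the remaining (< 16) bytes one at a
// time against 'ch'; 'cnt1' holds the tail count and 'result' points at the first
// byte not yet covered by a vector load.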
3354 3355 bind(SCAN_TO_CHAR_INIT); 3356 testl(cnt1, cnt1); 3357 jcc(Assembler::zero, RET_NOT_FOUND); 3358 bind(SCAN_TO_CHAR_LOOP); 3359 load_unsigned_byte(tmp, Address(result, 0)); 3360 cmpl(ch, tmp); 3361 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3362 addptr(result, 1); 3363 subl(cnt1, 1); 3364 jccb(Assembler::zero, RET_NOT_FOUND); 3365 jmp(SCAN_TO_CHAR_LOOP); 3366 3367 bind(RET_NOT_FOUND); 3368 movl(result, -1); 3369 jmpb(DONE_LABEL); 3370 3371 bind(FOUND_CHAR); 3372 if (UseAVX >= 2) { 3373 vpmovmskb(tmp, vec3); 3374 } else { 3375 pmovmskb(tmp, vec3); 3376 } 3377 bsfl(ch, tmp); 3378 addptr(result, ch); 3379 3380 bind(FOUND_SEQ_CHAR); 3381 subptr(result, str1); 3382 3383 bind(DONE_LABEL); 3384 } // stringL_indexof_char 3385 3386 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3387 switch (eltype) { 3388 case T_BOOLEAN: return sizeof(jboolean); 3389 case T_BYTE: return sizeof(jbyte); 3390 case T_SHORT: return sizeof(jshort); 3391 case T_CHAR: return sizeof(jchar); 3392 case T_INT: return sizeof(jint); 3393 default: 3394 ShouldNotReachHere(); 3395 return -1; 3396 } 3397 } 3398 3399 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3400 switch (eltype) { 3401 // T_BOOLEAN used as surrogate for unsigned byte 3402 case T_BOOLEAN: movzbl(dst, src); break; 3403 case T_BYTE: movsbl(dst, src); break; 3404 case T_SHORT: movswl(dst, src); break; 3405 case T_CHAR: movzwl(dst, src); break; 3406 case T_INT: movl(dst, src); break; 3407 default: 3408 ShouldNotReachHere(); 3409 } 3410 } 3411 3412 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3413 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3414 } 3415 3416 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3417 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3418 } 3419 3420 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3421 const int vlen = Assembler::AVX_256bit; 3422 switch (eltype) { 3423 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3424 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3425 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3426 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3427 case T_INT: 3428 // do nothing 3429 break; 3430 default: 3431 ShouldNotReachHere(); 3432 } 3433 } 3434 3435 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3436 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3437 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3438 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3439 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3440 BasicType eltype) { 3441 ShortBranchVerifier sbv(this); 3442 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3443 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3444 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3445 3446 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3447 SHORT_UNROLLED_LOOP_EXIT, 3448 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3449 UNROLLED_VECTOR_LOOP_BEGIN, 3450 END; 3451 switch (eltype) { 3452 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3453 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3454 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3455 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3456 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3457 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3458 } 3459 3460 // For "renaming" for readibility of the code 3461 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3462 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3463 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3464 3465 const int elsize = arrays_hashcode_elsize(eltype); 3466 3467 /* 3468 if (cnt1 >= 2) { 3469 if (cnt1 >= 32) { 3470 UNROLLED VECTOR LOOP 3471 } 3472 UNROLLED SCALAR LOOP 3473 } 3474 SINGLE SCALAR 3475 */ 3476 3477 cmpl(cnt1, 32); 3478 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3479 3480 // cnt1 >= 32 && generate_vectorized_loop 3481 xorl(index, index); 3482 3483 // vresult = IntVector.zero(I256); 3484 for (int idx = 0; idx < 4; idx++) { 3485 vpxor(vresult[idx], vresult[idx]); 3486 } 3487 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3488 Register bound = tmp2; 3489 Register next = tmp3; 3490 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3491 movl(next, Address(tmp2, 0)); 3492 movdl(vnext, next); 3493 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3494 3495 // index = 0; 3496 // bound = cnt1 & ~(32 - 1); 3497 movl(bound, cnt1); 3498 andl(bound, ~(32 - 1)); 3499 // for (; index < bound; index += 32) { 3500 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3501 // result *= next; 3502 imull(result, next); 3503 // loop fission to upfront the cost of fetching from memory, OOO execution 3504 // can then hopefully do a better job of prefetching 3505 for (int idx = 0; idx < 4; idx++) { 3506 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3507 } 3508 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3509 for (int idx = 0; idx < 4; idx++) { 3510 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3511 arrays_hashcode_elvcast(vtmp[idx], eltype); 3512 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3513 } 3514 // index += 32; 3515 addl(index, 32); 3516 // index < bound; 3517 cmpl(index, bound); 3518 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3519 // } 3520 3521 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3522 subl(cnt1, bound); 3523 // release bound 3524 3525 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3526 for (int idx = 0; idx < 4; idx++) { 3527 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3528 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3529 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3530 } 3531 // result += vresult.reduceLanes(ADD); 3532 for (int idx = 0; idx < 4; idx++) { 3533 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3534 } 3535 3536 // } else if (cnt1 < 32) { 3537 3538 bind(SHORT_UNROLLED_BEGIN); 3539 // int i = 1; 3540 movl(index, 1); 3541 cmpl(index, cnt1); 3542 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3543 3544 // for (; i < cnt1 ; i += 2) { 3545 bind(SHORT_UNROLLED_LOOP_BEGIN); 3546 movl(tmp3, 961); 3547 imull(result, tmp3); 3548 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3549 movl(tmp3, tmp2); 3550 shll(tmp3, 5); 3551 subl(tmp3, tmp2); 3552 addl(result, tmp3); 3553 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3554 addl(result, tmp3); 3555 addl(index, 2); 3556 cmpl(index, cnt1); 3557 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3558 3559 // } 3560 // if (i >= cnt1) { 3561 bind(SHORT_UNROLLED_LOOP_EXIT); 3562 jccb(Assembler::greater, END); 3563 movl(tmp2, result); 3564 shll(result, 5); 3565 subl(result, tmp2); 3566 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3567 addl(result, tmp3); 3568 // } 3569 bind(END); 3570 3571 BLOCK_COMMENT("} // arrays_hashcode"); 3572 3573 } // arrays_hashcode 3574 3575 // helper function for string_compare 3576 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3577 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3578 Address::ScaleFactor scale2, Register index, int ae) { 3579 if (ae == StrIntrinsicNode::LL) { 3580 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3581 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3582 } else if (ae == StrIntrinsicNode::UU) { 3583 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3584 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3585 } else { 3586 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3587 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3588 } 3589 } 3590 3591 // Compare strings, used for char[] and byte[]. 3592 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3593 Register cnt1, Register cnt2, Register result, 3594 XMMRegister vec1, int ae, KRegister mask) { 3595 ShortBranchVerifier sbv(this); 3596 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3597 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3598 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3599 int stride2x2 = 0x40; 3600 Address::ScaleFactor scale = Address::no_scale; 3601 Address::ScaleFactor scale1 = Address::no_scale; 3602 Address::ScaleFactor scale2 = Address::no_scale; 3603 3604 if (ae != StrIntrinsicNode::LL) { 3605 stride2x2 = 0x20; 3606 } 3607 3608 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3609 shrl(cnt2, 1); 3610 } 3611 // Compute the minimum of the string lengths and the 3612 // difference of the string lengths (stack). 3613 // Do the conditional move stuff 3614 movl(result, cnt1); 3615 subl(cnt1, cnt2); 3616 push(cnt1); 3617 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3618 3619 // Is the minimum length zero? 
3620 testl(cnt2, cnt2); 3621 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3622 if (ae == StrIntrinsicNode::LL) { 3623 // Load first bytes 3624 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3625 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3626 } else if (ae == StrIntrinsicNode::UU) { 3627 // Load first characters 3628 load_unsigned_short(result, Address(str1, 0)); 3629 load_unsigned_short(cnt1, Address(str2, 0)); 3630 } else { 3631 load_unsigned_byte(result, Address(str1, 0)); 3632 load_unsigned_short(cnt1, Address(str2, 0)); 3633 } 3634 subl(result, cnt1); 3635 jcc(Assembler::notZero, POP_LABEL); 3636 3637 if (ae == StrIntrinsicNode::UU) { 3638 // Divide length by 2 to get number of chars 3639 shrl(cnt2, 1); 3640 } 3641 cmpl(cnt2, 1); 3642 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3643 3644 // Check if the strings start at the same location and setup scale and stride 3645 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3646 cmpptr(str1, str2); 3647 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3648 if (ae == StrIntrinsicNode::LL) { 3649 scale = Address::times_1; 3650 stride = 16; 3651 } else { 3652 scale = Address::times_2; 3653 stride = 8; 3654 } 3655 } else { 3656 scale1 = Address::times_1; 3657 scale2 = Address::times_2; 3658 // scale not used 3659 stride = 8; 3660 } 3661 3662 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3663 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3664 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3665 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3666 Label COMPARE_TAIL_LONG; 3667 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3668 3669 int pcmpmask = 0x19; 3670 if (ae == StrIntrinsicNode::LL) { 3671 pcmpmask &= ~0x01; 3672 } 3673 3674 // Setup to compare 16-chars (32-bytes) vectors, 3675 // start from first character again because it has aligned address. 3676 if (ae == StrIntrinsicNode::LL) { 3677 stride2 = 32; 3678 } else { 3679 stride2 = 16; 3680 } 3681 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3682 adr_stride = stride << scale; 3683 } else { 3684 adr_stride1 = 8; //stride << scale1; 3685 adr_stride2 = 16; //stride << scale2; 3686 } 3687 3688 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3689 // rax and rdx are used by pcmpestri as elements counters 3690 movl(result, cnt2); 3691 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3692 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3693 3694 // fast path : compare first 2 8-char vectors. 
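// pcmpestri imm8 0x19 selects equal-each (string compare) aggregation with negated
// result on unsigned words (the low bit is cleared above for LL to select unsigned
// bytes); a set carry flag below means some element in the 16-byte chunk differs
// and rcx holds the index of the first mismatch.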
3695 bind(COMPARE_16_CHARS); 3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3697 movdqu(vec1, Address(str1, 0)); 3698 } else { 3699 pmovzxbw(vec1, Address(str1, 0)); 3700 } 3701 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3702 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3703 3704 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3705 movdqu(vec1, Address(str1, adr_stride)); 3706 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3707 } else { 3708 pmovzxbw(vec1, Address(str1, adr_stride1)); 3709 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3710 } 3711 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3712 addl(cnt1, stride); 3713 3714 // Compare the characters at index in cnt1 3715 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3716 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3717 subl(result, cnt2); 3718 jmp(POP_LABEL); 3719 3720 // Setup the registers to start vector comparison loop 3721 bind(COMPARE_WIDE_VECTORS); 3722 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3723 lea(str1, Address(str1, result, scale)); 3724 lea(str2, Address(str2, result, scale)); 3725 } else { 3726 lea(str1, Address(str1, result, scale1)); 3727 lea(str2, Address(str2, result, scale2)); 3728 } 3729 subl(result, stride2); 3730 subl(cnt2, stride2); 3731 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3732 negptr(result); 3733 3734 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3735 bind(COMPARE_WIDE_VECTORS_LOOP); 3736 3737 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3738 cmpl(cnt2, stride2x2); 3739 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3740 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3741 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3742 3743 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3744 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3745 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3746 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3747 } else { 3748 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3749 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3750 } 3751 kortestql(mask, mask); 3752 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3753 addptr(result, stride2x2); // update since we already compared at this addr 3754 subl(cnt2, stride2x2); // and sub the size too 3755 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3756 3757 vpxor(vec1, vec1); 3758 jmpb(COMPARE_WIDE_TAIL); 3759 }//if (VM_Version::supports_avx512vlbw()) 3760 3761 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3762 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3763 vmovdqu(vec1, Address(str1, result, scale)); 3764 vpxor(vec1, Address(str2, result, scale)); 3765 } else { 3766 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3767 vpxor(vec1, Address(str2, result, scale2)); 3768 } 3769 vptest(vec1, vec1); 3770 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3771 addptr(result, stride2); 3772 subl(cnt2, stride2); 3773 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3774 // clean upper bits of YMM registers 3775 vpxor(vec1, vec1); 3776 3777 // compare 
wide vectors tail 3778 bind(COMPARE_WIDE_TAIL); 3779 testptr(result, result); 3780 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3781 3782 movl(result, stride2); 3783 movl(cnt2, result); 3784 negptr(result); 3785 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3786 3787 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3788 bind(VECTOR_NOT_EQUAL); 3789 // clean upper bits of YMM registers 3790 vpxor(vec1, vec1); 3791 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3792 lea(str1, Address(str1, result, scale)); 3793 lea(str2, Address(str2, result, scale)); 3794 } else { 3795 lea(str1, Address(str1, result, scale1)); 3796 lea(str2, Address(str2, result, scale2)); 3797 } 3798 jmp(COMPARE_16_CHARS); 3799 3800 // Compare tail chars, length between 1 and 15 chars 3801 bind(COMPARE_TAIL_LONG); 3802 movl(cnt2, result); 3803 cmpl(cnt2, stride); 3804 jcc(Assembler::less, COMPARE_SMALL_STR); 3805 3806 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3807 movdqu(vec1, Address(str1, 0)); 3808 } else { 3809 pmovzxbw(vec1, Address(str1, 0)); 3810 } 3811 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3812 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3813 subptr(cnt2, stride); 3814 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3816 lea(str1, Address(str1, result, scale)); 3817 lea(str2, Address(str2, result, scale)); 3818 } else { 3819 lea(str1, Address(str1, result, scale1)); 3820 lea(str2, Address(str2, result, scale2)); 3821 } 3822 negptr(cnt2); 3823 jmpb(WHILE_HEAD_LABEL); 3824 3825 bind(COMPARE_SMALL_STR); 3826 } else if (UseSSE42Intrinsics) { 3827 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3828 int pcmpmask = 0x19; 3829 // Setup to compare 8-char (16-byte) vectors, 3830 // start from first character again because it has aligned address.
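// SSE4.2-only fallback: mirrors the AVX2 path above but works on 16-byte vectors;
// 'result' keeps the minimum element count while cnt2 is rounded down to a
// multiple of 'stride' elements for the vector loop.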
3831 movl(result, cnt2); 3832 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3833 if (ae == StrIntrinsicNode::LL) { 3834 pcmpmask &= ~0x01; 3835 } 3836 jcc(Assembler::zero, COMPARE_TAIL); 3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3838 lea(str1, Address(str1, result, scale)); 3839 lea(str2, Address(str2, result, scale)); 3840 } else { 3841 lea(str1, Address(str1, result, scale1)); 3842 lea(str2, Address(str2, result, scale2)); 3843 } 3844 negptr(result); 3845 3846 // pcmpestri 3847 // inputs: 3848 // vec1- substring 3849 // rax - negative string length (elements count) 3850 // mem - scanned string 3851 // rdx - string length (elements count) 3852 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3853 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3854 // outputs: 3855 // rcx - first mismatched element index 3856 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3857 3858 bind(COMPARE_WIDE_VECTORS); 3859 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3860 movdqu(vec1, Address(str1, result, scale)); 3861 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3862 } else { 3863 pmovzxbw(vec1, Address(str1, result, scale1)); 3864 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3865 } 3866 // After pcmpestri cnt1(rcx) contains mismatched element index 3867 3868 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3869 addptr(result, stride); 3870 subptr(cnt2, stride); 3871 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3872 3873 // compare wide vectors tail 3874 testptr(result, result); 3875 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3876 3877 movl(cnt2, stride); 3878 movl(result, stride); 3879 negptr(result); 3880 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3881 movdqu(vec1, Address(str1, result, scale)); 3882 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3883 } else { 3884 pmovzxbw(vec1, Address(str1, result, scale1)); 3885 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3886 } 3887 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3888 3889 // Mismatched characters in the vectors 3890 bind(VECTOR_NOT_EQUAL); 3891 addptr(cnt1, result); 3892 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3893 subl(result, cnt2); 3894 jmpb(POP_LABEL); 3895 3896 bind(COMPARE_TAIL); // limit is zero 3897 movl(cnt2, result); 3898 // Fallthru to tail compare 3899 } 3900 // Shift str2 and str1 to the end of the arrays, negate min 3901 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3902 lea(str1, Address(str1, cnt2, scale)); 3903 lea(str2, Address(str2, cnt2, scale)); 3904 } else { 3905 lea(str1, Address(str1, cnt2, scale1)); 3906 lea(str2, Address(str2, cnt2, scale2)); 3907 } 3908 decrementl(cnt2); // first character was compared already 3909 negptr(cnt2); 3910 3911 // Compare the rest of the elements 3912 bind(WHILE_HEAD_LABEL); 3913 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3914 subl(result, cnt1); 3915 jccb(Assembler::notZero, POP_LABEL); 3916 increment(cnt2); 3917 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3918 3919 // Strings are equal up to min length. Return the length difference. 
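// For reference, a rough scalar sketch of what this string_compare sequence
// computes (illustrative only; element types depend on 'ae', and for UL the
// result is negated at the end):
//
//   int scalar_compare(const jchar* s1, int len1, const jchar* s2, int len2) {
//     int min = len1 < len2 ? len1 : len2;
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) {
//         return (int)s1[i] - (int)s2[i];
//       }
//     }
//     return len1 - len2;
//   }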
3920 bind(LENGTH_DIFF_LABEL); 3921 pop(result); 3922 if (ae == StrIntrinsicNode::UU) { 3923 // Divide diff by 2 to get number of chars 3924 sarl(result, 1); 3925 } 3926 jmpb(DONE_LABEL); 3927 3928 if (VM_Version::supports_avx512vlbw()) { 3929 3930 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3931 3932 kmovql(cnt1, mask); 3933 notq(cnt1); 3934 bsfq(cnt2, cnt1); 3935 if (ae != StrIntrinsicNode::LL) { 3936 // Divide diff by 2 to get number of chars 3937 sarl(cnt2, 1); 3938 } 3939 addq(result, cnt2); 3940 if (ae == StrIntrinsicNode::LL) { 3941 load_unsigned_byte(cnt1, Address(str2, result)); 3942 load_unsigned_byte(result, Address(str1, result)); 3943 } else if (ae == StrIntrinsicNode::UU) { 3944 load_unsigned_short(cnt1, Address(str2, result, scale)); 3945 load_unsigned_short(result, Address(str1, result, scale)); 3946 } else { 3947 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3948 load_unsigned_byte(result, Address(str1, result, scale1)); 3949 } 3950 subl(result, cnt1); 3951 jmpb(POP_LABEL); 3952 }//if (VM_Version::supports_avx512vlbw()) 3953 3954 // Discard the stored length difference 3955 bind(POP_LABEL); 3956 pop(cnt1); 3957 3958 // That's it 3959 bind(DONE_LABEL); 3960 if(ae == StrIntrinsicNode::UL) { 3961 negl(result); 3962 } 3963 3964 } 3965 3966 // Search for Non-ASCII character (Negative byte value) in a byte array, 3967 // return the index of the first such character, otherwise the length 3968 // of the array segment searched. 3969 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3970 // @IntrinsicCandidate 3971 // public static int countPositives(byte[] ba, int off, int len) { 3972 // for (int i = off; i < off + len; i++) { 3973 // if (ba[i] < 0) { 3974 // return i - off; 3975 // } 3976 // } 3977 // return len; 3978 // } 3979 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3980 Register result, Register tmp1, 3981 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3982 // rsi: byte array 3983 // rcx: len 3984 // rax: result 3985 ShortBranchVerifier sbv(this); 3986 assert_different_registers(ary1, len, result, tmp1); 3987 assert_different_registers(vec1, vec2); 3988 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3989 3990 movl(result, len); // copy 3991 // len == 0 3992 testl(len, len); 3993 jcc(Assembler::zero, DONE); 3994 3995 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3996 VM_Version::supports_avx512vlbw() && 3997 VM_Version::supports_bmi2()) { 3998 3999 Label test_64_loop, test_tail, BREAK_LOOP; 4000 movl(tmp1, len); 4001 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4002 4003 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4004 andl(len, 0xffffffc0); // vector count (in chars) 4005 jccb(Assembler::zero, test_tail); 4006 4007 lea(ary1, Address(ary1, len, Address::times_1)); 4008 negptr(len); 4009 4010 bind(test_64_loop); 4011 // Check whether our 64 elements of size byte contain negatives 4012 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4013 kortestql(mask1, mask1); 4014 jcc(Assembler::notZero, BREAK_LOOP); 4015 4016 addptr(len, 64); 4017 jccb(Assembler::notZero, test_64_loop); 4018 4019 bind(test_tail); 4020 // bail out when there is nothing to be done 4021 testl(tmp1, -1); 4022 jcc(Assembler::zero, DONE); 4023 4024 4025 // check the tail for absense of negatives 4026 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4027 { 4028 Register tmp3_aliased = len; 4029 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4030 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4031 notq(tmp3_aliased); 4032 kmovql(mask2, tmp3_aliased); 4033 } 4034 4035 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4036 ktestq(mask1, mask2); 4037 jcc(Assembler::zero, DONE); 4038 4039 // do a full check for negative registers in the tail 4040 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4041 // ary1 already pointing to the right place 4042 jmpb(TAIL_START); 4043 4044 bind(BREAK_LOOP); 4045 // At least one byte in the last 64 byte block was negative. 4046 // Set up to look at the last 64 bytes as if they were a tail 4047 lea(ary1, Address(ary1, len, Address::times_1)); 4048 addptr(result, len); 4049 // Ignore the very last byte: if all others are positive, 4050 // it must be negative, so we can skip right to the 2+1 byte 4051 // end comparison at this point 4052 orl(result, 63); 4053 movl(len, 63); 4054 // Fallthru to tail compare 4055 } else { 4056 4057 if (UseAVX >= 2) { 4058 // With AVX2, use 32-byte vector compare 4059 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4060 4061 // Compare 32-byte vectors 4062 testl(len, 0xffffffe0); // vector count (in bytes) 4063 jccb(Assembler::zero, TAIL_START); 4064 4065 andl(len, 0xffffffe0); 4066 lea(ary1, Address(ary1, len, Address::times_1)); 4067 negptr(len); 4068 4069 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4070 movdl(vec2, tmp1); 4071 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4072 4073 bind(COMPARE_WIDE_VECTORS); 4074 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4075 vptest(vec1, vec2); 4076 jccb(Assembler::notZero, BREAK_LOOP); 4077 addptr(len, 32); 4078 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4079 4080 testl(result, 0x0000001f); // any bytes remaining? 4081 jcc(Assembler::zero, DONE); 4082 4083 // Quick test using the already prepared vector mask 4084 movl(len, result); 4085 andl(len, 0x0000001f); 4086 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4087 vptest(vec1, vec2); 4088 jcc(Assembler::zero, DONE); 4089 // There are zeros, jump to the tail to determine exactly where 4090 jmpb(TAIL_START); 4091 4092 bind(BREAK_LOOP); 4093 // At least one byte in the last 32-byte vector is negative. 4094 // Set up to look at the last 32 bytes as if they were a tail 4095 lea(ary1, Address(ary1, len, Address::times_1)); 4096 addptr(result, len); 4097 // Ignore the very last byte: if all others are positive, 4098 // it must be negative, so we can skip right to the 2+1 byte 4099 // end comparison at this point 4100 orl(result, 31); 4101 movl(len, 31); 4102 // Fallthru to tail compare 4103 } else if (UseSSE42Intrinsics) { 4104 // With SSE4.2, use double quad vector compare 4105 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4106 4107 // Compare 16-byte vectors 4108 testl(len, 0xfffffff0); // vector count (in bytes) 4109 jcc(Assembler::zero, TAIL_START); 4110 4111 andl(len, 0xfffffff0); 4112 lea(ary1, Address(ary1, len, Address::times_1)); 4113 negptr(len); 4114 4115 movl(tmp1, 0x80808080); 4116 movdl(vec2, tmp1); 4117 pshufd(vec2, vec2, 0); 4118 4119 bind(COMPARE_WIDE_VECTORS); 4120 movdqu(vec1, Address(ary1, len, Address::times_1)); 4121 ptest(vec1, vec2); 4122 jccb(Assembler::notZero, BREAK_LOOP); 4123 addptr(len, 16); 4124 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4125 4126 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4127 jcc(Assembler::zero, DONE); 4128 4129 // Quick test using the already prepared vector mask 4130 movl(len, result); 4131 andl(len, 0x0000000f); // tail count (in bytes) 4132 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4133 ptest(vec1, vec2); 4134 jcc(Assembler::zero, DONE); 4135 jmpb(TAIL_START); 4136 4137 bind(BREAK_LOOP); 4138 // At least one byte in the last 16-byte vector is negative. 4139 // Set up and look at the last 16 bytes as if they were a tail 4140 lea(ary1, Address(ary1, len, Address::times_1)); 4141 addptr(result, len); 4142 // Ignore the very last byte: if all others are positive, 4143 // it must be negative, so we can skip right to the 2+1 byte 4144 // end comparison at this point 4145 orl(result, 15); 4146 movl(len, 15); 4147 // Fallthru to tail compare 4148 } 4149 } 4150 4151 bind(TAIL_START); 4152 // Compare 4-byte vectors 4153 andl(len, 0xfffffffc); // vector count (in bytes) 4154 jccb(Assembler::zero, COMPARE_CHAR); 4155 4156 lea(ary1, Address(ary1, len, Address::times_1)); 4157 negptr(len); 4158 4159 bind(COMPARE_VECTORS); 4160 movl(tmp1, Address(ary1, len, Address::times_1)); 4161 andl(tmp1, 0x80808080); 4162 jccb(Assembler::notZero, TAIL_ADJUST); 4163 addptr(len, 4); 4164 jccb(Assembler::notZero, COMPARE_VECTORS); 4165 4166 // Compare trailing char (final 2-3 bytes), if any 4167 bind(COMPARE_CHAR); 4168 4169 testl(result, 0x2); // tail char 4170 jccb(Assembler::zero, COMPARE_BYTE); 4171 load_unsigned_short(tmp1, Address(ary1, 0)); 4172 andl(tmp1, 0x00008080); 4173 jccb(Assembler::notZero, CHAR_ADJUST); 4174 lea(ary1, Address(ary1, 2)); 4175 4176 bind(COMPARE_BYTE); 4177 testl(result, 0x1); // tail byte 4178 jccb(Assembler::zero, DONE); 4179 load_unsigned_byte(tmp1, Address(ary1, 0)); 4180 testl(tmp1, 0x00000080); 4181 jccb(Assembler::zero, DONE); 4182 subptr(result, 1); 4183 jmpb(DONE); 4184 4185 bind(TAIL_ADJUST); 4186 // there are negative bits in the last 4 byte block. 4187 // Adjust result and check the next three bytes 4188 addptr(result, len); 4189 orl(result, 3); 4190 lea(ary1, Address(ary1, len, Address::times_1)); 4191 jmpb(COMPARE_CHAR); 4192 4193 bind(CHAR_ADJUST); 4194 // We are looking at a char + optional byte tail, and found that one 4195 // of the bytes in the char is negative. Adjust the result, check the 4196 // first byte and readjust if needed. 4197 andl(result, 0xfffffffc); 4198 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4199 jccb(Assembler::notZero, DONE); 4200 addptr(result, 1); 4201 4202 // That's it 4203 bind(DONE); 4204 if (UseAVX >= 2) { 4205 // clean upper bits of YMM registers 4206 vpxor(vec1, vec1); 4207 vpxor(vec2, vec2); 4208 } 4209 } 4210 4211 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4212 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4213 Register limit, Register result, Register chr, 4214 XMMRegister vec1, XMMRegister vec2, bool is_char, 4215 KRegister mask, bool expand_ary2) { 4216 // for expand_ary2, limit is the (smaller) size of the second array. 4217 ShortBranchVerifier sbv(this); 4218 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4219 4220 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4221 "Expansion only implemented for AVX2"); 4222 4223 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4224 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4225 4226 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4227 int scaleIncr = expand_ary2 ? 8 : 16; 4228 4229 if (is_array_equ) { 4230 // Check the input args 4231 cmpoop(ary1, ary2); 4232 jcc(Assembler::equal, TRUE_LABEL); 4233 4234 // Need additional checks for arrays_equals. 4235 testptr(ary1, ary1); 4236 jcc(Assembler::zero, FALSE_LABEL); 4237 testptr(ary2, ary2); 4238 jcc(Assembler::zero, FALSE_LABEL); 4239 4240 // Check the lengths 4241 movl(limit, Address(ary1, length_offset)); 4242 cmpl(limit, Address(ary2, length_offset)); 4243 jcc(Assembler::notEqual, FALSE_LABEL); 4244 } 4245 4246 // count == 0 4247 testl(limit, limit); 4248 jcc(Assembler::zero, TRUE_LABEL); 4249 4250 if (is_array_equ) { 4251 // Load array address 4252 lea(ary1, Address(ary1, base_offset)); 4253 lea(ary2, Address(ary2, base_offset)); 4254 } 4255 4256 if (is_array_equ && is_char) { 4257 // arrays_equals when used for char[]. 4258 shll(limit, 1); // byte count != 0 4259 } 4260 movl(result, limit); // copy 4261 4262 if (UseAVX >= 2) { 4263 // With AVX2, use 32-byte vector compare 4264 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4265 4266 // Compare 32-byte vectors 4267 if (expand_ary2) { 4268 andl(result, 0x0000000f); // tail count (in bytes) 4269 andl(limit, 0xfffffff0); // vector count (in bytes) 4270 jcc(Assembler::zero, COMPARE_TAIL); 4271 } else { 4272 andl(result, 0x0000001f); // tail count (in bytes) 4273 andl(limit, 0xffffffe0); // vector count (in bytes) 4274 jcc(Assembler::zero, COMPARE_TAIL_16); 4275 } 4276 4277 lea(ary1, Address(ary1, limit, scaleFactor)); 4278 lea(ary2, Address(ary2, limit, Address::times_1)); 4279 negptr(limit); 4280 4281 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4282 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4283 4284 cmpl(limit, -64); 4285 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4286 4287 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4288 4289 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4290 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4291 kortestql(mask, mask); 4292 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4293 addptr(limit, 64); // update since we already compared at this addr 4294 cmpl(limit, -64); 4295 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4296 4297 // At this point we may still need to compare -limit+result bytes. 4298 // We could execute the next two instruction and just continue via non-wide path: 4299 // cmpl(limit, 0); 4300 // jcc(Assembler::equal, COMPARE_TAIL); // true 4301 // But since we stopped at the points ary{1,2}+limit which are 4302 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4303 // (|limit| <= 32 and result < 32), 4304 // we may just compare the last 64 bytes. 
4305 // 4306 addptr(result, -64); // it is safe, bc we just came from this area 4307 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4308 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4309 kortestql(mask, mask); 4310 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4311 4312 jmp(TRUE_LABEL); 4313 4314 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4315 4316 }//if (VM_Version::supports_avx512vlbw()) 4317 4318 bind(COMPARE_WIDE_VECTORS); 4319 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4320 if (expand_ary2) { 4321 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4322 } else { 4323 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4324 } 4325 vpxor(vec1, vec2); 4326 4327 vptest(vec1, vec1); 4328 jcc(Assembler::notZero, FALSE_LABEL); 4329 addptr(limit, scaleIncr * 2); 4330 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4331 4332 testl(result, result); 4333 jcc(Assembler::zero, TRUE_LABEL); 4334 4335 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4336 if (expand_ary2) { 4337 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4338 } else { 4339 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4340 } 4341 vpxor(vec1, vec2); 4342 4343 vptest(vec1, vec1); 4344 jcc(Assembler::notZero, FALSE_LABEL); 4345 jmp(TRUE_LABEL); 4346 4347 bind(COMPARE_TAIL_16); // limit is zero 4348 movl(limit, result); 4349 4350 // Compare 16-byte chunks 4351 andl(result, 0x0000000f); // tail count (in bytes) 4352 andl(limit, 0xfffffff0); // vector count (in bytes) 4353 jcc(Assembler::zero, COMPARE_TAIL); 4354 4355 lea(ary1, Address(ary1, limit, scaleFactor)); 4356 lea(ary2, Address(ary2, limit, Address::times_1)); 4357 negptr(limit); 4358 4359 bind(COMPARE_WIDE_VECTORS_16); 4360 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4361 if (expand_ary2) { 4362 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4363 } else { 4364 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4365 } 4366 pxor(vec1, vec2); 4367 4368 ptest(vec1, vec1); 4369 jcc(Assembler::notZero, FALSE_LABEL); 4370 addptr(limit, scaleIncr); 4371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4372 4373 bind(COMPARE_TAIL); // limit is zero 4374 movl(limit, result); 4375 // Fallthru to tail compare 4376 } else if (UseSSE42Intrinsics) { 4377 // With SSE4.2, use double quad vector compare 4378 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4379 4380 // Compare 16-byte vectors 4381 andl(result, 0x0000000f); // tail count (in bytes) 4382 andl(limit, 0xfffffff0); // vector count (in bytes) 4383 jcc(Assembler::zero, COMPARE_TAIL); 4384 4385 lea(ary1, Address(ary1, limit, Address::times_1)); 4386 lea(ary2, Address(ary2, limit, Address::times_1)); 4387 negptr(limit); 4388 4389 bind(COMPARE_WIDE_VECTORS); 4390 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4391 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4392 pxor(vec1, vec2); 4393 4394 ptest(vec1, vec1); 4395 jcc(Assembler::notZero, FALSE_LABEL); 4396 addptr(limit, 16); 4397 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4398 4399 testl(result, result); 4400 jcc(Assembler::zero, TRUE_LABEL); 4401 4402 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4403 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4404 pxor(vec1, vec2); 4405 4406 ptest(vec1, vec1); 4407 jccb(Assembler::notZero, FALSE_LABEL); 4408 jmpb(TRUE_LABEL); 4409 4410 bind(COMPARE_TAIL); // limit is zero 4411 movl(limit, 
result); 4412 // Fallthru to tail compare 4413 } 4414 4415 // Compare 4-byte vectors 4416 if (expand_ary2) { 4417 testl(result, result); 4418 jccb(Assembler::zero, TRUE_LABEL); 4419 } else { 4420 andl(limit, 0xfffffffc); // vector count (in bytes) 4421 jccb(Assembler::zero, COMPARE_CHAR); 4422 } 4423 4424 lea(ary1, Address(ary1, limit, scaleFactor)); 4425 lea(ary2, Address(ary2, limit, Address::times_1)); 4426 negptr(limit); 4427 4428 bind(COMPARE_VECTORS); 4429 if (expand_ary2) { 4430 // There are no "vector" operations for bytes to shorts 4431 movzbl(chr, Address(ary2, limit, Address::times_1)); 4432 cmpw(Address(ary1, limit, Address::times_2), chr); 4433 jccb(Assembler::notEqual, FALSE_LABEL); 4434 addptr(limit, 1); 4435 jcc(Assembler::notZero, COMPARE_VECTORS); 4436 jmp(TRUE_LABEL); 4437 } else { 4438 movl(chr, Address(ary1, limit, Address::times_1)); 4439 cmpl(chr, Address(ary2, limit, Address::times_1)); 4440 jccb(Assembler::notEqual, FALSE_LABEL); 4441 addptr(limit, 4); 4442 jcc(Assembler::notZero, COMPARE_VECTORS); 4443 } 4444 4445 // Compare trailing char (final 2 bytes), if any 4446 bind(COMPARE_CHAR); 4447 testl(result, 0x2); // tail char 4448 jccb(Assembler::zero, COMPARE_BYTE); 4449 load_unsigned_short(chr, Address(ary1, 0)); 4450 load_unsigned_short(limit, Address(ary2, 0)); 4451 cmpl(chr, limit); 4452 jccb(Assembler::notEqual, FALSE_LABEL); 4453 4454 if (is_array_equ && is_char) { 4455 bind(COMPARE_BYTE); 4456 } else { 4457 lea(ary1, Address(ary1, 2)); 4458 lea(ary2, Address(ary2, 2)); 4459 4460 bind(COMPARE_BYTE); 4461 testl(result, 0x1); // tail byte 4462 jccb(Assembler::zero, TRUE_LABEL); 4463 load_unsigned_byte(chr, Address(ary1, 0)); 4464 load_unsigned_byte(limit, Address(ary2, 0)); 4465 cmpl(chr, limit); 4466 jccb(Assembler::notEqual, FALSE_LABEL); 4467 } 4468 bind(TRUE_LABEL); 4469 movl(result, 1); // return true 4470 jmpb(DONE); 4471 4472 bind(FALSE_LABEL); 4473 xorl(result, result); // return false 4474 4475 // That's it 4476 bind(DONE); 4477 if (UseAVX >= 2) { 4478 // clean upper bits of YMM registers 4479 vpxor(vec1, vec1); 4480 vpxor(vec2, vec2); 4481 } 4482 } 4483 4484 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4485 #define __ masm. 4486 Register dst = stub.data<0>(); 4487 XMMRegister src = stub.data<1>(); 4488 address target = stub.data<2>(); 4489 __ bind(stub.entry()); 4490 __ subptr(rsp, 8); 4491 __ movdbl(Address(rsp), src); 4492 __ call(RuntimeAddress(target)); 4493 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte. 
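// The fixup stub takes its operand from the stack slot written above and is
// expected to leave the corrected integer result in that same slot, which is
// then popped into 'dst'.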
4494 __ pop(dst); 4495 __ jmp(stub.continuation()); 4496 #undef __ 4497 } 4498 4499 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4500 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4501 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4502 4503 address slowpath_target; 4504 if (dst_bt == T_INT) { 4505 if (src_bt == T_FLOAT) { 4506 cvttss2sil(dst, src); 4507 cmpl(dst, 0x80000000); 4508 slowpath_target = StubRoutines::x86::f2i_fixup(); 4509 } else { 4510 cvttsd2sil(dst, src); 4511 cmpl(dst, 0x80000000); 4512 slowpath_target = StubRoutines::x86::d2i_fixup(); 4513 } 4514 } else { 4515 if (src_bt == T_FLOAT) { 4516 cvttss2siq(dst, src); 4517 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4518 slowpath_target = StubRoutines::x86::f2l_fixup(); 4519 } else { 4520 cvttsd2siq(dst, src); 4521 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4522 slowpath_target = StubRoutines::x86::d2l_fixup(); 4523 } 4524 } 4525 4526 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte. 4527 int max_size = 23 + (UseAPX ? 1 : 0); 4528 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4529 jcc(Assembler::equal, stub->entry()); 4530 bind(stub->continuation()); 4531 } 4532 4533 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4534 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4535 switch(ideal_opc) { 4536 case Op_LShiftVS: 4537 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4538 case Op_LShiftVI: 4539 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4540 case Op_LShiftVL: 4541 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4542 case Op_RShiftVS: 4543 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4544 case Op_RShiftVI: 4545 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4546 case Op_RShiftVL: 4547 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4548 case Op_URShiftVS: 4549 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4550 case Op_URShiftVI: 4551 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4552 case Op_URShiftVL: 4553 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4554 case Op_RotateRightV: 4555 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4556 case Op_RotateLeftV: 4557 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4558 default: 4559 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4560 break; 4561 } 4562 } 4563 4564 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4565 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4566 if (is_unsigned) { 4567 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4568 } else { 4569 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4570 } 4571 } 4572 4573 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4574 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4575 switch (elem_bt) { 4576 case T_BYTE: 4577 if (ideal_opc == Op_SaturatingAddV) { 4578 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4579 } else { 4580 
assert(ideal_opc == Op_SaturatingSubV, ""); 4581 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4582 } 4583 break; 4584 case T_SHORT: 4585 if (ideal_opc == Op_SaturatingAddV) { 4586 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4587 } else { 4588 assert(ideal_opc == Op_SaturatingSubV, ""); 4589 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4590 } 4591 break; 4592 default: 4593 fatal("Unsupported type %s", type2name(elem_bt)); 4594 break; 4595 } 4596 } 4597 4598 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4599 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4600 switch (elem_bt) { 4601 case T_BYTE: 4602 if (ideal_opc == Op_SaturatingAddV) { 4603 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4604 } else { 4605 assert(ideal_opc == Op_SaturatingSubV, ""); 4606 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4607 } 4608 break; 4609 case T_SHORT: 4610 if (ideal_opc == Op_SaturatingAddV) { 4611 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4612 } else { 4613 assert(ideal_opc == Op_SaturatingSubV, ""); 4614 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4615 } 4616 break; 4617 default: 4618 fatal("Unsupported type %s", type2name(elem_bt)); 4619 break; 4620 } 4621 } 4622 4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4624 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4625 if (is_unsigned) { 4626 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4627 } else { 4628 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4629 } 4630 } 4631 4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4633 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4634 switch (elem_bt) { 4635 case T_BYTE: 4636 if (ideal_opc == Op_SaturatingAddV) { 4637 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4638 } else { 4639 assert(ideal_opc == Op_SaturatingSubV, ""); 4640 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4641 } 4642 break; 4643 case T_SHORT: 4644 if (ideal_opc == Op_SaturatingAddV) { 4645 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4646 } else { 4647 assert(ideal_opc == Op_SaturatingSubV, ""); 4648 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4649 } 4650 break; 4651 default: 4652 fatal("Unsupported type %s", type2name(elem_bt)); 4653 break; 4654 } 4655 } 4656 4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4658 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4659 switch (elem_bt) { 4660 case T_BYTE: 4661 if (ideal_opc == Op_SaturatingAddV) { 4662 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4663 } else { 4664 assert(ideal_opc == Op_SaturatingSubV, ""); 4665 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4666 } 4667 break; 4668 case T_SHORT: 4669 if (ideal_opc == Op_SaturatingAddV) { 4670 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4671 } else { 4672 assert(ideal_opc == Op_SaturatingSubV, ""); 4673 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4674 } 4675 break; 4676 default: 4677 fatal("Unsupported type %s", type2name(elem_bt)); 4678 break; 4679 } 4680 } 4681 4682 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4683 XMMRegister src1, XMMRegister src2, 
bool merge, int vlen_enc, 4684 bool is_varshift) { 4685 switch (ideal_opc) { 4686 case Op_AddVB: 4687 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_AddVS: 4689 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_AddVI: 4691 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_AddVL: 4693 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_AddVF: 4695 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_AddVD: 4697 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_SubVB: 4699 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_SubVS: 4701 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_SubVI: 4703 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4704 case Op_SubVL: 4705 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4706 case Op_SubVF: 4707 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4708 case Op_SubVD: 4709 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4710 case Op_MulVS: 4711 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_MulVI: 4713 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_MulVL: 4715 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4716 case Op_MulVF: 4717 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4718 case Op_MulVD: 4719 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_DivVF: 4721 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_DivVD: 4723 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4724 case Op_SqrtVF: 4725 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4726 case Op_SqrtVD: 4727 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4728 case Op_AbsVB: 4729 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4730 case Op_AbsVS: 4731 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4732 case Op_AbsVI: 4733 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4734 case Op_AbsVL: 4735 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4736 case Op_FmaVF: 4737 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_FmaVD: 4739 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_VectorRearrange: 4741 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4742 case Op_LShiftVS: 4743 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4744 case Op_LShiftVI: 4745 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4746 case Op_LShiftVL: 4747 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4748 case Op_RShiftVS: 4749 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4750 case Op_RShiftVI: 4751 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4752 case Op_RShiftVL: 4753 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4754 case Op_URShiftVS: 4755 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4756 case Op_URShiftVI: 4757 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4758 case Op_URShiftVL: 4759 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4760 case Op_RotateLeftV: 4761 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4762 case Op_RotateRightV: 4763 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4764 case Op_MaxV: 4765 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4766 case Op_MinV: 4767 evpmins(eType, dst, mask, 
src1, src2, merge, vlen_enc); break; 4768 case Op_UMinV: 4769 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_UMaxV: 4771 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_XorV: 4773 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_OrV: 4775 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4776 case Op_AndV: 4777 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4778 default: 4779 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4780 break; 4781 } 4782 } 4783 4784 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4785 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4786 switch (ideal_opc) { 4787 case Op_AddVB: 4788 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_AddVS: 4790 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_AddVI: 4792 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_AddVL: 4794 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_AddVF: 4796 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_AddVD: 4798 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_SubVB: 4800 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_SubVS: 4802 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_SubVI: 4804 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_SubVL: 4806 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_SubVF: 4808 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_SubVD: 4810 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_MulVS: 4812 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_MulVI: 4814 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_MulVL: 4816 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_MulVF: 4818 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_MulVD: 4820 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_DivVF: 4822 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_DivVD: 4824 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4825 case Op_FmaVF: 4826 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4827 case Op_FmaVD: 4828 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4829 case Op_MaxV: 4830 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4831 case Op_MinV: 4832 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4833 case Op_UMaxV: 4834 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4835 case Op_UMinV: 4836 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4837 case Op_XorV: 4838 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4839 case Op_OrV: 4840 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4841 case Op_AndV: 4842 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4843 default: 4844 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4845 break; 4846 } 4847 } 4848 4849 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4850 KRegister src1, KRegister src2) { 4851 BasicType etype = T_ILLEGAL; 4852 switch(mask_len) { 4853 case 2: 4854 case 4: 4855 case 8: etype = T_BYTE; break; 4856 case 16: etype = T_SHORT; break; 4857 case 32: etype = T_INT; break; 4858 case 64: etype = T_LONG; break; 
4859 default: fatal("Unsupported type"); break; 4860 } 4861 assert(etype != T_ILLEGAL, ""); 4862 switch(ideal_opc) { 4863 case Op_AndVMask: 4864 kand(etype, dst, src1, src2); break; 4865 case Op_OrVMask: 4866 kor(etype, dst, src1, src2); break; 4867 case Op_XorVMask: 4868 kxor(etype, dst, src1, src2); break; 4869 default: 4870 fatal("Unsupported masked operation"); break; 4871 } 4872 } 4873 4874 /* 4875 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4876 * If src is NaN, the result is 0. 4877 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4878 * the result is equal to the value of Integer.MIN_VALUE. 4879 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4880 * the result is equal to the value of Integer.MAX_VALUE. 4881 */ 4882 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4883 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4884 Register rscratch, AddressLiteral float_sign_flip, 4885 int vec_enc) { 4886 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4887 Label done; 4888 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4889 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4890 vptest(xtmp2, xtmp2, vec_enc); 4891 jccb(Assembler::equal, done); 4892 4893 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4894 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4895 4896 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4897 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4898 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4899 4900 // Recompute the mask for remaining special value. 4901 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4902 // Extract SRC values corresponding to TRUE mask lanes. 4903 vpand(xtmp4, xtmp2, src, vec_enc); 4904 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4905 // values are set. 
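  // For special-value lanes xtmp2 is all ones, so xtmp2 ^ (xtmp2 & src) == ~src there:
  // its sign bit ends up set exactly when SRC holds a positive special value, and the
  // vblendvps below then substitutes Integer.MAX_VALUE (prepared in xtmp1) for just
  // those lanes, while negative specials keep the 0x80000000 (MIN_VALUE) pattern.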
4906 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4907 4908 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4909 bind(done); 4910 } 4911 4912 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4913 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4914 Register rscratch, AddressLiteral float_sign_flip, 4915 int vec_enc) { 4916 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4917 Label done; 4918 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4919 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4920 kortestwl(ktmp1, ktmp1); 4921 jccb(Assembler::equal, done); 4922 4923 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4924 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4925 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4926 4927 kxorwl(ktmp1, ktmp1, ktmp2); 4928 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4929 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4930 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4931 bind(done); 4932 } 4933 4934 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4935 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4936 Register rscratch, AddressLiteral double_sign_flip, 4937 int vec_enc) { 4938 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4939 4940 Label done; 4941 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4942 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4943 kortestwl(ktmp1, ktmp1); 4944 jccb(Assembler::equal, done); 4945 4946 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4947 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4948 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4949 4950 kxorwl(ktmp1, ktmp1, ktmp2); 4951 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4952 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4953 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4954 bind(done); 4955 } 4956 4957 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4958 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4959 Register rscratch, AddressLiteral float_sign_flip, 4960 int vec_enc) { 4961 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4962 Label done; 4963 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4964 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4965 kortestwl(ktmp1, ktmp1); 4966 jccb(Assembler::equal, done); 4967 4968 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4969 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4970 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4971 4972 kxorwl(ktmp1, ktmp1, ktmp2); 4973 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4974 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4975 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4976 bind(done); 4977 } 4978 4979 /* 4980 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4981 * If src is NaN, the result is 0. 4982 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4983 * the result is equal to the value of Long.MIN_VALUE. 4984 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4985 * the result is equal to the value of Long.MAX_VALUE. 
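 * For example, matching Java's narrowing semantics: (long) Double.NaN == 0L,
 * (long) -1.0e30 == Long.MIN_VALUE and (long) 1.0e30 == Long.MAX_VALUE.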
4986 */ 4987 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4988 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4989 Register rscratch, AddressLiteral double_sign_flip, 4990 int vec_enc) { 4991 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4992 4993 Label done; 4994 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4995 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4996 kortestwl(ktmp1, ktmp1); 4997 jccb(Assembler::equal, done); 4998 4999 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5000 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5001 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5002 5003 kxorwl(ktmp1, ktmp1, ktmp2); 5004 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5005 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5006 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5007 bind(done); 5008 } 5009 5010 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5011 XMMRegister xtmp, int index, int vec_enc) { 5012 assert(vec_enc < Assembler::AVX_512bit, ""); 5013 if (vec_enc == Assembler::AVX_256bit) { 5014 vextractf128_high(xtmp, src); 5015 vshufps(dst, src, xtmp, index, vec_enc); 5016 } else { 5017 vshufps(dst, src, zero, index, vec_enc); 5018 } 5019 } 5020 5021 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5022 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5023 AddressLiteral float_sign_flip, int src_vec_enc) { 5024 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5025 5026 Label done; 5027 // Compare the destination lanes with float_sign_flip 5028 // value to get mask for all special values. 5029 movdqu(xtmp1, float_sign_flip, rscratch); 5030 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5031 ptest(xtmp2, xtmp2); 5032 jccb(Assembler::equal, done); 5033 5034 // Flip float_sign_flip to get max integer value. 5035 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5036 pxor(xtmp1, xtmp4); 5037 5038 // Set detination lanes corresponding to unordered source lanes as zero. 5039 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5040 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5041 5042 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5043 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5044 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5045 5046 // Recompute the mask for remaining special value. 5047 pxor(xtmp2, xtmp3); 5048 // Extract mask corresponding to non-negative source lanes. 5049 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5050 5051 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5052 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5053 pand(xtmp3, xtmp2); 5054 5055 // Replace destination lanes holding special value(0x80000000) with max int 5056 // if corresponding source lane holds a +ve value. 
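  // xtmp1 was flipped above from 0x80000000 to 0x7FFFFFFF (Integer.MAX_VALUE), and xtmp3
  // now flags the remaining special lanes whose source is non-negative, so the blend below
  // rewrites exactly those lanes to MAX_VALUE while negative specials keep MIN_VALUE.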
5057 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5058 bind(done); 5059 } 5060 5061 5062 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5063 XMMRegister xtmp, Register rscratch, int vec_enc) { 5064 switch(to_elem_bt) { 5065 case T_SHORT: 5066 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5067 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5068 vpackusdw(dst, dst, zero, vec_enc); 5069 if (vec_enc == Assembler::AVX_256bit) { 5070 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5071 } 5072 break; 5073 case T_BYTE: 5074 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5075 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5076 vpackusdw(dst, dst, zero, vec_enc); 5077 if (vec_enc == Assembler::AVX_256bit) { 5078 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5079 } 5080 vpackuswb(dst, dst, zero, vec_enc); 5081 break; 5082 default: assert(false, "%s", type2name(to_elem_bt)); 5083 } 5084 } 5085 5086 /* 5087 * Algorithm for vector D2L and F2I conversions:- 5088 * a) Perform vector D2L/F2I cast. 5089 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5090 * It signifies that source value could be any of the special floating point 5091 * values(NaN,-Inf,Inf,Max,-Min). 5092 * c) Set destination to zero if source is NaN value. 5093 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5094 */ 5095 5096 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5097 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5098 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5099 int to_elem_sz = type2aelembytes(to_elem_bt); 5100 assert(to_elem_sz <= 4, ""); 5101 vcvttps2dq(dst, src, vec_enc); 5102 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5103 if (to_elem_sz < 4) { 5104 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5105 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5106 } 5107 } 5108 5109 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5110 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5111 Register rscratch, int vec_enc) { 5112 int to_elem_sz = type2aelembytes(to_elem_bt); 5113 assert(to_elem_sz <= 4, ""); 5114 vcvttps2dq(dst, src, vec_enc); 5115 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5116 switch(to_elem_bt) { 5117 case T_INT: 5118 break; 5119 case T_SHORT: 5120 evpmovdw(dst, dst, vec_enc); 5121 break; 5122 case T_BYTE: 5123 evpmovdb(dst, dst, vec_enc); 5124 break; 5125 default: assert(false, "%s", type2name(to_elem_bt)); 5126 } 5127 } 5128 5129 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5130 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5131 Register rscratch, int vec_enc) { 5132 evcvttps2qq(dst, src, vec_enc); 5133 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5134 } 5135 5136 // Handling for downcasting from double to integer or sub-word types on AVX2. 5137 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5138 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5139 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5140 int to_elem_sz = type2aelembytes(to_elem_bt); 5141 assert(to_elem_sz < 8, ""); 5142 vcvttpd2dq(dst, src, vec_enc); 5143 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5144 float_sign_flip, vec_enc); 5145 if (to_elem_sz < 4) { 5146 // xtmp4 holds all zero lanes. 5147 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5148 } 5149 } 5150 5151 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5152 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5153 KRegister ktmp2, AddressLiteral sign_flip, 5154 Register rscratch, int vec_enc) { 5155 if (VM_Version::supports_avx512dq()) { 5156 evcvttpd2qq(dst, src, vec_enc); 5157 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5158 switch(to_elem_bt) { 5159 case T_LONG: 5160 break; 5161 case T_INT: 5162 evpmovsqd(dst, dst, vec_enc); 5163 break; 5164 case T_SHORT: 5165 evpmovsqd(dst, dst, vec_enc); 5166 evpmovdw(dst, dst, vec_enc); 5167 break; 5168 case T_BYTE: 5169 evpmovsqd(dst, dst, vec_enc); 5170 evpmovdb(dst, dst, vec_enc); 5171 break; 5172 default: assert(false, "%s", type2name(to_elem_bt)); 5173 } 5174 } else { 5175 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5176 vcvttpd2dq(dst, src, vec_enc); 5177 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5178 switch(to_elem_bt) { 5179 case T_INT: 5180 break; 5181 case T_SHORT: 5182 evpmovdw(dst, dst, vec_enc); 5183 break; 5184 case T_BYTE: 5185 evpmovdb(dst, dst, vec_enc); 5186 break; 5187 default: assert(false, "%s", type2name(to_elem_bt)); 5188 } 5189 } 5190 } 5191 5192 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5193 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5194 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5195 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5196 // and re-instantiate original MXCSR.RC mode after that. 5197 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5198 5199 mov64(tmp, julong_cast(0.5L)); 5200 evpbroadcastq(xtmp1, tmp, vec_enc); 5201 vaddpd(xtmp1, src , xtmp1, vec_enc); 5202 evcvtpd2qq(dst, xtmp1, vec_enc); 5203 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5204 double_sign_flip, vec_enc);; 5205 5206 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5207 } 5208 5209 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5210 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5211 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5212 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5213 // and re-instantiate original MXCSR.RC mode after that. 
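  // With RC forced to round-toward-negative-infinity, cvtps2dq(x + 0.5f) evaluates
  // floor(x + 0.5f), which is Math.round's half-up behaviour; e.g. 2.5f -> floor(3.0f) == 3
  // and -2.5f -> floor(-2.0f) == -2.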
5214 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5215 5216 movl(tmp, jint_cast(0.5)); 5217 movq(xtmp1, tmp); 5218 vbroadcastss(xtmp1, xtmp1, vec_enc); 5219 vaddps(xtmp1, src , xtmp1, vec_enc); 5220 vcvtps2dq(dst, xtmp1, vec_enc); 5221 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5222 float_sign_flip, vec_enc); 5223 5224 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5225 } 5226 5227 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5228 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5229 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5230 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5231 // and re-instantiate original MXCSR.RC mode after that. 5232 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5233 5234 movl(tmp, jint_cast(0.5)); 5235 movq(xtmp1, tmp); 5236 vbroadcastss(xtmp1, xtmp1, vec_enc); 5237 vaddps(xtmp1, src , xtmp1, vec_enc); 5238 vcvtps2dq(dst, xtmp1, vec_enc); 5239 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5240 5241 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5242 } 5243 5244 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5245 BasicType from_elem_bt, BasicType to_elem_bt) { 5246 switch (from_elem_bt) { 5247 case T_BYTE: 5248 switch (to_elem_bt) { 5249 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5250 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5251 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5252 default: ShouldNotReachHere(); 5253 } 5254 break; 5255 case T_SHORT: 5256 switch (to_elem_bt) { 5257 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5258 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5259 default: ShouldNotReachHere(); 5260 } 5261 break; 5262 case T_INT: 5263 assert(to_elem_bt == T_LONG, ""); 5264 vpmovzxdq(dst, src, vlen_enc); 5265 break; 5266 default: 5267 ShouldNotReachHere(); 5268 } 5269 } 5270 5271 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5272 BasicType from_elem_bt, BasicType to_elem_bt) { 5273 switch (from_elem_bt) { 5274 case T_BYTE: 5275 switch (to_elem_bt) { 5276 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5277 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5278 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5279 default: ShouldNotReachHere(); 5280 } 5281 break; 5282 case T_SHORT: 5283 switch (to_elem_bt) { 5284 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5285 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5286 default: ShouldNotReachHere(); 5287 } 5288 break; 5289 case T_INT: 5290 assert(to_elem_bt == T_LONG, ""); 5291 vpmovsxdq(dst, src, vlen_enc); 5292 break; 5293 default: 5294 ShouldNotReachHere(); 5295 } 5296 } 5297 5298 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5299 BasicType dst_bt, BasicType src_bt, int vlen) { 5300 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5301 assert(vlen_enc != AVX_512bit, ""); 5302 5303 int dst_bt_size = type2aelembytes(dst_bt); 5304 int src_bt_size = type2aelembytes(src_bt); 5305 if (dst_bt_size > src_bt_size) { 5306 switch (dst_bt_size / src_bt_size) { 5307 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5308 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5309 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5310 default: ShouldNotReachHere(); 5311 } 5312 } else { 5313 assert(dst_bt_size < src_bt_size, ""); 5314 switch (src_bt_size / dst_bt_size) { 5315 case 2: { 5316 if (vlen_enc == AVX_128bit) { 5317 vpacksswb(dst, src, src, vlen_enc); 5318 } else { 5319 vpacksswb(dst, src, src, vlen_enc); 5320 vpermq(dst, dst, 0x08, vlen_enc); 5321 } 5322 break; 5323 } 5324 case 4: { 5325 if (vlen_enc == AVX_128bit) { 5326 vpackssdw(dst, src, src, vlen_enc); 5327 vpacksswb(dst, dst, dst, vlen_enc); 5328 } else { 5329 vpackssdw(dst, src, src, vlen_enc); 5330 vpermq(dst, dst, 0x08, vlen_enc); 5331 vpacksswb(dst, dst, dst, AVX_128bit); 5332 } 5333 break; 5334 } 5335 case 8: { 5336 if (vlen_enc == AVX_128bit) { 5337 vpshufd(dst, src, 0x08, vlen_enc); 5338 vpackssdw(dst, dst, dst, vlen_enc); 5339 vpacksswb(dst, dst, dst, vlen_enc); 5340 } else { 5341 vpshufd(dst, src, 0x08, vlen_enc); 5342 vpermq(dst, dst, 0x08, vlen_enc); 5343 vpackssdw(dst, dst, dst, AVX_128bit); 5344 vpacksswb(dst, dst, dst, AVX_128bit); 5345 } 5346 break; 5347 } 5348 default: ShouldNotReachHere(); 5349 } 5350 } 5351 } 5352 5353 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5354 bool merge, BasicType bt, int vlen_enc) { 5355 if (bt == T_INT) { 5356 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5357 } else { 5358 assert(bt == T_LONG, ""); 5359 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5360 } 5361 } 5362 5363 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5364 bool merge, BasicType bt, int vlen_enc) { 5365 if (bt == T_INT) { 5366 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5367 } else { 5368 assert(bt == T_LONG, ""); 5369 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5370 } 5371 } 5372 5373 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5374 Register rtmp2, XMMRegister xtmp, int mask_len, 5375 int vec_enc) { 5376 int index = 0; 5377 int vindex = 0; 5378 mov64(rtmp1, 0x0101010101010101L); 5379 pdepq(rtmp1, src, rtmp1); 5380 if (mask_len > 8) { 5381 movq(rtmp2, src); 5382 vpxor(xtmp, xtmp, xtmp, vec_enc); 5383 movq(xtmp, rtmp1); 5384 } 5385 movq(dst, rtmp1); 5386 5387 mask_len -= 8; 5388 while (mask_len > 0) { 5389 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5390 index++; 5391 if ((index % 2) == 0) { 5392 pxor(xtmp, xtmp); 5393 } 5394 mov64(rtmp1, 0x0101010101010101L); 5395 shrq(rtmp2, 8); 5396 pdepq(rtmp1, rtmp2, rtmp1); 5397 pinsrq(xtmp, rtmp1, index % 2); 5398 vindex = index / 2; 5399 if (vindex) { 5400 // Write entire 16 byte vector when both 64 bit 5401 // lanes are update to save redundant instructions. 
5402 if (index % 2) { 5403 vinsertf128(dst, dst, xtmp, vindex); 5404 } 5405 } else { 5406 vmovdqu(dst, xtmp); 5407 } 5408 mask_len -= 8; 5409 } 5410 } 5411 5412 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5413 switch(opc) { 5414 case Op_VectorMaskTrueCount: 5415 popcntq(dst, tmp); 5416 break; 5417 case Op_VectorMaskLastTrue: 5418 if (VM_Version::supports_lzcnt()) { 5419 lzcntq(tmp, tmp); 5420 movl(dst, 63); 5421 subl(dst, tmp); 5422 } else { 5423 movl(dst, -1); 5424 bsrq(tmp, tmp); 5425 cmov32(Assembler::notZero, dst, tmp); 5426 } 5427 break; 5428 case Op_VectorMaskFirstTrue: 5429 if (VM_Version::supports_bmi1()) { 5430 if (masklen < 32) { 5431 orl(tmp, 1 << masklen); 5432 tzcntl(dst, tmp); 5433 } else if (masklen == 32) { 5434 tzcntl(dst, tmp); 5435 } else { 5436 assert(masklen == 64, ""); 5437 tzcntq(dst, tmp); 5438 } 5439 } else { 5440 if (masklen < 32) { 5441 orl(tmp, 1 << masklen); 5442 bsfl(dst, tmp); 5443 } else { 5444 assert(masklen == 32 || masklen == 64, ""); 5445 movl(dst, masklen); 5446 if (masklen == 32) { 5447 bsfl(tmp, tmp); 5448 } else { 5449 bsfq(tmp, tmp); 5450 } 5451 cmov32(Assembler::notZero, dst, tmp); 5452 } 5453 } 5454 break; 5455 case Op_VectorMaskToLong: 5456 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5457 break; 5458 default: assert(false, "Unhandled mask operation"); 5459 } 5460 } 5461 5462 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5463 int masklen, int masksize, int vec_enc) { 5464 assert(VM_Version::supports_popcnt(), ""); 5465 5466 if(VM_Version::supports_avx512bw()) { 5467 kmovql(tmp, mask); 5468 } else { 5469 assert(masklen <= 16, ""); 5470 kmovwl(tmp, mask); 5471 } 5472 5473 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5474 // operations needs to be clipped. 5475 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5476 andq(tmp, (1 << masklen) - 1); 5477 } 5478 5479 vector_mask_operation_helper(opc, dst, tmp, masklen); 5480 } 5481 5482 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5483 Register tmp, int masklen, BasicType bt, int vec_enc) { 5484 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5485 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5486 assert(VM_Version::supports_popcnt(), ""); 5487 5488 bool need_clip = false; 5489 switch(bt) { 5490 case T_BOOLEAN: 5491 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5492 vpxor(xtmp, xtmp, xtmp, vec_enc); 5493 vpsubb(xtmp, xtmp, mask, vec_enc); 5494 vpmovmskb(tmp, xtmp, vec_enc); 5495 need_clip = masklen < 16; 5496 break; 5497 case T_BYTE: 5498 vpmovmskb(tmp, mask, vec_enc); 5499 need_clip = masklen < 16; 5500 break; 5501 case T_SHORT: 5502 vpacksswb(xtmp, mask, mask, vec_enc); 5503 if (masklen >= 16) { 5504 vpermpd(xtmp, xtmp, 8, vec_enc); 5505 } 5506 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5507 need_clip = masklen < 16; 5508 break; 5509 case T_INT: 5510 case T_FLOAT: 5511 vmovmskps(tmp, mask, vec_enc); 5512 need_clip = masklen < 4; 5513 break; 5514 case T_LONG: 5515 case T_DOUBLE: 5516 vmovmskpd(tmp, mask, vec_enc); 5517 need_clip = masklen < 2; 5518 break; 5519 default: assert(false, "Unhandled type, %s", type2name(bt)); 5520 } 5521 5522 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5523 // operations needs to be clipped. 
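  // For example, a mask for a 2-element int vector still occupies a full 128-bit register,
  // so vmovmskps above returns 4 bits of which only the low masklen == 2 are meaningful;
  // the stray upper bits are cleared with (1 << masklen) - 1.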
5524 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5525 // need_clip implies masklen < 32 5526 andq(tmp, (1 << masklen) - 1); 5527 } 5528 5529 vector_mask_operation_helper(opc, dst, tmp, masklen); 5530 } 5531 5532 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5533 Register rtmp2, int mask_len) { 5534 kmov(rtmp1, src); 5535 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5536 mov64(rtmp2, -1L); 5537 pextq(rtmp2, rtmp2, rtmp1); 5538 kmov(dst, rtmp2); 5539 } 5540 5541 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5542 XMMRegister mask, Register rtmp, Register rscratch, 5543 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5544 int vec_enc) { 5545 assert(type2aelembytes(bt) >= 4, ""); 5546 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5547 address compress_perm_table = nullptr; 5548 address expand_perm_table = nullptr; 5549 if (type2aelembytes(bt) == 8) { 5550 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5551 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5552 vmovmskpd(rtmp, mask, vec_enc); 5553 } else { 5554 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5555 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5556 vmovmskps(rtmp, mask, vec_enc); 5557 } 5558 shlq(rtmp, 5); // for 32 byte permute row. 5559 if (opcode == Op_CompressV) { 5560 lea(rscratch, ExternalAddress(compress_perm_table)); 5561 } else { 5562 lea(rscratch, ExternalAddress(expand_perm_table)); 5563 } 5564 addptr(rtmp, rscratch); 5565 vmovdqu(permv, Address(rtmp)); 5566 vpermps(dst, permv, src, Assembler::AVX_256bit); 5567 vpxor(xtmp, xtmp, xtmp, vec_enc); 5568 // Blend the result with zero vector using permute mask, each column entry 5569 // in a permute table row contains either a valid permute index or a -1 (default) 5570 // value, this can potentially be used as a blending mask after 5571 // compressing/expanding the source vector lanes. 
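  // The -1 entries have their sign bit set, so the blend below takes those lanes from the
  // zero vector (xtmp), while lanes selected by a genuine permute index keep the
  // compressed/expanded value.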
5572 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5573 } 5574 5575 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5576 bool merge, BasicType bt, int vec_enc) { 5577 if (opcode == Op_CompressV) { 5578 switch(bt) { 5579 case T_BYTE: 5580 evpcompressb(dst, mask, src, merge, vec_enc); 5581 break; 5582 case T_CHAR: 5583 case T_SHORT: 5584 evpcompressw(dst, mask, src, merge, vec_enc); 5585 break; 5586 case T_INT: 5587 evpcompressd(dst, mask, src, merge, vec_enc); 5588 break; 5589 case T_FLOAT: 5590 evcompressps(dst, mask, src, merge, vec_enc); 5591 break; 5592 case T_LONG: 5593 evpcompressq(dst, mask, src, merge, vec_enc); 5594 break; 5595 case T_DOUBLE: 5596 evcompresspd(dst, mask, src, merge, vec_enc); 5597 break; 5598 default: 5599 fatal("Unsupported type %s", type2name(bt)); 5600 break; 5601 } 5602 } else { 5603 assert(opcode == Op_ExpandV, ""); 5604 switch(bt) { 5605 case T_BYTE: 5606 evpexpandb(dst, mask, src, merge, vec_enc); 5607 break; 5608 case T_CHAR: 5609 case T_SHORT: 5610 evpexpandw(dst, mask, src, merge, vec_enc); 5611 break; 5612 case T_INT: 5613 evpexpandd(dst, mask, src, merge, vec_enc); 5614 break; 5615 case T_FLOAT: 5616 evexpandps(dst, mask, src, merge, vec_enc); 5617 break; 5618 case T_LONG: 5619 evpexpandq(dst, mask, src, merge, vec_enc); 5620 break; 5621 case T_DOUBLE: 5622 evexpandpd(dst, mask, src, merge, vec_enc); 5623 break; 5624 default: 5625 fatal("Unsupported type %s", type2name(bt)); 5626 break; 5627 } 5628 } 5629 } 5630 5631 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5632 KRegister ktmp1, int vec_enc) { 5633 if (opcode == Op_SignumVD) { 5634 vsubpd(dst, zero, one, vec_enc); 5635 // if src < 0 ? -1 : 1 5636 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5637 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5638 // if src == NaN, -0.0 or 0.0 return src. 5639 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5640 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5641 } else { 5642 assert(opcode == Op_SignumVF, ""); 5643 vsubps(dst, zero, one, vec_enc); 5644 // if src < 0 ? -1 : 1 5645 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5646 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5647 // if src == NaN, -0.0 or 0.0 return src. 5648 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5649 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5650 } 5651 } 5652 5653 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5654 XMMRegister xtmp1, int vec_enc) { 5655 if (opcode == Op_SignumVD) { 5656 vsubpd(dst, zero, one, vec_enc); 5657 // if src < 0 ? -1 : 1 5658 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5659 // if src == NaN, -0.0 or 0.0 return src. 5660 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5661 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5662 } else { 5663 assert(opcode == Op_SignumVF, ""); 5664 vsubps(dst, zero, one, vec_enc); 5665 // if src < 0 ? -1 : 1 5666 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5667 // if src == NaN, -0.0 or 0.0 return src. 
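  // EQ_UQ compares as "equal or unordered", so a single compare against zero flags
  // +0.0, -0.0 (which compares equal to 0.0) and NaN lanes in one step.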
5668 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5669 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5670 } 5671 } 5672 5673 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5674 if (VM_Version::supports_avx512bw()) { 5675 if (mask_len > 32) { 5676 kmovql(dst, src); 5677 } else { 5678 kmovdl(dst, src); 5679 if (mask_len != 32) { 5680 kshiftrdl(dst, dst, 32 - mask_len); 5681 } 5682 } 5683 } else { 5684 assert(mask_len <= 16, ""); 5685 kmovwl(dst, src); 5686 if (mask_len != 16) { 5687 kshiftrwl(dst, dst, 16 - mask_len); 5688 } 5689 } 5690 } 5691 5692 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5693 int lane_size = type2aelembytes(bt); 5694 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5695 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5696 movptr(rtmp, imm32); 5697 switch(lane_size) { 5698 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5699 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5700 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5701 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5702 default : fatal("Unsupported lane size %d", lane_size); 5703 break; 5704 } 5705 } else { 5706 movptr(rtmp, imm32); 5707 movq(dst, rtmp); 5708 switch(lane_size) { 5709 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5710 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5711 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5712 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5713 default : fatal("Unsupported lane size %d", lane_size); 5714 break; 5715 } 5716 } 5717 } 5718 5719 // 5720 // Following is lookup table based popcount computation algorithm:- 5721 // Index Bit set count 5722 // [ 0000 -> 0, 5723 // 0001 -> 1, 5724 // 0010 -> 1, 5725 // 0011 -> 2, 5726 // 0100 -> 1, 5727 // 0101 -> 2, 5728 // 0110 -> 2, 5729 // 0111 -> 3, 5730 // 1000 -> 1, 5731 // 1001 -> 2, 5732 // 1010 -> 2, 5733 // 1011 -> 3, 5734 // 1100 -> 2, 5735 // 1101 -> 3, 1110 -> 3, 5736 // 1111 -> 4 ] 5737 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5738 // shuffle indices for lookup table access. 5739 // b. Right shift each byte of vector lane by 4 positions. 5740 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5741 // shuffle indices for lookup table access. 5742 // d. Add the bitset count of upper and lower 4 bits of each byte. 5743 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5744 // count of all the bytes of a quadword. 5745 // f. Perform step e. for upper 128bit vector lane. 5746 // g. Pack the bitset count of quadwords back to double word. 5747 // h. Unpacking and packing operations are not needed for 64bit vector lane.
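// As an illustration of steps a. to d. above, a scalar equivalent of the per-byte
// lookup could look like the sketch below. The helper is purely illustrative (its
// name is made up for documentation and it is not used anywhere); the vector
// routines that follow apply the same nibble lookup to every byte lane via vpshufb.
static inline int popcount_byte_nibble_lut_sketch(unsigned char b) {
  // Bit-set count for each of the 16 possible nibble values.
  static const unsigned char lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4 };
  int lo = lut[b & 0x0F];         // a. count the 1s in the 4 LSB bits
  int hi = lut[(b >> 4) & 0x0F];  // b./c. shift by 4 and count the 4 MSB bits
  return lo + hi;                 // d. add the two nibble counts
}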
5748 5749 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5750 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5751 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5752 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5753 vpsrlw(dst, src, 4, vec_enc); 5754 vpand(dst, dst, xtmp1, vec_enc); 5755 vpand(xtmp1, src, xtmp1, vec_enc); 5756 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5757 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5758 vpshufb(dst, xtmp2, dst, vec_enc); 5759 vpaddb(dst, dst, xtmp1, vec_enc); 5760 } 5761 5762 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5763 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5764 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5765 // Following code is as per steps e,f,g and h of above algorithm. 5766 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5767 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5768 vpsadbw(dst, dst, xtmp2, vec_enc); 5769 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5770 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5771 vpackuswb(dst, xtmp1, dst, vec_enc); 5772 } 5773 5774 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5775 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5776 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5777 // Add the popcount of upper and lower bytes of word. 5778 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5779 vpsrlw(dst, xtmp1, 8, vec_enc); 5780 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5781 vpaddw(dst, dst, xtmp1, vec_enc); 5782 } 5783 5784 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5785 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5786 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5787 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5788 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5789 } 5790 5791 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5792 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5793 switch(bt) { 5794 case T_LONG: 5795 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5796 break; 5797 case T_INT: 5798 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5799 break; 5800 case T_CHAR: 5801 case T_SHORT: 5802 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5803 break; 5804 case T_BYTE: 5805 case T_BOOLEAN: 5806 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5807 break; 5808 default: 5809 fatal("Unsupported type %s", type2name(bt)); 5810 break; 5811 } 5812 } 5813 5814 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5815 KRegister mask, bool merge, int vec_enc) { 5816 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5817 switch(bt) { 5818 case T_LONG: 5819 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5820 evpopcntq(dst, mask, src, merge, vec_enc); 5821 break; 5822 case T_INT: 5823 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5824 evpopcntd(dst, mask, src, merge, vec_enc); 5825 break; 5826 case T_CHAR: 5827 case T_SHORT: 5828 assert(VM_Version::supports_avx512_bitalg(), ""); 5829 evpopcntw(dst, mask, src, merge, vec_enc); 5830 break; 5831 case T_BYTE: 5832 case T_BOOLEAN: 5833 assert(VM_Version::supports_avx512_bitalg(), ""); 5834 evpopcntb(dst, mask, 
src, merge, vec_enc); 5835 break; 5836 default: 5837 fatal("Unsupported type %s", type2name(bt)); 5838 break; 5839 } 5840 } 5841 5842 // Bit reversal algorithm first reverses the bits of each byte followed by 5843 // a byte level reversal for multi-byte primitive types (short/int/long). 5844 // Algorithm performs a lookup table access to get reverse bit sequence 5845 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5846 // is obtained by swapping the reverse bit sequences of upper and lower 5847 // nibble of a byte. 5848 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5849 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5850 if (VM_Version::supports_avx512vlbw()) { 5851 5852 // Get the reverse bit sequence of lower nibble of each byte. 5853 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5854 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5855 evpandq(dst, xtmp2, src, vec_enc); 5856 vpshufb(dst, xtmp1, dst, vec_enc); 5857 vpsllq(dst, dst, 4, vec_enc); 5858 5859 // Get the reverse bit sequence of upper nibble of each byte. 5860 vpandn(xtmp2, xtmp2, src, vec_enc); 5861 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5862 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5863 5864 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5865 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5866 evporq(xtmp2, dst, xtmp2, vec_enc); 5867 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5868 5869 } else if(vec_enc == Assembler::AVX_512bit) { 5870 // Shift based bit reversal. 5871 assert(bt == T_LONG || bt == T_INT, ""); 5872 5873 // Swap lower and upper nibble of each byte. 5874 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5875 5876 // Swap two least and most significant bits of each nibble. 5877 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5878 5879 // Swap adjacent pair of bits. 5880 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5881 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5882 5883 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5884 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5885 } else { 5886 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5887 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5888 5889 // Get the reverse bit sequence of lower nibble of each byte. 5890 vpand(dst, xtmp2, src, vec_enc); 5891 vpshufb(dst, xtmp1, dst, vec_enc); 5892 vpsllq(dst, dst, 4, vec_enc); 5893 5894 // Get the reverse bit sequence of upper nibble of each byte. 5895 vpandn(xtmp2, xtmp2, src, vec_enc); 5896 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5897 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5898 5899 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5900 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5901 vpor(xtmp2, dst, xtmp2, vec_enc); 5902 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5903 } 5904 } 5905 5906 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5907 XMMRegister xtmp, Register rscratch) { 5908 assert(VM_Version::supports_gfni(), ""); 5909 assert(rscratch != noreg || always_reachable(mask), "missing"); 5910 5911 // Galois field instruction based bit reversal based on following algorithm. 
5912 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5913 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5914 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5915 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5916 } 5917 5918 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5919 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5920 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5921 evpandq(dst, xtmp1, src, vec_enc); 5922 vpsllq(dst, dst, nbits, vec_enc); 5923 vpandn(xtmp1, xtmp1, src, vec_enc); 5924 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5925 evporq(dst, dst, xtmp1, vec_enc); 5926 } 5927 5928 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5929 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5930 // Shift based bit reversal. 5931 assert(VM_Version::supports_evex(), ""); 5932 switch(bt) { 5933 case T_LONG: 5934 // Swap upper and lower double word of each quad word. 5935 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5936 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5937 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5938 break; 5939 case T_INT: 5940 // Swap upper and lower word of each double word. 5941 evprord(xtmp1, k0, src, 16, true, vec_enc); 5942 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5943 break; 5944 case T_CHAR: 5945 case T_SHORT: 5946 // Swap upper and lower byte of each word. 5947 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5948 break; 5949 case T_BYTE: 5950 evmovdquq(dst, k0, src, true, vec_enc); 5951 break; 5952 default: 5953 fatal("Unsupported type %s", type2name(bt)); 5954 break; 5955 } 5956 } 5957 5958 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5959 if (bt == T_BYTE) { 5960 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5961 evmovdquq(dst, k0, src, true, vec_enc); 5962 } else { 5963 vmovdqu(dst, src); 5964 } 5965 return; 5966 } 5967 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5968 // pre-computed shuffle indices. 
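  // For T_INT, for instance, the pre-computed mask amounts to in-lane byte indices
  // 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12, i.e. the bytes of every 4-byte element
  // are mirrored in place.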
5969 switch(bt) { 5970 case T_LONG: 5971 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5972 break; 5973 case T_INT: 5974 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5975 break; 5976 case T_CHAR: 5977 case T_SHORT: 5978 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5979 break; 5980 default: 5981 fatal("Unsupported type %s", type2name(bt)); 5982 break; 5983 } 5984 vpshufb(dst, src, dst, vec_enc); 5985 } 5986 5987 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5988 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5989 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5990 assert(is_integral_type(bt), ""); 5991 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5992 assert(VM_Version::supports_avx512cd(), ""); 5993 switch(bt) { 5994 case T_LONG: 5995 evplzcntq(dst, ktmp, src, merge, vec_enc); 5996 break; 5997 case T_INT: 5998 evplzcntd(dst, ktmp, src, merge, vec_enc); 5999 break; 6000 case T_SHORT: 6001 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6002 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6003 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6004 vpunpckhwd(dst, xtmp1, src, vec_enc); 6005 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6006 vpackusdw(dst, xtmp2, dst, vec_enc); 6007 break; 6008 case T_BYTE: 6009 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6010 // accessing the lookup table. 6011 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6012 // accessing the lookup table. 6013 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6014 assert(VM_Version::supports_avx512bw(), ""); 6015 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6016 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6017 vpand(xtmp2, dst, src, vec_enc); 6018 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6019 vpsrlw(xtmp3, src, 4, vec_enc); 6020 vpand(xtmp3, dst, xtmp3, vec_enc); 6021 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6022 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6023 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6024 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6025 break; 6026 default: 6027 fatal("Unsupported type %s", type2name(bt)); 6028 break; 6029 } 6030 } 6031 6032 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6033 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6034 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6035 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6036 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6037 // accessing the lookup table. 6038 vpand(dst, xtmp2, src, vec_enc); 6039 vpshufb(dst, xtmp1, dst, vec_enc); 6040 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6041 // accessing the lookup table. 6042 vpsrlw(xtmp3, src, 4, vec_enc); 6043 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6044 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6045 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
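  // The lookup table maps 0 to 4, so wherever the high nibble is zero T2 == 4 and
  // T1 + T2 is the byte's full leading zero count; elsewhere T2 alone is the answer.
  // The compare/blend below selects between the two.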
6046 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6047 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6048 vpaddb(dst, dst, xtmp2, vec_enc); 6049 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6050 } 6051 6052 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6053 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6054 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6055 // Add zero counts of lower byte and upper byte of a word if 6056 // upper byte holds a zero value. 6057 vpsrlw(xtmp3, src, 8, vec_enc); 6058 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6059 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6060 vpsllw(xtmp2, dst, 8, vec_enc); 6061 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6062 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6063 vpsrlw(dst, dst, 8, vec_enc); 6064 } 6065 6066 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6067 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6068 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6069 // hence biased exponent can be used to compute leading zero count as per 6070 // following formula:- 6071 // LZCNT = 31 - (biased_exp - 127) 6072 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6073 6074 // Broadcast 0xFF 6075 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6076 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6077 6078 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6079 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6080 // contributes to the leading number of zeros. 6081 vpsrld(xtmp2, src, 1, vec_enc); 6082 vpandn(xtmp3, xtmp2, src, vec_enc); 6083 6084 // Extract biased exponent. 6085 vcvtdq2ps(dst, xtmp3, vec_enc); 6086 vpsrld(dst, dst, 23, vec_enc); 6087 vpand(dst, dst, xtmp1, vec_enc); 6088 6089 // Broadcast 127. 6090 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6091 // Exponent = biased_exp - 127 6092 vpsubd(dst, dst, xtmp1, vec_enc); 6093 6094 // Exponent_plus_one = Exponent + 1 6095 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6096 vpaddd(dst, dst, xtmp3, vec_enc); 6097 6098 // Replace -ve exponent with zero, exponent is -ve when src 6099 // lane contains a zero value. 6100 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6101 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6102 6103 // Rematerialize broadcast 32. 6104 vpslld(xtmp1, xtmp3, 5, vec_enc); 6105 // Exponent is 32 if corresponding source lane contains max_int value. 6106 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6107 // LZCNT = 32 - exponent_plus_one 6108 vpsubd(dst, xtmp1, dst, vec_enc); 6109 6110 // Replace LZCNT with a value 1 if corresponding source lane 6111 // contains max_int value. 6112 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6113 6114 // Replace biased_exp with 0 if source lane value is less than zero. 6115 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6116 vblendvps(dst, dst, xtmp2, src, vec_enc); 6117 } 6118 6119 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6120 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6121 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6122 // Add zero counts of lower word and upper word of a double word if 6123 // upper word holds a zero value. 
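  // dst currently holds a leading zero count per 16-bit word; when the upper word of a
  // doubleword is zero its count is 16, so adding the two word counts yields the
  // doubleword count. The shift/add/blend below implements that, and the same trick is
  // then repeated at quadword granularity.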
6124 vpsrld(xtmp3, src, 16, vec_enc); 6125 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6126 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6127 vpslld(xtmp2, dst, 16, vec_enc); 6128 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6129 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6130 vpsrld(dst, dst, 16, vec_enc); 6131 // Add zero counts of lower doubleword and upper doubleword of a 6132 // quadword if upper doubleword holds a zero value. 6133 vpsrlq(xtmp3, src, 32, vec_enc); 6134 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6135 vpsllq(xtmp2, dst, 32, vec_enc); 6136 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6137 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6138 vpsrlq(dst, dst, 32, vec_enc); 6139 } 6140 6141 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6142 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6143 Register rtmp, int vec_enc) { 6144 assert(is_integral_type(bt), "unexpected type"); 6145 assert(vec_enc < Assembler::AVX_512bit, ""); 6146 switch(bt) { 6147 case T_LONG: 6148 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6149 break; 6150 case T_INT: 6151 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6152 break; 6153 case T_SHORT: 6154 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6155 break; 6156 case T_BYTE: 6157 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6158 break; 6159 default: 6160 fatal("Unsupported type %s", type2name(bt)); 6161 break; 6162 } 6163 } 6164 6165 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6166 switch(bt) { 6167 case T_BYTE: 6168 vpsubb(dst, src1, src2, vec_enc); 6169 break; 6170 case T_SHORT: 6171 vpsubw(dst, src1, src2, vec_enc); 6172 break; 6173 case T_INT: 6174 vpsubd(dst, src1, src2, vec_enc); 6175 break; 6176 case T_LONG: 6177 vpsubq(dst, src1, src2, vec_enc); 6178 break; 6179 default: 6180 fatal("Unsupported type %s", type2name(bt)); 6181 break; 6182 } 6183 } 6184 6185 // Trailing zero count computation is based on leading zero count operation as per 6186 // following equation. All AVX3 targets support AVX512CD feature which offers 6187 // direct vector instruction to compute leading zero count. 
6188 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6189 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6190 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6191 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6192 assert(is_integral_type(bt), ""); 6193 // xtmp = -1 6194 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6195 // xtmp = xtmp + src 6196 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6197 // xtmp = xtmp & ~src 6198 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6199 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6200 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6201 vpsub(bt, dst, xtmp4, dst, vec_enc); 6202 } 6203 6204 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6205 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6206 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6207 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6208 assert(is_integral_type(bt), ""); 6209 // xtmp = 0 6210 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6211 // xtmp = 0 - src 6212 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6213 // xtmp = xtmp | src 6214 vpor(xtmp3, xtmp3, src, vec_enc); 6215 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6216 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6217 vpsub(bt, dst, xtmp1, dst, vec_enc); 6218 } 6219 6220 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6221 Label done; 6222 Label neg_divisor_fastpath; 6223 cmpl(divisor, 0); 6224 jccb(Assembler::less, neg_divisor_fastpath); 6225 xorl(rdx, rdx); 6226 divl(divisor); 6227 jmpb(done); 6228 bind(neg_divisor_fastpath); 6229 // Fastpath for divisor < 0: 6230 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6231 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6232 movl(rdx, rax); 6233 subl(rdx, divisor); 6234 if (VM_Version::supports_bmi1()) { 6235 andnl(rax, rdx, rax); 6236 } else { 6237 notl(rdx); 6238 andl(rax, rdx); 6239 } 6240 shrl(rax, 31); 6241 bind(done); 6242 } 6243 6244 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6245 Label done; 6246 Label neg_divisor_fastpath; 6247 cmpl(divisor, 0); 6248 jccb(Assembler::less, neg_divisor_fastpath); 6249 xorl(rdx, rdx); 6250 divl(divisor); 6251 jmpb(done); 6252 bind(neg_divisor_fastpath); 6253 // Fastpath when divisor < 0: 6254 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6255 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6256 movl(rdx, rax); 6257 subl(rax, divisor); 6258 if (VM_Version::supports_bmi1()) { 6259 andnl(rax, rax, rdx); 6260 } else { 6261 notl(rax); 6262 andl(rax, rdx); 6263 } 6264 sarl(rax, 31); 6265 andl(rax, divisor); 6266 subl(rdx, rax); 6267 bind(done); 6268 } 6269 6270 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6271 Label done; 6272 Label neg_divisor_fastpath; 6273 6274 cmpl(divisor, 0); 6275 jccb(Assembler::less, neg_divisor_fastpath); 6276 xorl(rdx, rdx); 6277 divl(divisor); 6278 jmpb(done); 6279 bind(neg_divisor_fastpath); 6280 // Fastpath for divisor < 0: 6281 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6282 // 

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
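
// Note on the divisor < 0 fastpath above (illustrative reasoning, not emitted): once the divisor
// is "negative" its unsigned value is >= 2^31, so the unsigned quotient can only be 0 or 1, and
// it is 1 exactly when dividend >=u divisor. The bit trick evaluates that compare branch-free,
// e.g. dividend = 0xFFFFFFFF, divisor = 0x80000000:
//   dividend - divisor      = 0x7FFFFFFF
//   dividend & ~0x7FFFFFFF  = 0x80000000
//   >>> 31                  = 1,  remainder = dividend - divisor = 0x7FFFFFFF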

void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, as per the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}
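
// Bit reversal note (illustrative only): the GFNI path multiplies every byte by the GF(2) bit
// matrix 0x8040201008040201, which reverses the bit order within each byte; the trailing bswap
// then reverses the byte order, giving a full 32/64 bit reversal. The fallback path reaches the
// same per-byte reversal with three swap stages, sketched here for a single byte b:
//   b = ((b & 0x55) << 1) | ((b & 0xAA) >> 1);   // swap adjacent bits
//   b = ((b & 0x33) << 2) | ((b & 0xCC) >> 2);   // swap adjacent 2-bit pairs
//   b = ((b & 0x0F) << 4) | ((b & 0xF0) >> 4);   // swap nibbles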

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that indices which are
  // equal modulo 16 land at the same relative position within a 128 bit lane,
  // i.e. elements corresponding to shuffle indices 16, 32 and 48 all select the
  // first element of their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to a true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  switch(opcode) {
    case Op_AddHF: vaddsh(dst, src1, src2); break;
    case Op_SubHF: vsubsh(dst, src1, src2); break;
    case Op_MulHF: vmulsh(dst, src1, src2); break;
    case Op_DivHF: vdivsh(dst, src1, src2); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
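
// Saturation semantics reminder (illustrative values only): the signed byte/short forms above
// clamp into [MIN_VALUE, MAX_VALUE] of the element type, while the unsigned forms below clamp
// into [0, 2^WIDTH - 1]. For example, with byte lanes:
//   signed   : 120 + 20  -> 127  (vpaddsb),   -120 - 20 -> -128 (vpsubsb)
//   unsigned : 200 + 100 -> 255  (vpaddusb),    10 - 20 ->    0 (vpsubusb)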

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate the unsigned comparison using a signed comparison:
  // Mask = Inp1 <u Inp2  =>  Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // The unsigned value range comprises only non-negative numbers, thus only an upper bound saturation exists.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // Res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation:
// overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//
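
// A quick check of the reduced expression with 32-bit values (illustrative only):
//   a = 0xFFFFFFFF, b = 0x00000001 : a + b wraps to 0, a | b = 0xFFFFFFFF, 0 <u 0xFFFFFFFF -> overflow.
//   a = 0x00000003, b = 0x00000005 : a + b = 8, a | b = 7, 8 <u 7 is false -> no overflow.
// This matches the identity a + b = (a | b) + (a & b): without wraparound the sum can never be
// smaller than (a | b), so the compare fires exactly when the addition wrapped.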

void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE; as a side effect xtmp1 is set to -1 (unsigned max), used for the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to a signed value, T1<s> = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to a signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute the overflow detection mask = Res<s> <s T1<s>
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
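
// The two generators above derive the saturating bounds from an all-ones vector (illustrative
// 32-bit values): all-ones >>> 1 = 0x7FFFFFFF (MAX_VALUE) and all-ones << 31 = 0x80000000
// (MIN_VALUE); the T_LONG lanes use shift counts 1 and 63 to produce 0x7FFFFFFFFFFFFFFF and
// 0x8000000000000000 in the same way.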

void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}
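
// The signed saturating add/sub below detect overflow with the sign-bit identities from
// Hacker's Delight section 2-13; a worked 32-bit example (illustrative values only):
//   add: overflow iff ((res ^ a) & (res ^ b)) < 0, i.e. the result sign differs from two
//        equal-signed inputs; a = 0x7FFFFFFF, b = 1 -> res = 0x80000000 and
//        (res ^ a) & (res ^ b) = 0xFFFFFFFF & 0x80000001 = 0x80000001 < 0.
//   sub: overflow iff ((a ^ b) & (res ^ a)) < 0, i.e. the inputs differ in sign and the result
//        sign differs from the first input; a = 0x80000000, b = 1 -> res = 0x7FFFFFFF and
//        (a ^ b) & (res ^ a) = 0x80000001 & 0xFFFFFFFF = 0x80000001 < 0.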

void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and the
    // result polarity does not match the first input polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute the overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.

  // Compute a mask based on the first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the MIN value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with the saturated values using the overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}

void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result polarity does not match the (equal) polarity of the inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and the
    // result polarity does not match the first input polarity.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute the overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a saturating min/max vector using the first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend the result with the saturating vector using the overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
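
// In both saturating flavours above the saturated value is selected by the sign of the first
// input: an overflowing lane with src1 >= 0 can only have exceeded MAX_VALUE, while an
// overflowing lane with src1 < 0 can only have dropped below MIN_VALUE. Illustrative 32-bit
// cases: 0x7FFFFFFF + 1 saturates to 0x7FFFFFFF, and 0x80000000 - 1 saturates to 0x80000000.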

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
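
// Note on the two-table select above (illustrative description): the evpermi2* forms treat dst
// as the index vector and pick each destination lane from the 2N-entry concatenation of the two
// source operands, e.g. a 512-bit T_INT select has N = 16 elements per source, so each index
// uses its low 5 bits and can reach any of the 32 candidate elements.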

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move the sign bits of src2 to the mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a non-negative value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move the sign bits of src1 to the mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a negative value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
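
// Worked example for the max/min blend above (illustrative only): for Op_MaxVHF with
// src1 = +0.0 and src2 = -0.0, the sign of src2 routes -0.0 into the first max operand and
// +0.0 into the second; since vmaxph returns its second source when both inputs are zeros,
// the lane produces +0.0 as required by the Java max semantics. The trailing unordered
// compare then re-injects xtmp1 wherever it holds a NaN, so a NaN in either input still
// propagates to the final result.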