1 /* 2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

// In PRODUCT builds block comments are elided and STOP degrades to a bare
// stop(); in debug builds the error text is also emitted as a block comment
// so it shows up in disassembly.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
// Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame but only the size
// of the extension space + the additional copy of the return address. That means, it doesn't contain the
// frame size (where the local and sp_inc are) and the saved RBP.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  // Optional class-initialization barrier: if the holder class may still be
  // initializing, check it and tail-jump to the wrong-method stub otherwise.
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false; // NOTE(review): never read below — candidate for removal.
  // stack_bang_size is either 0 (no bang needed) or covers the whole bang region.
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    // Banging path: bang first, then push rbp explicitly and extend rsp.
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
#ifdef ASSERT
    // Poison the two stack slots at rsp so stale values are caught in debug builds.
    if (sp_inc > 0) {
      movl(Address(rsp, 0), badRegWordVal);
      movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No bang needed: extend rsp first, then store rbp into its frame slot.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
#ifdef ASSERT
    // Poison the slots just above the saved rbp in debug builds.
    if (sp_inc > 0) {
      movl(Address(rsp, framesize), badRegWordVal);
      movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  // Runtime check that rsp ends up correctly aligned after frame setup.
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

// Emit the nmethod entry barrier (delegates to the GC's BarrierSetAssembler).
void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
  // Dummy labels are used when only measuring code size: no stub is created
  // in that mode, but nmethod_entry_barrier() still needs label operands.
  Label dummy_slow_path;
  Label dummy_continuation;
  Label* slow_path = &dummy_slow_path;
  Label* continuation = &dummy_continuation;
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from actual stub when not emitting code for the purpose of measuring its size
    C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
    Compile::current()->output()->add_stub(stub);
    slow_path = &stub->entry();
    continuation = &stub->continuation();
  }
  bs->nmethod_entry_barrier(this, slow_path, continuation);
}

// Map a vector length in bytes (4/8/16 -> 128-bit, 32 -> 256-bit, 64 -> 512-bit)
// to the corresponding AVX encoding; any other length is a programming error.
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case 4: // fall-through
    case 8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP
// Since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native could would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with a rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel


// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
//
// On exit, ZF == 1 signals success to C2; ZF == 0 routes to the slow path.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the slow path.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      // Unroll the first few cache probes before entering the search loop.
      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    // Without the table the monitor pointer still carries the mark-word tag,
    // so fold the tag into the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to lock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired in the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Lets say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa. The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

// On exit, ZF == 1 signals success to C2; ZF == 0 routes to the slow path.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliases: t and reg_rax are reused under different roles below.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  // No stub is created when only measuring code size (see entry_barrier()).
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only sanity: obj must not appear anywhere on the lock-stack, and
    // the mark word must actually carry the monitor tag.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table the monitor pointer still carries the mark-word tag,
    // so fold the tag into the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// Runtime target for a failed CastII range check: report and abort the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

// Compute the expected frame pointer into dst as rsp + framesize - 2 * wordSize.
static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}

// Ensure rbp holds a valid frame pointer. With PreserveFramePointer it already
// does (debug builds verify this, clobbering rtmp); otherwise recompute it.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

// Emit a runtime check that val lies in [t->_lo, t->_hi]; on violation, call
// abort_verify_int_in_range (which aborts the VM). No code for TypeInt::INT.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only emit the compares that can actually fail.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

// Runtime target for a failed CastLL range check: report and abort the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

// Emit a runtime check that val lies in [t->_lo, t->_hi]; on violation, call
// abort_verify_long_in_range. tmp is needed for bounds outside the simm32 range.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare against a 64-bit bound: immediate form when it fits in simm32,
  // otherwise materialize the bound in tmp first.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only emit the compares that can actually fail.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

// Vector abs/neg of doubles (SSE form): abs clears the sign bits via AND with a
// sign mask, neg flips them via XOR with a sign-flip constant.
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

// Vector abs/neg of doubles (AVX form, three-operand).
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

// Vector abs/neg of floats (SSE form); same sign-mask / sign-flip technique.
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

// Vector abs/neg of floats (AVX form, three-operand).
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

// Signed element-wise min/max (SSE form). T_LONG has no pmin/pmax instruction,
// so it is synthesized with pcmpgtq + blendvpd, which requires tmp == xmm0
// (blendvpd implicitly uses xmm0 as the mask).
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

// Unsigned element-wise min/max, memory-operand form. T_LONG uses the
// EVEX-only evpminuq/evpmaxuq encodings.
void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default:
        fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

// Unsigned 64-bit element-wise min/max. Uses the native EVEX instruction when
// available; otherwise biases both operands into signed range and uses a
// signed compare plus blend.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

// Unsigned element-wise min/max, register-operand form (mirrors the
// memory-operand overload above).
void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2,
vlen_enc); 915 } else if (elem_bt == T_SHORT) { 916 vpmaxsw(dst, src1, src2, vlen_enc); 917 } else if (elem_bt == T_INT) { 918 vpmaxsd(dst, src1, src2, vlen_enc); 919 } else { 920 assert(elem_bt == T_LONG, "required"); 921 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 922 vpmaxsq(dst, src1, src2, vlen_enc); 923 } else { 924 assert_different_registers(dst, src1, src2); 925 vpcmpgtq(dst, src1, src2, vlen_enc); 926 vblendvpd(dst, src2, src1, dst, vlen_enc); 927 } 928 } 929 } 930 } 931 932 // Float/Double min max 933 934 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 935 XMMRegister dst, XMMRegister a, XMMRegister b, 936 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 937 int vlen_enc) { 938 assert(UseAVX > 0, "required"); 939 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 940 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 941 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 942 assert_different_registers(a, tmp, atmp, btmp); 943 assert_different_registers(b, tmp, atmp, btmp); 944 945 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 946 bool is_double_word = is_double_word_type(elem_bt); 947 948 /* Note on 'non-obvious' assembly sequence: 949 * 950 * While there are vminps/vmaxps instructions, there are two important differences between hardware 951 * and Java on how they handle floats: 952 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 953 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 954 * 955 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 956 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 957 * (only useful when signs differ, noop otherwise) 958 * b. 
NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 959 960 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 961 * btmp = (b < +0.0) ? a : b 962 * atmp = (b < +0.0) ? b : a 963 * Tmp = Max_Float(atmp , btmp) 964 * Res = (atmp == NaN) ? atmp : Tmp 965 */ 966 967 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 968 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 969 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 970 XMMRegister mask; 971 972 if (!is_double_word && is_min) { 973 mask = a; 974 vblend = &MacroAssembler::vblendvps; 975 vmaxmin = &MacroAssembler::vminps; 976 vcmp = &MacroAssembler::vcmpps; 977 } else if (!is_double_word && !is_min) { 978 mask = b; 979 vblend = &MacroAssembler::vblendvps; 980 vmaxmin = &MacroAssembler::vmaxps; 981 vcmp = &MacroAssembler::vcmpps; 982 } else if (is_double_word && is_min) { 983 mask = a; 984 vblend = &MacroAssembler::vblendvpd; 985 vmaxmin = &MacroAssembler::vminpd; 986 vcmp = &MacroAssembler::vcmppd; 987 } else { 988 assert(is_double_word && !is_min, "sanity"); 989 mask = b; 990 vblend = &MacroAssembler::vblendvpd; 991 vmaxmin = &MacroAssembler::vmaxpd; 992 vcmp = &MacroAssembler::vcmppd; 993 } 994 995 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 996 XMMRegister maxmin, scratch; 997 if (dst == btmp) { 998 maxmin = btmp; 999 scratch = tmp; 1000 } else { 1001 maxmin = tmp; 1002 scratch = btmp; 1003 } 1004 1005 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1006 if (precompute_mask && !is_double_word) { 1007 vpsrad(tmp, mask, 32, vlen_enc); 1008 mask = tmp; 1009 } else if (precompute_mask && is_double_word) { 1010 vpxor(tmp, tmp, tmp, vlen_enc); 1011 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1012 mask = tmp; 1013 } 1014 1015 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1016 
(this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1017 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1018 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1019 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1020 } 1021 1022 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1023 XMMRegister dst, XMMRegister a, XMMRegister b, 1024 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1025 int vlen_enc) { 1026 assert(UseAVX > 2, "required"); 1027 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1028 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1029 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1030 assert_different_registers(dst, a, atmp, btmp); 1031 assert_different_registers(dst, b, atmp, btmp); 1032 1033 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1034 bool is_double_word = is_double_word_type(elem_bt); 1035 bool merge = true; 1036 1037 if (!is_double_word && is_min) { 1038 evpmovd2m(ktmp, a, vlen_enc); 1039 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1040 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1041 vminps(dst, atmp, btmp, vlen_enc); 1042 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1043 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1044 } else if (!is_double_word && !is_min) { 1045 evpmovd2m(ktmp, b, vlen_enc); 1046 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1047 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1048 vmaxps(dst, atmp, btmp, vlen_enc); 1049 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1050 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1051 } else if (is_double_word && is_min) { 1052 evpmovq2m(ktmp, a, vlen_enc); 1053 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1054 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1055 vminpd(dst, atmp, btmp, vlen_enc); 1056 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1057 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1058 } 
else { 1059 assert(is_double_word && !is_min, "sanity"); 1060 evpmovq2m(ktmp, b, vlen_enc); 1061 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1062 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1063 vmaxpd(dst, atmp, btmp, vlen_enc); 1064 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1065 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1066 } 1067 } 1068 1069 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask, 1070 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1071 assert(opc == Op_MinV || opc == Op_MinReductionV || 1072 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity"); 1073 1074 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN 1075 : AVX10_2_MINMAX_MAX_COMPARE_SIGN; 1076 if (elem_bt == T_FLOAT) { 1077 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc); 1078 } else { 1079 assert(elem_bt == T_DOUBLE, ""); 1080 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc); 1081 } 1082 } 1083 1084 // Float/Double signum 1085 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1086 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1087 1088 Label DONE_LABEL; 1089 1090 // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument 1091 // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases 1092 // If other floating point comparison instructions used, ZF=1 for equal and unordered cases 1093 if (opcode == Op_SignumF) { 1094 if (VM_Version::supports_avx10_2()) { 1095 vucomxss(dst, zero); 1096 jcc(Assembler::negative, DONE_LABEL); 1097 } else { 1098 ucomiss(dst, zero); 1099 jcc(Assembler::equal, DONE_LABEL); 1100 } 1101 movflt(dst, one); 1102 jcc(Assembler::above, DONE_LABEL); 1103 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1104 } else if (opcode == Op_SignumD) { 1105 if 
(VM_Version::supports_avx10_2()) { 1106 vucomxsd(dst, zero); 1107 jcc(Assembler::negative, DONE_LABEL); 1108 } else { 1109 ucomisd(dst, zero); 1110 jcc(Assembler::equal, DONE_LABEL); 1111 } 1112 movdbl(dst, one); 1113 jcc(Assembler::above, DONE_LABEL); 1114 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1115 } 1116 1117 bind(DONE_LABEL); 1118 } 1119 1120 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1121 if (sign) { 1122 pmovsxbw(dst, src); 1123 } else { 1124 pmovzxbw(dst, src); 1125 } 1126 } 1127 1128 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1129 if (sign) { 1130 vpmovsxbw(dst, src, vector_len); 1131 } else { 1132 vpmovzxbw(dst, src, vector_len); 1133 } 1134 } 1135 1136 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1137 if (sign) { 1138 vpmovsxbd(dst, src, vector_len); 1139 } else { 1140 vpmovzxbd(dst, src, vector_len); 1141 } 1142 } 1143 1144 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1145 if (sign) { 1146 vpmovsxwd(dst, src, vector_len); 1147 } else { 1148 vpmovzxwd(dst, src, vector_len); 1149 } 1150 } 1151 1152 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1153 int shift, int vector_len) { 1154 if (opcode == Op_RotateLeftV) { 1155 if (etype == T_INT) { 1156 evprold(dst, src, shift, vector_len); 1157 } else { 1158 assert(etype == T_LONG, "expected type T_LONG"); 1159 evprolq(dst, src, shift, vector_len); 1160 } 1161 } else { 1162 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1163 if (etype == T_INT) { 1164 evprord(dst, src, shift, vector_len); 1165 } else { 1166 assert(etype == T_LONG, "expected type T_LONG"); 1167 evprorq(dst, src, shift, vector_len); 1168 } 1169 } 1170 } 1171 1172 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, 
XMMRegister dst, XMMRegister src, 1173 XMMRegister shift, int vector_len) { 1174 if (opcode == Op_RotateLeftV) { 1175 if (etype == T_INT) { 1176 evprolvd(dst, src, shift, vector_len); 1177 } else { 1178 assert(etype == T_LONG, "expected type T_LONG"); 1179 evprolvq(dst, src, shift, vector_len); 1180 } 1181 } else { 1182 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1183 if (etype == T_INT) { 1184 evprorvd(dst, src, shift, vector_len); 1185 } else { 1186 assert(etype == T_LONG, "expected type T_LONG"); 1187 evprorvq(dst, src, shift, vector_len); 1188 } 1189 } 1190 } 1191 1192 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1193 if (opcode == Op_RShiftVI) { 1194 psrad(dst, shift); 1195 } else if (opcode == Op_LShiftVI) { 1196 pslld(dst, shift); 1197 } else { 1198 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1199 psrld(dst, shift); 1200 } 1201 } 1202 1203 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1204 switch (opcode) { 1205 case Op_RShiftVI: psrad(dst, shift); break; 1206 case Op_LShiftVI: pslld(dst, shift); break; 1207 case Op_URShiftVI: psrld(dst, shift); break; 1208 1209 default: assert(false, "%s", NodeClassNames[opcode]); 1210 } 1211 } 1212 1213 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1214 if (opcode == Op_RShiftVI) { 1215 vpsrad(dst, nds, shift, vector_len); 1216 } else if (opcode == Op_LShiftVI) { 1217 vpslld(dst, nds, shift, vector_len); 1218 } else { 1219 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1220 vpsrld(dst, nds, shift, vector_len); 1221 } 1222 } 1223 1224 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1225 switch (opcode) { 1226 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1227 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1228 case Op_URShiftVI: 
vpsrld(dst, src, shift, vlen_enc); break; 1229 1230 default: assert(false, "%s", NodeClassNames[opcode]); 1231 } 1232 } 1233 1234 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1235 switch (opcode) { 1236 case Op_RShiftVB: // fall-through 1237 case Op_RShiftVS: psraw(dst, shift); break; 1238 1239 case Op_LShiftVB: // fall-through 1240 case Op_LShiftVS: psllw(dst, shift); break; 1241 1242 case Op_URShiftVS: // fall-through 1243 case Op_URShiftVB: psrlw(dst, shift); break; 1244 1245 default: assert(false, "%s", NodeClassNames[opcode]); 1246 } 1247 } 1248 1249 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1250 switch (opcode) { 1251 case Op_RShiftVB: // fall-through 1252 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1253 1254 case Op_LShiftVB: // fall-through 1255 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1256 1257 case Op_URShiftVS: // fall-through 1258 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1259 1260 default: assert(false, "%s", NodeClassNames[opcode]); 1261 } 1262 } 1263 1264 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1265 switch (opcode) { 1266 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1267 case Op_LShiftVL: psllq(dst, shift); break; 1268 case Op_URShiftVL: psrlq(dst, shift); break; 1269 1270 default: assert(false, "%s", NodeClassNames[opcode]); 1271 } 1272 } 1273 1274 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1275 if (opcode == Op_RShiftVL) { 1276 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1277 } else if (opcode == Op_LShiftVL) { 1278 psllq(dst, shift); 1279 } else { 1280 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1281 psrlq(dst, shift); 1282 } 1283 } 1284 1285 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister 
src, XMMRegister shift, int vlen_enc) { 1286 switch (opcode) { 1287 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1288 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1289 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1290 1291 default: assert(false, "%s", NodeClassNames[opcode]); 1292 } 1293 } 1294 1295 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1296 if (opcode == Op_RShiftVL) { 1297 evpsraq(dst, nds, shift, vector_len); 1298 } else if (opcode == Op_LShiftVL) { 1299 vpsllq(dst, nds, shift, vector_len); 1300 } else { 1301 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1302 vpsrlq(dst, nds, shift, vector_len); 1303 } 1304 } 1305 1306 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1307 switch (opcode) { 1308 case Op_RShiftVB: // fall-through 1309 case Op_RShiftVS: // fall-through 1310 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1311 1312 case Op_LShiftVB: // fall-through 1313 case Op_LShiftVS: // fall-through 1314 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1315 1316 case Op_URShiftVB: // fall-through 1317 case Op_URShiftVS: // fall-through 1318 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1319 1320 default: assert(false, "%s", NodeClassNames[opcode]); 1321 } 1322 } 1323 1324 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1325 switch (opcode) { 1326 case Op_RShiftVB: // fall-through 1327 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1328 1329 case Op_LShiftVB: // fall-through 1330 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1331 1332 case Op_URShiftVB: // fall-through 1333 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1334 1335 default: assert(false, "%s", NodeClassNames[opcode]); 1336 } 1337 } 1338 1339 
// Per-lane variable quadword shift. AVX-512 has evpsravq; on AVX2 the
// arithmetic right shift is emulated as ((x >>> s) ^ m) - m, where
// m = (sign-bit mask >>> s), which re-extends the sign after a logical shift.
// tmp is only used (and must be provided) on the AVX2 Op_RShiftVL path.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
// Bytes are widened to dwords, shifted per-lane, masked back to byte range, and
// repacked; only 128-bit vectors are supported (vector_len == 0).
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
// EVEX path: bytes are widened to words at double the vector width, shifted,
// masked back to byte range, and repacked (with a lane fix-up vpermq for the
// 256-bit case).
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

// Insert a GPR value into lane idx of dst, SSE form.
void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Insert a GPR value into lane idx, AVX three-operand form (src provides the
// untouched lanes).
void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Masked scalar gather of one 64-bit slice (8 bytes or 4 shorts) into dst.
// For each lane, tests bit mask_idx of 'mask' (btq sets CF) and loads
// base[idx_base[i]] only when set; unset lanes stay zero. mask_idx advances
// by one per lane, including skipped ones.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}

// Unmasked scalar gather of one 64-bit slice (8 bytes or 4 shorts) into dst;
// indices are read as 32-bit ints from idx_base.
void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}

/*
 * Gather using hybrid algorithm, first partially unroll scalar loop
 * to accumulate values from gather indices into a quad-word(64bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into appropriate vector lane
 * locations in destination vector. Following pseudo code describes the
 * algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *     PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, doubleword permute indices (0,1) corresponding
 * to gathered quadword gets right shifted by two lane positions.
 *
 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
  if (mask == noreg) {
    vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
  } else {
    vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
  }
  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
  // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
  vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
  // DST_VEC = DST_VEC OR TEMP_PERM_VEC
  vpor(dst, dst, temp_dst, vlen_enc);
  // Each iteration consumes 8 bytes' worth of indices (8 byte-lanes or 4
  // short-lanes); advance idx_base and decrement the remaining lane count.
  addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
  subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
  jcc(Assembler::notEqual, GATHER8_LOOP);
}

// Hardware gather with dword indices, VEX form (mask register is an XMM vector).
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Hardware gather with dword indices, EVEX form (opmask register).
void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Hardware scatter with dword indices, EVEX form.
void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Expand a per-lane byte boolean vector into a full-width lane mask:
// 0 - dst = -src turns each 1 into 0xFF, then sign-extension widens 0xFF to
// all-ones at the element width. 'is_legacy' restricts the subtract to the
// AVX-256 encoding (legacy-encoded instructions elsewhere in the node's rule —
// NOTE(review): inferred from the encoding choice; confirm against the .ad rules).
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}

// Convert a byte boolean vector into an opmask register. Without AVX512VL/BW
// (novlbwdq) the bytes are widened to dwords and compared against a constant;
// otherwise negate (1 -> 0xFF) and let evpmovb2m pick up the sign bits.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}

// Load vlen_in_bytes of vector data from memory, choosing the mov flavor by
// element category (integral vs floating point) and width.
void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
  if (is_integral_type(bt)) {
    switch (vlen_in_bytes) {
      case 4:  movdl(dst, src);   break;
      case 8:  movq(dst, src);    break;
      case 16: movdqu(dst, src);  break;
      case 32: vmovdqu(dst, src); break;
      case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  } else {
    switch (vlen_in_bytes) {
      case 4:  movflt(dst, src); break;
      case 8:  movdbl(dst, src); break;
      case 16: movups(dst, src); break;
      case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
      case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  }
}

// AddressLiteral overload: uses rscratch to materialize the address when the
// literal is not RIP-reachable.
void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    load_vector(bt, dst, as_Address(src), vlen_in_bytes);
  } else {
    lea(rscratch, src);
    load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
  }
}

// Broadcast a constant-table scalar into all lanes of dst, picking the widest
// broadcast form the CPU supports; pre-SSE3/AVX falls back to a plain load.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}

// Load the per-type iota (0,1,2,...) index constant for 'bt' into dst.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
  int offset = exact_log2(type2aelembytes(bt)) << 6;
  if (is_floating_point_type(bt)) {
    offset += 128;
  }
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
  load_vector(T_BYTE, dst, addr, vlen_in_bytes);
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
// One 128-bit combining step of a reduction: dst = dst <op> src, element-wise.
// Note that ordered FP add/mul use the *scalar* forms (addss/addsd/mulss/mulsd)
// since ordered FP reductions combine one lane at a time; long min/max and
// long multiply assert AVX-512 because only EVEX encodings exist for them.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT:   pminsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT:   pmaxsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT:   paddd(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT:   pmulld(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

// One 128-bit step of an *unordered* FP reduction: packed add/mul, so all
// lanes are combined simultaneously (valid when FP ordering is relaxed).
void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode,
                                                       XMMRegister dst, XMMRegister src) {
  switch (opcode) {
    case Op_AddReductionVF: addps(dst, src); break;
    case Op_AddReductionVD: addpd(dst, src); break;
    case Op_MulReductionVF: mulps(dst, src); break;
    case Op_MulReductionVD: mulpd(dst, src); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}

// 256-bit three-operand combining step of a reduction: dst = src1 <op> src2.
// Integer opcodes only; FP reductions never take a 256-bit ordered step here.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:   vpaddd(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmulld(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

// 256-bit step of an unordered FP reduction (packed three-operand add/mul).
void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
    case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
    case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
    case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Dispatch an ordered FP reduction to the float or double implementation.
void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

// Dispatch an unordered FP reduction to the float or double implementation.
void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
                                            XMMRegister dst, XMMRegister src,
                                            XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Dispatch a byte-vector reduction by element count.
void C2_MacroAssembler::reduceB(int opcode, int vlen,
                                Register dst,
Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Dispatch a byte-vector *multiply* reduction by element count (separate from
// reduceB because byte multiply is done via widening to shorts).
void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Dispatch a short-vector reduction by element count.
void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Dispatch an int-vector reduction by element count.
void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Dispatch a long-vector reduction by element count.
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Dispatch an ordered float reduction by element count. Small cases need only
// one temp, so callers must pass vtmp2 == xnoreg there (asserted).
void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Dispatch an ordered double reduction by element count (vtmp2 unused for
// the 2-element case — asserted xnoreg).
void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Dispatch an unordered float reduction by element count; unused temps must
// be xnoreg (asserted per case).
void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2F(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Dispatch an unordered double reduction by element count; unused temps must
// be xnoreg (asserted per case).
void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2D(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4D(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Reduce 2 ints in src2, then fold in the scalar accumulator src1; the scalar
// result lands in 'dst'. Addition uses horizontal add; other ops shuffle lane 1
// down and combine.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

// Reduce 4 ints: halve to 2 (horizontal add, or shuffle high pair down and
// combine), then delegate to reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

// Reduce 8 ints (256-bit): fold the high 128-bit lane into the low one, then
// delegate to the narrower routine.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

// Reduce 16 ints (512-bit): fold the high 256 bits into the low 256, recurse.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Reduce 8 bytes: repeatedly fold the upper half down (4, then 2, then 1
// byte), sign-extend to int to fold in the scalar src1, and extract the low
// byte sign-extended into 'dst'.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}

// Reduce 16 bytes: fold the high 8 bytes into the low 8, then reduce8B.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Reduce 32 bytes: fold the high 128-bit lane into the low, then reduce16B.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Reduce 64 bytes: fold the high 256 bits into the low, then reduce32B.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Byte multiply reduction: widen the 8 bytes to shorts and reduce as shorts
// (there is no packed byte multiply instruction).
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// 16-byte multiply reduction: with AVX2, widen all 16 bytes to a 256-bit
// short vector in one step; otherwise reduce the low 8 and high 8 bytes as
// two 8-short reductions, accumulating through 'dst'.
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0x1);
    // NOTE(review): this pmovsxbw reads src2 again, discarding the pshufd
    // result above, so the same low bytes appear to be widened twice —
    // looks suspicious; confirm against upstream before relying on this
    // (UseAVX <= 1) path.
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

// 32-byte multiply reduction: with AVX512BW, widen to a 512-bit short vector;
// otherwise split into two 16-byte reductions (requires at least AVX2).
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

// 64-byte multiply reduction: reduce the low 256 bits, then the high 256,
// accumulating through 'dst'.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}

// Reduce 4 shorts in src2, fold in scalar src1, scalar result in 'dst'.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds collapse 4 shorts down to one.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  // Fold in the scalar accumulator: sign-extend shorts to ints, combine,
  // then extract the low short sign-extended into 'dst'.
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}

// Reduce 8 shorts: halve to 4 (horizontal add, or shuffle the high quad down
// and combine), then delegate to reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Reduce 16 shorts (256-bit): horizontal add + vpermq to regroup lanes for
// addition, or fold the high 128-bit lane for other ops; then reduce8S.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Reduce 32 shorts (512-bit): fold high 256 bits into the low, then reduce16S.
// (The local 'vector_len' is unused here — kept for byte-identical code.)
void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  int vector_len = Assembler::AVX_256bit;
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
  reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Reduce 2 longs in src2, fold in scalar src1, scalar result in 'dst'.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

// Reduce 4 longs: fold the high 128-bit lane into the low, then reduce2L.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Reduce 8 longs: fold the high 256 bits into the low, then reduce4L.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Build a k-register mask with the low 'len' bits set: temp = (~0) with bits
// above 'len' zeroed by bzhiq, then moved into 'dst'.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}

// Ordered reduction of 2 floats: dst op= lane0, then dst op= lane1.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

// Ordered reduction of 4 floats: lanes 0,1 via reduce2F, then lanes 2 and 3
// shuffled down one at a time (order matters for FP semantics).
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

// Ordered reduction of 8 floats: low 128-bit lane first, then high lane.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

// Ordered reduction of 16 floats: low 256 bits first, then the high 256.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1,
vtmp2);
}

// Unordered reduction of 2 floats: shuffle lane 1 down and combine packed.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}

// Unordered reduction of 4 floats: fold the high pair onto the low pair with
// a packed op, then finish with unorderedReduce2F.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}

// Unordered reduction of 8 floats: fold the high 128-bit lane packed, recurse.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}

// Unordered reduction of 16 floats: fold the high 256 bits packed, recurse.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}

// Ordered reduction of 2 doubles: dst op= lane0, then dst op= lane1.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}

// Ordered reduction of 4 doubles: low 128-bit lane first, then the high lane.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

// Ordered reduction of 8 doubles: low 256 bits first, then the high 256.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Unordered reduction of 2 doubles: shuffle lane 1 down and combine.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}

// Unordered reduction of 4 doubles: fold the high 128-bit lane packed, recurse.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}

// Unordered reduction of 8 doubles: fold the high 256 bits packed, recurse.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}

// Thin forwarders exposing MacroAssembler's masked vector moves on the
// C2_MacroAssembler interface (mem->reg, reg->mem, reg->reg).
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}

void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}

void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}

// AVX masked vector load: 32-bit elements use the ps form, 64-bit the pd form.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
                                 int vec_enc) {
  switch(elem_bt) {
    case T_INT:
    case T_FLOAT:
      vmaskmovps(dst, src, mask, vec_enc);
      break;
    case T_LONG:
    case T_DOUBLE:
      vmaskmovpd(dst, src, mask, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// AVX masked vector store counterpart of the load overload above.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
                                 int vec_enc) {
  switch(elem_bt) {
    case T_INT:
    case T_FLOAT:
      vmaskmovps(dst, src, mask, vec_enc);
      break;
    case T_LONG:
    case T_DOUBLE:
vmaskmovpd(dst, src, mask, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Float min/max reduction: iteratively halve the vector, combining each upper
// half into the lower with vminmax_fp. i==3 folds the upper 256 bits, i==2 the
// upper 128, i==1/0 permute within a lane (permconst). When 'is_dst_valid',
// the caller's accumulator in 'dst' is folded in at the end. The AVX10.2
// branch uses the masked vminmax_fp form; otherwise the temp-based sequence.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;  // final step can write straight to dst
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;  // after the first fold, work is 128-bit
  }
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}

// Double min/max reduction, same halving scheme as reduceFloatMinMax:
// i==2 folds the upper 256 bits, i==1 the upper 128, i==0 swaps the two
// doubles within a lane via vpermilpd.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }

  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}

// Extract integral element 'idx' of the low 128-bit lane into a GPR
// (zero-extended by the pextr* instructions; callers sign-extend as needed).
void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
  switch (bt) {
    case T_BYTE:  pextrb(dst, src, idx); break;
    case T_SHORT: pextrw(dst, src, idx); break;
    case T_INT:   pextrd(dst, src, idx); break;
    case T_LONG:  pextrq(dst, src, idx); break;

    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Return a register whose low 128-bit lane holds the lane of 'src' containing
// element 'elemindex': extracts into 'dst' for upper lanes, or returns 'src'
// unchanged when the element is already in lane 0. ('eindex' is computed but
// unused — kept for byte-identical code.)
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int lane = elemindex / elem_per_lane;
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src;
  }
}

void
// Sign-extend a sub-int value in 'dst' in place (byte or short); other types
// are left untouched.
C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
  if (typ == T_BYTE) {
    movsbl(dst, dst);
  } else if (typ == T_SHORT) {
    movswl(dst, dst);
  }
}

// Extract integral element 'elemindex' (within its 128-bit lane) of 'src'
// into GPR 'dst', sign-extended for sub-int types.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0: a plain move is cheaper than pextr*.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}

// Extract FP element 'elemindex' (within its 128-bit lane) of 'src' into the
// low element of XMM 'dst', with the upper bits of 'dst' zeroed. 'vtmp' is
// only required on the UseAVX == 0 float path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // movq zeroes the upper 64 bits of dst.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  // (the float shuffles above leave garbage above bit 31, so mask it off)
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}

// Masked vector compare (reg-reg) into opmask 'kdmask' under source mask
// 'ksmask', using the signed compare form sized by element type (FP types
// share the same-width integer compare).
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
  switch(typ) {
    case T_BYTE:
    case T_BOOLEAN:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_SHORT:
    case T_CHAR:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Masked vector compare against a literal address; 'rscratch' is needed when
// the literal is not always reachable.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  switch(typ) {
    case T_BOOLEAN:
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

// Masked blend sized by element type (FP types share the same-width integer
// blend form).
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch(typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2604 break; 2605 case T_LONG: 2606 case T_DOUBLE: 2607 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2608 break; 2609 default: 2610 assert(false,"Should not reach here."); 2611 break; 2612 } 2613 } 2614 2615 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2616 assert(vlen_in_bytes <= 32, ""); 2617 int esize = type2aelembytes(bt); 2618 if (vlen_in_bytes == 32) { 2619 assert(vtmp == xnoreg, "required."); 2620 if (esize >= 4) { 2621 vtestps(src1, src2, AVX_256bit); 2622 } else { 2623 vptest(src1, src2, AVX_256bit); 2624 } 2625 return; 2626 } 2627 if (vlen_in_bytes < 16) { 2628 // Duplicate the lower part to fill the whole register, 2629 // Don't need to do so for src2 2630 assert(vtmp != xnoreg, "required"); 2631 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2632 pshufd(vtmp, src1, shuffle_imm); 2633 } else { 2634 assert(vtmp == xnoreg, "required"); 2635 vtmp = src1; 2636 } 2637 if (esize >= 4 && VM_Version::supports_avx()) { 2638 vtestps(vtmp, src2, AVX_128bit); 2639 } else { 2640 ptest(vtmp, src2); 2641 } 2642 } 2643 2644 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2645 #ifdef ASSERT 2646 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2647 bool is_bw_supported = VM_Version::supports_avx512bw(); 2648 if (is_bw && !is_bw_supported) { 2649 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2650 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2651 "XMM register should be 0-15"); 2652 } 2653 #endif // ASSERT 2654 switch (elem_bt) { 2655 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2656 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2657 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2658 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2659 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2660 case T_DOUBLE: 
                   vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}

// Broadcast a scalar from general-purpose register src to every lane of dst.
// With AVX-512 (and the required BW/VL extensions for the element size /
// vector length) the single-instruction EVEX GPR-broadcasts are used;
// otherwise the value is first moved into the XMM register and then
// broadcast with AVX2 forms, which restricts dst to XMM0-15.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    switch (elem_bt) {
    case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
    case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
    case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
    case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
    case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
    case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
    case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
    case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
    case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
    case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}

// Sign-extend packed bytes in src to the destination element type, with a
// conversion to FP where the target type is float/double.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // cvtdq2pd doubles the element size, so the intermediate int vector
      // only needs half the final vector width.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1: element size of the scanned string; scale2: element size of the
  // substring in memory (bytes for UL, since it is inflated while loading).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring (UL inflates latin1 bytes to chars while loading).
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);  // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ?
                                Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

// Find the first occurrence of char 'ch' in a UTF-16 string [str1, cnt1 chars).
// result = index of the match (in chars) or -1. Uses 16-char AVX2 / 8-char SSE
// vector loops with a scalar tail.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast the target char to all 16 word lanes of vec1; vec2 stays zero
    // and is used with vptest to detect any non-zero compare result.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the first matching lane via the compare-result mask.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index

  bind(DONE_LABEL);
} // string_indexof_char

// Find the first occurrence of byte 'ch' in a latin1 string [str1, cnt1 bytes).
// result = index of the match (in bytes) or -1. Byte-element counterpart of
// string_indexof_char above: 32-byte AVX2 / 16-byte SSE loops + scalar tail.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Broadcast the byte via pshufb with a zero shuffle mask.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

// Size in bytes of one array element of the given type, as used by the
// arrays_hashcode intrinsic below.
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  switch (eltype) {
  case T_BOOLEAN: return sizeof(jboolean);
  case T_BYTE:    return sizeof(jbyte);
  case T_SHORT:   return sizeof(jshort);
  case T_CHAR:    return sizeof(jchar);
  case T_INT:     return sizeof(jint);
  default:
    ShouldNotReachHere();
    return -1;
  }
}

// Load a single element into a GPR with the signedness appropriate for the
// element type (zero-extend for boolean/char, sign-extend for byte/short).
void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  switch (eltype) {
  // T_BOOLEAN used as surrogate for unsigned byte
  case T_BOOLEAN: movzbl(dst, src);   break;
  case T_BYTE:    movsbl(dst, src);   break;
  case T_SHORT:   movswl(dst, src);   break;
  case T_CHAR:    movzwl(dst, src);   break;
  case T_INT:     movl(dst, src);     break;
  default:
    ShouldNotReachHere();
  }
}

// Load 8 elements (elsize * 8 bytes) into an XMM/YMM register.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}

// AddressLiteral variant of the vector element load above.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}

void
C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
  // Widen 8 packed elements in dst to 8 ints in place (256-bit vector).
  const int vlen = Assembler::AVX_256bit;
  switch (eltype) {
  case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
  case T_BYTE:      vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT);  break;
  case T_SHORT:     vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
  case T_CHAR:    vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
  case T_INT:
    // do nothing
    break;
  default:
    ShouldNotReachHere();
  }
}

// Vectorized Arrays.hashCode / String.hashCode: computes the polynomial hash
// result = result*31^cnt1 + sum(ary1[i]*31^(cnt1-1-i)) using a 32-element
// unrolled AVX2 loop (4 x 8-int lanes), then a 2-element unrolled scalar loop,
// then a final single-element step. 'result' holds the initial value on entry
// and the hash on exit.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);   // 961 == 31*31: advance the hash by two elements per pass
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);     // tmp3 = tmp2*31 computed as (tmp2<<5) - tmp2
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);   // result = result*31 + last element
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode

// helper function for string_compare
// Loads one element from each string at 'index' into elem1/elem2 (zero-extended).
// 'ae' (StrIntrinsicNode::LL/UU/LU/UL) selects byte vs. short loads; for the
// mixed-encoding cases, scale1 addresses the byte string and scale2 the short one.
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
// On exit 'result' holds the comparison value: the difference of the first
// mismatching elements, or (if one string is a prefix of the other) the length
// difference; for UL the sign is flipped at the end so the contract is always
// "str1 - str2". The length difference is parked on the stack (push(cnt1)) and
// popped at LENGTH_DIFF_LABEL/POP_LABEL. The SSE4.2 pcmpestri paths require
// result==rax, cnt2==rdx, cnt1==rcx (asserted below).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // For mixed encodings cnt2 arrives in bytes; convert to a char count.
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1 = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri mode: 11000 = string compare with negated result,
    // low bits select element width (cleared to 00 for unsigned bytes in LL).
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // Recover the first mismatching lane from the AVX-512 compare mask:
    // invert it and scan for the lowest set bit.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @IntrinsicCandidate
// public static int countPositives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return i - off;
//     }
//   }
//   return len;
// }
// On exit 'result' holds the count of leading non-negative bytes
// (== len when no negative byte is found).
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// Sets 'result' to 1 when the arrays/ranges are equal, 0 otherwise.
// When is_array_equ, ary1/ary2 are oop array pointers and the null/length
// checks below apply; otherwise they already point at the data and 'limit'
// is the element count to compare.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    // Either array null => not equal.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the last (overlapping) 32 bytes to cover the tail.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the last (overlapping) 16 bytes to cover the tail.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Out-of-line slow path for convertF2I: spills the XMM source to the stack
// and calls the matching f2i/f2l/d2i/d2l fixup stub, which returns the
// corrected result (taken back in 'dst' via pop).
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}

// Convert float/double in 'src' to int/long in 'dst' via truncating cvtt*;
// if the conversion produced the sentinel value (0x80000000 / double_sign_flip,
// i.e. NaN or out-of-range input), branch to the fixup stub above.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ?
1 : 0); 4551 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4552 jcc(Assembler::equal, stub->entry()); 4553 bind(stub->continuation()); 4554 } 4555 4556 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4557 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4558 switch(ideal_opc) { 4559 case Op_LShiftVS: 4560 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4561 case Op_LShiftVI: 4562 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4563 case Op_LShiftVL: 4564 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4565 case Op_RShiftVS: 4566 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4567 case Op_RShiftVI: 4568 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4569 case Op_RShiftVL: 4570 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4571 case Op_URShiftVS: 4572 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4573 case Op_URShiftVI: 4574 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4575 case Op_URShiftVL: 4576 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4577 case Op_RotateRightV: 4578 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4579 case Op_RotateLeftV: 4580 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4581 default: 4582 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4583 break; 4584 } 4585 } 4586 4587 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4588 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4589 if (is_unsigned) { 4590 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4591 } else { 4592 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, 
vlen_enc); 4593 } 4594 } 4595 4596 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4597 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4598 switch (elem_bt) { 4599 case T_BYTE: 4600 if (ideal_opc == Op_SaturatingAddV) { 4601 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4602 } else { 4603 assert(ideal_opc == Op_SaturatingSubV, ""); 4604 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4605 } 4606 break; 4607 case T_SHORT: 4608 if (ideal_opc == Op_SaturatingAddV) { 4609 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4610 } else { 4611 assert(ideal_opc == Op_SaturatingSubV, ""); 4612 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4613 } 4614 break; 4615 default: 4616 fatal("Unsupported type %s", type2name(elem_bt)); 4617 break; 4618 } 4619 } 4620 4621 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4622 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4623 switch (elem_bt) { 4624 case T_BYTE: 4625 if (ideal_opc == Op_SaturatingAddV) { 4626 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4627 } else { 4628 assert(ideal_opc == Op_SaturatingSubV, ""); 4629 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4630 } 4631 break; 4632 case T_SHORT: 4633 if (ideal_opc == Op_SaturatingAddV) { 4634 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4635 } else { 4636 assert(ideal_opc == Op_SaturatingSubV, ""); 4637 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4638 } 4639 break; 4640 default: 4641 fatal("Unsupported type %s", type2name(elem_bt)); 4642 break; 4643 } 4644 } 4645 4646 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4647 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4648 if (is_unsigned) { 4649 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, 
merge, vlen_enc); 4650 } else { 4651 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4652 } 4653 } 4654 4655 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4656 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4657 switch (elem_bt) { 4658 case T_BYTE: 4659 if (ideal_opc == Op_SaturatingAddV) { 4660 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4661 } else { 4662 assert(ideal_opc == Op_SaturatingSubV, ""); 4663 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4664 } 4665 break; 4666 case T_SHORT: 4667 if (ideal_opc == Op_SaturatingAddV) { 4668 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4669 } else { 4670 assert(ideal_opc == Op_SaturatingSubV, ""); 4671 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4672 } 4673 break; 4674 default: 4675 fatal("Unsupported type %s", type2name(elem_bt)); 4676 break; 4677 } 4678 } 4679 4680 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4681 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4682 switch (elem_bt) { 4683 case T_BYTE: 4684 if (ideal_opc == Op_SaturatingAddV) { 4685 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4686 } else { 4687 assert(ideal_opc == Op_SaturatingSubV, ""); 4688 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4689 } 4690 break; 4691 case T_SHORT: 4692 if (ideal_opc == Op_SaturatingAddV) { 4693 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4694 } else { 4695 assert(ideal_opc == Op_SaturatingSubV, ""); 4696 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4697 } 4698 break; 4699 default: 4700 fatal("Unsupported type %s", type2name(elem_bt)); 4701 break; 4702 } 4703 } 4704 4705 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4706 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4707 bool is_varshift) { 4708 
switch (ideal_opc) { 4709 case Op_AddVB: 4710 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4711 case Op_AddVS: 4712 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4713 case Op_AddVI: 4714 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4715 case Op_AddVL: 4716 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4717 case Op_AddVF: 4718 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4719 case Op_AddVD: 4720 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4721 case Op_SubVB: 4722 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4723 case Op_SubVS: 4724 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4725 case Op_SubVI: 4726 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4727 case Op_SubVL: 4728 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4729 case Op_SubVF: 4730 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4731 case Op_SubVD: 4732 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4733 case Op_MulVS: 4734 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4735 case Op_MulVI: 4736 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4737 case Op_MulVL: 4738 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4739 case Op_MulVF: 4740 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4741 case Op_MulVD: 4742 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4743 case Op_DivVF: 4744 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4745 case Op_DivVD: 4746 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4747 case Op_SqrtVF: 4748 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_SqrtVD: 4750 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_AbsVB: 4752 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4753 case Op_AbsVS: 4754 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4755 case Op_AbsVI: 4756 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4757 case Op_AbsVL: 4758 evpabsq(dst, mask, 
src2, merge, vlen_enc); break; 4759 case Op_FmaVF: 4760 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_FmaVD: 4762 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_VectorRearrange: 4764 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4765 case Op_LShiftVS: 4766 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4767 case Op_LShiftVI: 4768 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4769 case Op_LShiftVL: 4770 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4771 case Op_RShiftVS: 4772 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4773 case Op_RShiftVI: 4774 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4775 case Op_RShiftVL: 4776 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4777 case Op_URShiftVS: 4778 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4779 case Op_URShiftVI: 4780 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4781 case Op_URShiftVL: 4782 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4783 case Op_RotateLeftV: 4784 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_RotateRightV: 4786 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_MaxV: 4788 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_MinV: 4790 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_UMinV: 4792 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_UMaxV: 4794 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_XorV: 4796 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_OrV: 4798 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_AndV: 4800 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4801 default: 4802 
fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4803 break; 4804 } 4805 } 4806 4807 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4808 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4809 switch (ideal_opc) { 4810 case Op_AddVB: 4811 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4812 case Op_AddVS: 4813 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4814 case Op_AddVI: 4815 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4816 case Op_AddVL: 4817 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4818 case Op_AddVF: 4819 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4820 case Op_AddVD: 4821 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4822 case Op_SubVB: 4823 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4824 case Op_SubVS: 4825 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_SubVI: 4827 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_SubVL: 4829 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4830 case Op_SubVF: 4831 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4832 case Op_SubVD: 4833 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4834 case Op_MulVS: 4835 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4836 case Op_MulVI: 4837 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_MulVL: 4839 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_MulVF: 4841 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_MulVD: 4843 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_DivVF: 4845 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_DivVD: 4847 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_FmaVF: 4849 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_FmaVD: 4851 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4852 case Op_MaxV: 4853 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_MinV: 4855 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_UMaxV: 4857 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_UMinV: 4859 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_XorV: 4861 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_OrV: 4863 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_AndV: 4865 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4866 default: 4867 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4868 break; 4869 } 4870 } 4871 4872 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4873 KRegister src1, KRegister src2) { 4874 BasicType etype = T_ILLEGAL; 4875 switch(mask_len) { 4876 case 2: 4877 case 4: 4878 case 8: etype = T_BYTE; break; 4879 case 16: etype = T_SHORT; break; 4880 case 32: etype = T_INT; break; 4881 case 64: etype = T_LONG; break; 4882 default: fatal("Unsupported type"); break; 4883 } 4884 assert(etype != T_ILLEGAL, ""); 4885 switch(ideal_opc) { 4886 case Op_AndVMask: 4887 kand(etype, dst, src1, src2); break; 4888 case Op_OrVMask: 4889 kor(etype, dst, src1, src2); break; 4890 case Op_XorVMask: 4891 kxor(etype, dst, src1, src2); break; 4892 default: 4893 fatal("Unsupported masked operation"); break; 4894 } 4895 } 4896 4897 /* 4898 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4899 * If src is NaN, the result is 0. 4900 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4901 * the result is equal to the value of Integer.MIN_VALUE. 4902 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4903 * the result is equal to the value of Integer.MAX_VALUE. 
 */
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: if no destination lane holds the saturation pattern
  // (float_sign_flip, i.e. 0x80000000), no special input was seen.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // xtmp1 = ~float_sign_flip, i.e. the Integer.MAX_VALUE pattern.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero destination lanes whose source lane is NaN (unordered compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Replace saturated lanes with MAX_VALUE where the source was positive.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}

// EVEX variant of the float->int special-value fixup above, using opmask
// registers instead of vector blends.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = lanes holding the saturation pattern; skip fixup if none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes that are not NaN and whose source is >= 0
  // receive the MAX_VALUE pattern (~sign_flip, built via ternlog imm 0x11
  // which computes ~B&~C; with B == C == xtmp1 this is ~xtmp1).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

// EVEX float->long special-value fixup: same structure as the float->int
// EVEX variant but with quadword compares/moves and the 64-bit sign-flip
// (Long.MIN_VALUE) saturation pattern.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Non-NaN special lanes with non-negative source get Long.MAX_VALUE
  // (~sign_flip via ternlog imm 0x11).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

// EVEX double->int special-value fixup: dword result lanes, double source
// lanes (evcmppd on the source, evmovdqul on the narrowed result).
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Non-NaN special lanes with non-negative source get Integer.MAX_VALUE.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

/*
 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = lanes holding the Long.MIN_VALUE saturation pattern; skip the
  // fixup entirely if no lane matched.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN (unordered compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with non-negative source get Long.MAX_VALUE
  // (~sign_flip, built with ternlog imm 0x11 == ~B&~C, B == C == xtmp1).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

// Packs selected doublewords (chosen by the vshufps 'index' immediate) from
// each quadword lane of src into dst; for 256-bit inputs the upper 128-bit
// half is first extracted so the pack can cross the lane boundary.
void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

// AVX (pre-AVX512) double->int special-value fixup. The result vector is
// 128-bit (int lanes) while the source is src_vec_enc wide (double lanes),
// so source-derived masks are packed down before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower double words from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower double words from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}


// Narrows int lanes to short or byte lanes: mask off the upper bits, then
// use unsigned pack instructions (with a cross-lane pack for 256-bit
// vectors, since vpackusdw/vpackuswb operate per 128-bit lane).
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}

/*
 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
 * a) Perform vector D2L/F2I cast.
 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
 *    It signifies that source value could be any of the special floating point
 *    values(NaN,-Inf,Inf,Max,-Min).
 * c) Set destination to zero if source is NaN value.
 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5117 */ 5118 5119 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5120 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5121 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5122 int to_elem_sz = type2aelembytes(to_elem_bt); 5123 assert(to_elem_sz <= 4, ""); 5124 vcvttps2dq(dst, src, vec_enc); 5125 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5126 if (to_elem_sz < 4) { 5127 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5128 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5129 } 5130 } 5131 5132 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5133 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5134 Register rscratch, int vec_enc) { 5135 int to_elem_sz = type2aelembytes(to_elem_bt); 5136 assert(to_elem_sz <= 4, ""); 5137 vcvttps2dq(dst, src, vec_enc); 5138 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5139 switch(to_elem_bt) { 5140 case T_INT: 5141 break; 5142 case T_SHORT: 5143 evpmovdw(dst, dst, vec_enc); 5144 break; 5145 case T_BYTE: 5146 evpmovdb(dst, dst, vec_enc); 5147 break; 5148 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt)); 5149 } 5150 } 5151 5152 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5153 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5154 Register rscratch, int vec_enc) { 5155 evcvttps2qq(dst, src, vec_enc); 5156 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5157 } 5158 5159 // Handling for downcasting from double to integer or sub-word types on AVX2. 
5160 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5161 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5162 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5163 int to_elem_sz = type2aelembytes(to_elem_bt); 5164 assert(to_elem_sz < 8, ""); 5165 vcvttpd2dq(dst, src, vec_enc); 5166 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5167 float_sign_flip, vec_enc); 5168 if (to_elem_sz < 4) { 5169 // xtmp4 holds all zero lanes. 5170 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5171 } 5172 } 5173 5174 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5175 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5176 KRegister ktmp2, AddressLiteral sign_flip, 5177 Register rscratch, int vec_enc) { 5178 if (VM_Version::supports_avx512dq()) { 5179 evcvttpd2qq(dst, src, vec_enc); 5180 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5181 switch(to_elem_bt) { 5182 case T_LONG: 5183 break; 5184 case T_INT: 5185 evpmovsqd(dst, dst, vec_enc); 5186 break; 5187 case T_SHORT: 5188 evpmovsqd(dst, dst, vec_enc); 5189 evpmovdw(dst, dst, vec_enc); 5190 break; 5191 case T_BYTE: 5192 evpmovsqd(dst, dst, vec_enc); 5193 evpmovdb(dst, dst, vec_enc); 5194 break; 5195 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt)); 5196 } 5197 } else { 5198 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5199 vcvttpd2dq(dst, src, vec_enc); 5200 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5201 switch(to_elem_bt) { 5202 case T_INT: 5203 break; 5204 case T_SHORT: 5205 evpmovdw(dst, dst, vec_enc); 5206 break; 5207 case T_BYTE: 5208 evpmovdb(dst, dst, 
vec_enc); 5209 break; 5210 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt)); 5211 } 5212 } 5213 } 5214 5215 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5216 switch(to_elem_bt) { 5217 case T_LONG: 5218 evcvttps2qqs(dst, src, vec_enc); 5219 break; 5220 case T_INT: 5221 evcvttps2dqs(dst, src, vec_enc); 5222 break; 5223 case T_SHORT: 5224 evcvttps2dqs(dst, src, vec_enc); 5225 evpmovdw(dst, dst, vec_enc); 5226 break; 5227 case T_BYTE: 5228 evcvttps2dqs(dst, src, vec_enc); 5229 evpmovdb(dst, dst, vec_enc); 5230 break; 5231 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5232 } 5233 } 5234 5235 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5236 switch(to_elem_bt) { 5237 case T_LONG: 5238 evcvttps2qqs(dst, src, vec_enc); 5239 break; 5240 case T_INT: 5241 evcvttps2dqs(dst, src, vec_enc); 5242 break; 5243 case T_SHORT: 5244 evcvttps2dqs(dst, src, vec_enc); 5245 evpmovdw(dst, dst, vec_enc); 5246 break; 5247 case T_BYTE: 5248 evcvttps2dqs(dst, src, vec_enc); 5249 evpmovdb(dst, dst, vec_enc); 5250 break; 5251 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5252 } 5253 } 5254 5255 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5256 switch(to_elem_bt) { 5257 case T_LONG: 5258 evcvttpd2qqs(dst, src, vec_enc); 5259 break; 5260 case T_INT: 5261 evcvttpd2dqs(dst, src, vec_enc); 5262 break; 5263 case T_SHORT: 5264 evcvttpd2dqs(dst, src, vec_enc); 5265 evpmovdw(dst, dst, vec_enc); 5266 break; 5267 case T_BYTE: 5268 evcvttpd2dqs(dst, src, vec_enc); 5269 evpmovdb(dst, dst, vec_enc); 5270 break; 5271 default: assert(false, "Unexpected basic type for target of vector castD2X 
AVX10 (reg src): %s", type2name(to_elem_bt)); 5272 } 5273 } 5274 5275 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5276 switch(to_elem_bt) { 5277 case T_LONG: 5278 evcvttpd2qqs(dst, src, vec_enc); 5279 break; 5280 case T_INT: 5281 evcvttpd2dqs(dst, src, vec_enc); 5282 break; 5283 case T_SHORT: 5284 evcvttpd2dqs(dst, src, vec_enc); 5285 evpmovdw(dst, dst, vec_enc); 5286 break; 5287 case T_BYTE: 5288 evcvttpd2dqs(dst, src, vec_enc); 5289 evpmovdb(dst, dst, vec_enc); 5290 break; 5291 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5292 } 5293 } 5294 5295 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5296 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5297 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5298 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5299 // and re-instantiate original MXCSR.RC mode after that. 5300 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5301 5302 mov64(tmp, julong_cast(0.5L)); 5303 evpbroadcastq(xtmp1, tmp, vec_enc); 5304 vaddpd(xtmp1, src , xtmp1, vec_enc); 5305 evcvtpd2qq(dst, xtmp1, vec_enc); 5306 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5307 double_sign_flip, vec_enc);; 5308 5309 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5310 } 5311 5312 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5313 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5314 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5315 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5316 // and re-instantiate original MXCSR.RC mode after that. 
5317 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5318 5319 movl(tmp, jint_cast(0.5)); 5320 movq(xtmp1, tmp); 5321 vbroadcastss(xtmp1, xtmp1, vec_enc); 5322 vaddps(xtmp1, src , xtmp1, vec_enc); 5323 vcvtps2dq(dst, xtmp1, vec_enc); 5324 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5325 float_sign_flip, vec_enc); 5326 5327 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5328 } 5329 5330 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5331 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5332 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5333 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5334 // and re-instantiate original MXCSR.RC mode after that. 5335 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5336 5337 movl(tmp, jint_cast(0.5)); 5338 movq(xtmp1, tmp); 5339 vbroadcastss(xtmp1, xtmp1, vec_enc); 5340 vaddps(xtmp1, src , xtmp1, vec_enc); 5341 vcvtps2dq(dst, xtmp1, vec_enc); 5342 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5343 5344 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5345 } 5346 5347 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5348 BasicType from_elem_bt, BasicType to_elem_bt) { 5349 switch (from_elem_bt) { 5350 case T_BYTE: 5351 switch (to_elem_bt) { 5352 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5353 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5354 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5355 default: ShouldNotReachHere(); 5356 } 5357 break; 5358 case T_SHORT: 5359 switch (to_elem_bt) { 5360 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5361 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5362 default: ShouldNotReachHere(); 5363 } 5364 break; 
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Sign-extending (signed) element-widening cast; mirror of vector_unsigned_cast.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Cast a boolean-style vector mask (lanes of 0 / -1) between element sizes for
// the non-AVX512 (xmm/ymm) mask representation. Widening uses sign extension so
// -1 lanes stay -1; narrowing uses saturating packs (0/-1 are preserved exactly).
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Encode for the wider of the two element types so no lanes are dropped.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: one sign-extending move covers each size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: saturating packs; 256-bit forms pack per 128-bit lane, so a
    // vpermq with selector 0x08 is needed to compact the two low quadwords.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}

// Ternary-logic dispatch on element type (register src3 variant).
void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

// Ternary-logic dispatch on element type (memory src3 variant).
void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

// Expand a scalar bitmask in 'src' into a byte-per-lane vector mask in 'dst',
// 8 mask bits at a time, using PDEP to scatter each bit into the LSB of a byte.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}

// Reduce a scalar mask value (already materialized in 'tmp') according to the
// requested VectorMask operation, leaving the result in 'dst'.
// Note: 'tmp' is clobbered by most paths.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // lastTrue = 63 - lzcnt(mask); lzcnt(0) == 64 gives -1.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // BSR leaves dst undefined for zero input, hence the -1 default + cmov.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Set a sentinel bit just past the mask so an all-zero mask yields masklen.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcnt of an all-zero 32-bit mask naturally returns 32.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // BSF is undefined on zero input: default dst to masklen and cmov.
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

// AVX512 opmask variant: materialize the k-register mask into 'tmp', clip it
// to masklen bits when needed, and reduce via vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if (VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

// AVX/AVX2 variant: extract the vector mask into a scalar register with the
// appropriate movmsk form for the element type, clip, then reduce.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - x) to turn 0/1 lanes into 0/-1 so the byte MSBs are usable.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack 16-bit lanes down to bytes, compacting 256-bit results into the
      // low 128 bits before extracting the byte mask.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

// Compress the set bits of 'src' (restricted to mask_len bits) towards the LSB
// using PEXT, producing a left-packed opmask in 'dst'.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Keep only the low mask_len bits of the source mask.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  // pext(-1, m) yields popcount(m) consecutive ones at the bottom.
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}

// AVX2 compress/expand for 32/64-bit lanes via a stub-resident permutation
// table indexed by the scalar mask value (one 32-byte row per mask pattern).
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}

// AVX512 VBMI2 compress/expand: dispatch to the per-type EVEX instruction.
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
      case T_BYTE:
        evpcompressb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpcompressw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpcompressd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evcompressps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpcompressq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evcompresspd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
      case T_BYTE:
        evpexpandb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpexpandw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpexpandd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evexpandps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpexpandq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evexpandpd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  }
}

// Math.signum for float/double vectors (EVEX): -1.0 for negative lanes,
// +1.0 for positive lanes, and the source value itself for NaN and +/-0.0.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}

// Math.signum for float/double vectors (AVX): same contract as the EVEX
// variant, using variable blends keyed on the source sign bit.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
5771 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5772 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5773 } 5774 } 5775 5776 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5777 if (VM_Version::supports_avx512bw()) { 5778 if (mask_len > 32) { 5779 kmovql(dst, src); 5780 } else { 5781 kmovdl(dst, src); 5782 if (mask_len != 32) { 5783 kshiftrdl(dst, dst, 32 - mask_len); 5784 } 5785 } 5786 } else { 5787 assert(mask_len <= 16, ""); 5788 kmovwl(dst, src); 5789 if (mask_len != 16) { 5790 kshiftrwl(dst, dst, 16 - mask_len); 5791 } 5792 } 5793 } 5794 5795 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5796 int lane_size = type2aelembytes(bt); 5797 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5798 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5799 movptr(rtmp, imm32); 5800 switch(lane_size) { 5801 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5802 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5803 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5804 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5805 fatal("Unsupported lane size %d", lane_size); 5806 break; 5807 } 5808 } else { 5809 movptr(rtmp, imm32); 5810 movq(dst, rtmp); 5811 switch(lane_size) { 5812 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5813 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5814 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5815 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5816 fatal("Unsupported lane size %d", lane_size); 5817 break; 5818 } 5819 } 5820 } 5821 5822 // 5823 // Following is lookup table based popcount computation algorithm:- 5824 // Index Bit set count 5825 // [ 0000 -> 0, 5826 // 0001 -> 1, 5827 // 0010 -> 1, 5828 // 0011 -> 2, 5829 // 0100 -> 1, 5830 // 0101 -> 2, 5831 // 0110 -> 2, 5832 // 0111 -> 3, 5833 // 1000 -> 1, 5834 // 1001 -> 2, 5835 // 1010 -> 3, 5836 // 1011 -> 3, 5837 // 
1100 -> 2, 5838 // 1101 -> 3, 5839 // 1111 -> 4 ] 5840 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5841 // shuffle indices for lookup table access. 5842 // b. Right shift each byte of vector lane by 4 positions. 5843 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5844 // shuffle indices for lookup table access. 5845 // d. Add the bitset count of upper and lower 4 bits of each byte. 5846 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5847 // count of all the bytes of a quadword. 5848 // f. Perform step e. for upper 128bit vector lane. 5849 // g. Pack the bitset count of quadwords back to double word. 5850 // h. Unpacking and packing operations are not needed for 64bit vector lane. 5851 5852 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5853 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5854 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5855 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5856 vpsrlw(dst, src, 4, vec_enc); 5857 vpand(dst, dst, xtmp1, vec_enc); 5858 vpand(xtmp1, src, xtmp1, vec_enc); 5859 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5860 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5861 vpshufb(dst, xtmp2, dst, vec_enc); 5862 vpaddb(dst, dst, xtmp1, vec_enc); 5863 } 5864 5865 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5866 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5867 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5868 // Following code is as per steps e,f,g and h of above algorithm. 
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}

// Popcount of 16-bit lanes: byte popcount, then fold each byte pair.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}

// Popcount of 64-bit lanes: byte popcount + horizontal byte sum via VPSADBW.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}

// Type dispatcher for the lookup-table popcount implementations above.
void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                 XMMRegister xtmp2, Register rtmp, int vec_enc) {
  switch(bt) {
    case T_LONG:
      vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Native popcount instructions (AVX512 VPOPCNTDQ / BITALG), masked/merging.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Bit reversal algorithm first reverses the bits of each byte followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// Algorithm performs a lookup table access to get reverse bit sequence
// corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of upper and lower
// nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if (vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 fallback: same LUT scheme as the EVEX path, with VEX logicals.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}

// Single-instruction per-byte bit reversal via GF2P8AFFINEQB, followed by the
// byte-order reversal needed for multi-byte element types.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}

// Swap each pair of adjacent nbits-wide bit groups selected by 'bitmask':
// dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}

// Byte-order reversal of each lane using rotates + byte swap (EVEX only).
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte lanes: byte reversal is the identity; just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Byte-order reversal of each lane via PSHUFB with a precomputed permutation.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Single-byte lanes need no shuffling; plain copy.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}

// Count leading zeros per lane using AVX512CD VPLZCNT; short lanes are widened
// through unpacks and byte lanes use a LUT, since VPLZCNT only covers d/q.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Widen words to dwords with an all-ones high half so the dword lzcnt
      // equals the word lzcnt, then pack results back to words.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// AVX/AVX2 leading-zero count for byte lanes via the nibble lookup table.
// Note: leaves xtmp1 zeroed; vector_count_leading_zeros_short_avx relies on that.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

// AVX/AVX2 leading-zero count for 16-bit lanes built on the byte version.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

// AVX/AVX2 leading-zero count for 32-bit lanes via int->float conversion.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // (all-ones >> 25 == 127)
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // (all-ones >> 27 == 31)
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}

// AVX/AVX2 leading-zero count for 64-bit lanes, combining the per-half int results.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
// Leading-zero count for 64-bit (long) lanes on AVX targets without AVX512CD,
// composed from the per-int result of the helper above.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}

// Dispatch leading-zero count over the element type for AVX (< 512-bit) vectors.
void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Element-type dispatch for packed subtraction: dst = src1 - src2.
void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = element width in bits (8/16/32/64).
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // dst = width - CLZ((src - 1) & ~src)
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src  (sets every bit from the lowest set bit of src upward)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = element width in bits (8/16/32/64).
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
// Unsigned 32-bit division: rax = rax /u divisor. Clobbers rdx (used as the
// high half for divl and as a scratch in the fastpath).
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0: the unsigned quotient can only be 0 or 1,
  // so it is computed branchlessly.
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

// Unsigned 32-bit remainder: rdx = rax %u divisor. Clobbers rax.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift produces an all-ones/all-zeros mask for the conditional
  // subtraction of the divisor.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

// Combined unsigned 32-bit division: rax = quotient, rdx = remainder.
// tmp is a scratch register for the fastpath.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
// Reverses the bit order of a 32-bit value: dst = reverse(src).
// Uses a single GF2P8AFFINEQB (per-byte bit reversal) when GFNI is available,
// otherwise the classic swap-and-merge bit trick; bswapl finishes by
// reversing the byte order.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The identity-permutation matrix 0x8040201008040201 makes the affine
    // transform reverse the bits within each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits within each byte are reversed; reverse the byte order to complete.
  bswapl(dst);
}
// Reverses the bit order of a 64-bit value: dst = reverse(src).
// Same structure as reverseI but with 64-bit masks held in rtmp2 (immediates
// this wide cannot be encoded directly in and/or instructions).
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);    // rtmp2 = 0xAAAA... (complementary mask)
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);    // rtmp2 = 0xCCCC...
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);    // rtmp2 = 0xF0F0...
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits within each byte are reversed; reverse the byte order to complete.
  bswapq(dst);
}

// Unsigned 64-bit division: rax = rax /u divisor. Clobbers rdx.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0: the unsigned quotient can only be 0 or 1.
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

// Unsigned 64-bit remainder: rdx = rax %u divisor. Clobbers rax.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Mask for the conditional subtraction of the divisor.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
// Combined unsigned 64-bit division: rax = quotient, rdx = remainder.
// tmp is a scratch register for the fastpath.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
// Cross-lane byte rearrange for 512-bit vectors: dst[i] = src[shuffle[i] & 0x3F].
// VPSHUFB only shuffles within 128-bit lanes, so the 64-byte index space is
// processed in four passes, one per broadcast 128-bit source lane, each pass
// merging only the lanes whose indices fall in its 16-index window.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);    // xtmp2 = broadcast(32)
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);  // xtmp1 = broadcast(48)
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);    // xtmp2 = broadcast(64)
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6594 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6595 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6596 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6597 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6598 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6599 } 6600 6601 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6602 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6603 if (vlen_enc == AVX_128bit) { 6604 vpermilps(dst, src, shuffle, vlen_enc); 6605 } else if (bt == T_INT) { 6606 vpermd(dst, shuffle, src, vlen_enc); 6607 } else { 6608 assert(bt == T_FLOAT, ""); 6609 vpermps(dst, shuffle, src, vlen_enc); 6610 } 6611 } 6612 6613 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6614 switch(opcode) { 6615 case Op_AddHF: vaddsh(dst, src1, src2); break; 6616 case Op_SubHF: vsubsh(dst, src1, src2); break; 6617 case Op_MulHF: vmulsh(dst, src1, src2); break; 6618 case Op_DivHF: vdivsh(dst, src1, src2); break; 6619 default: assert(false, "%s", NodeClassNames[opcode]); break; 6620 } 6621 } 6622 6623 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6624 switch(elem_bt) { 6625 case T_BYTE: 6626 if (ideal_opc == Op_SaturatingAddV) { 6627 vpaddsb(dst, src1, src2, vlen_enc); 6628 } else { 6629 assert(ideal_opc == Op_SaturatingSubV, ""); 6630 vpsubsb(dst, src1, src2, vlen_enc); 6631 } 6632 break; 6633 case T_SHORT: 6634 if (ideal_opc == Op_SaturatingAddV) { 6635 vpaddsw(dst, src1, src2, vlen_enc); 6636 } else { 6637 assert(ideal_opc == Op_SaturatingSubV, ""); 6638 vpsubsw(dst, src1, src2, vlen_enc); 6639 } 6640 break; 6641 default: 6642 fatal("Unsupported type %s", type2name(elem_bt)); 6643 break; 6644 } 6645 } 6646 6647 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 
// Unsigned saturating add/subtract for byte and short lanes (register operand).
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                      XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Unsigned saturating subtraction for int/long lanes on EVEX targets.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}

// Unsigned saturating subtraction for int/long lanes on AVX targets without
// mask registers; unsigned compare is emulated by biasing both inputs with
// MIN_VALUE and using a signed compare.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = (src2 biased) > (src1 biased), i.e. the underflow mask.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
// Unsigned saturating addition for int/long lanes on EVEX targets.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}

//
// Section 2-13 Hacker's Delight list following overflow detection check for saturating
// unsigned addition operation.
// overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//

// Unsigned saturating addition for int/long lanes on AVX targets without mask
// registers; the unsigned compare above is emulated with a MIN_VALUE bias and
// a signed compare.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value per lane; as a side effect xtmp1 is filled
  // with -1 (all ones), later used as the saturated (unsigned max) result.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Saturate overflowed lanes to all-ones (unsigned max) held in xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
// Emulation of evpmovq2m (move 64-bit lane sign bits into a mask register)
// for EVEX targets that lack AVX512DQ.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // xtmp2 = -1 in all lanes, unless the caller guarantees it already does.
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // Arithmetic shift replicates the sign bit across each lane; negative
    // lanes become -1 and compare equal to xtmp2 below.
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}

// Emulation of evpmovd2m (move 32-bit lane sign bits into a mask register)
// for EVEX targets that lack AVX512DQ.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // xtmp2 = -1 in all lanes, unless the caller guarantees it already does.
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}


// Sign-extends each int/long lane into a full-lane mask: dst lane = 0 or -1
// depending on the sign of the corresponding src lane.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No 64-bit arithmetic shift pre-EVEX: shift the high dwords and then
      // duplicate them into the low dwords of each quadword.
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
// Materializes the maximum signed value (0x7F..FF) in every int/long lane of dst.
// When compute_allones is set, 'allones' is first filled with -1; otherwise the
// caller guarantees it already holds -1 in all lanes.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  // MAX = (unsigned)-1 >> 1.
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

// Materializes the minimum signed value (0x80..00) in every int/long lane of dst.
// Same 'allones' contract as vpgenmax_value.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  // MIN = -1 << (width - 1): only the sign bit remains set.
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}

// Element-type dispatch for EVEX unsigned compare into a mask register.
void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}
// Element-type dispatch for packed signed greater-than compare:
// dst lane = all-ones if src1 > src2, else zero.
void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Moves the per-lane sign bits of an int/long vector into mask register ktmp,
// delegating to the AVX512DQ-or-emulated helpers above.
void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}
// Signed saturating add/subtract for int/long lanes on EVEX targets.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


// Signed saturating add/subtract for int/long lanes on AVX targets without
// mask registers: overflow and polarity masks are materialized as vectors and
// the final selection is done with byte blends.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 in all lanes, feeding the MAX/MIN generators below.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6935 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6936 } 6937 6938 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6939 switch(elem_bt) { 6940 case T_BYTE: 6941 if (ideal_opc == Op_SaturatingAddV) { 6942 vpaddsb(dst, src1, src2, vlen_enc); 6943 } else { 6944 assert(ideal_opc == Op_SaturatingSubV, ""); 6945 vpsubsb(dst, src1, src2, vlen_enc); 6946 } 6947 break; 6948 case T_SHORT: 6949 if (ideal_opc == Op_SaturatingAddV) { 6950 vpaddsw(dst, src1, src2, vlen_enc); 6951 } else { 6952 assert(ideal_opc == Op_SaturatingSubV, ""); 6953 vpsubsw(dst, src1, src2, vlen_enc); 6954 } 6955 break; 6956 default: 6957 fatal("Unsupported type %s", type2name(elem_bt)); 6958 break; 6959 } 6960 } 6961 6962 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6963 switch(elem_bt) { 6964 case T_BYTE: 6965 if (ideal_opc == Op_SaturatingAddV) { 6966 vpaddusb(dst, src1, src2, vlen_enc); 6967 } else { 6968 assert(ideal_opc == Op_SaturatingSubV, ""); 6969 vpsubusb(dst, src1, src2, vlen_enc); 6970 } 6971 break; 6972 case T_SHORT: 6973 if (ideal_opc == Op_SaturatingAddV) { 6974 vpaddusw(dst, src1, src2, vlen_enc); 6975 } else { 6976 assert(ideal_opc == Op_SaturatingSubV, ""); 6977 vpsubusw(dst, src1, src2, vlen_enc); 6978 } 6979 break; 6980 default: 6981 fatal("Unsupported type %s", type2name(elem_bt)); 6982 break; 6983 } 6984 } 6985 6986 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6987 XMMRegister src2, int vlen_enc) { 6988 switch(elem_bt) { 6989 case T_BYTE: 6990 evpermi2b(dst, src1, src2, vlen_enc); 6991 break; 6992 case T_SHORT: 6993 evpermi2w(dst, src1, src2, vlen_enc); 6994 break; 6995 case T_INT: 6996 evpermi2d(dst, src1, src2, vlen_enc); 6997 break; 6998 case T_LONG: 6999 evpermi2q(dst, src1, src2, vlen_enc); 7000 break; 7001 case 
T_FLOAT: 7002 evpermi2ps(dst, src1, src2, vlen_enc); 7003 break; 7004 case T_DOUBLE: 7005 evpermi2pd(dst, src1, src2, vlen_enc); 7006 break; 7007 default: 7008 fatal("Unsupported type %s", type2name(elem_bt)); 7009 break; 7010 } 7011 } 7012 7013 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 7014 if (is_unsigned) { 7015 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7016 } else { 7017 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7018 } 7019 } 7020 7021 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 7022 if (is_unsigned) { 7023 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7024 } else { 7025 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 7026 } 7027 } 7028 7029 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 7030 switch(opcode) { 7031 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7032 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7033 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7034 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7035 default: assert(false, "%s", NodeClassNames[opcode]); break; 7036 } 7037 } 7038 7039 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 7040 switch(opcode) { 7041 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 7042 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 7043 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 7044 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 7045 default: assert(false, "%s", NodeClassNames[opcode]); break; 7046 } 7047 } 7048 7049 void 
// Scalar Float16 max/min: implemented via the vector form on a 128-bit encoding.
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

// Float16 vector max/min with Java semantics for signed zeros and NaNs.
// The inputs are swapped based on a sign test so that the hardware
// vmaxph/vminph tie-breaking rule produces the Java-required result, and a
// final masked move propagates NaNs from the first operand.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}