//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding, concrete register as VMReg );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

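// As an illustration of the format above (see the definitions that follow),
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares the first 32-bit slot of xmm0 as save-on-call for both the VM and
// the C calling conventions, spilled/restored via Op_RegF, with hardware
// encoding 0, backed by the VMReg returned by xmm0->as_VMReg().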
// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters
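// Note on the definitions below: each 512-bit register is described as 16
// consecutive 32-bit slots, XMMn (slot a) through XMMnp (slot p), backed by
// xmmN->as_VMReg() and its next(1)..next(15) successors.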
reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
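// Note: each reg_class_dynamic below selects its first (EVEX) register set
// when the trailing predicate evaluates to true at runtime, and falls back
// to the legacy (pre-EVEX) set otherwise.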

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  930 // Class for all 128bit vector registers
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
  948 // Class for all 128bit vector registers
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for all 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for all 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5 byte instructions plus one move for unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
  // Push "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
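  // For example, a 512-bit vector of BYTE elements (vlen == 64) is only accepted here
  // when AVX512BW is available and MaxVectorSize >= 64.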
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
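  // e.g. vlen == 8 with bt == T_INT gives size_in_bits == 8 * 4 * 8 == 256.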
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
      } // fallthrough
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
      if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1835          return false;
 1836        }
 1837        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1838          return false;
 1839        }
 1840        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1841          return false;
 1842        }
 1843        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1844          return false;
 1845        }
 1846        break;
 1847     case Op_MaskAll:
 1848       if (!VM_Version::supports_evex()) {
 1849         return false;
 1850       }
 1851       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1852         return false;
 1853       }
 1854       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1855         return false;
 1856       }
 1857       break;
 1858     case Op_VectorMaskCmp:
 1859       if (vlen < 2 || size_in_bits < 32) {
 1860         return false;
 1861       }
 1862       break;
 1863     case Op_CompressM:
 1864       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1865         return false;
 1866       }
 1867       break;
 1868     case Op_CompressV:
 1869     case Op_ExpandV:
 1870       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1871         return false;
 1872       }
      if (size_in_bits < 128) {
 1874         return false;
 1875       }
 1876     case Op_VectorLongToMask:
 1877       if (UseAVX < 1) {
 1878         return false;
 1879       }
 1880       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1881         return false;
 1882       }
 1883       break;
 1884     case Op_SignumVD:
 1885     case Op_SignumVF:
 1886       if (UseAVX < 1) {
 1887         return false;
 1888       }
 1889       break;
 1890     case Op_PopCountVI:
 1891     case Op_PopCountVL: {
 1892         if (!is_pop_count_instr_target(bt) &&
 1893             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1894           return false;
 1895         }
 1896       }
 1897       break;
 1898     case Op_ReverseV:
 1899     case Op_ReverseBytesV:
 1900       if (UseAVX < 2) {
 1901         return false;
 1902       }
 1903       break;
 1904     case Op_CountTrailingZerosV:
 1905     case Op_CountLeadingZerosV:
 1906       if (UseAVX < 2) {
 1907         return false;
 1908       }
 1909       break;
 1910   }
 1911   return true;  // Per default match rules are supported.
 1912 }
 1913 
 1914 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most of the unary/binary/ternary masked operations share
  // the IR nodes of their non-masked counterparts, with the mask edge being the
  // differentiator. This routine does a strict check on the existence of masked
  // operation patterns by returning false for all opcodes apart from the ones whose
  // masked instruction patterns are defined in this file.
 1921   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1922     return false;
 1923   }
 1924 
 1925   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
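  // Masked operations on vectors shorter than 512 bits additionally require AVX512VL.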
 1926   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1927     return false;
 1928   }
 1929   switch(opcode) {
 1930     // Unary masked operations
 1931     case Op_AbsVB:
 1932     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      } // fallthrough
 1936     case Op_AbsVI:
 1937     case Op_AbsVL:
 1938       return true;
 1939 
 1940     // Ternary masked operations
 1941     case Op_FmaVF:
 1942     case Op_FmaVD:
 1943       return true;
 1944 
 1945     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 1947         return false;
 1948       }
 1949       return true;
 1950 
 1951     // Binary masked operations
 1952     case Op_AddVB:
 1953     case Op_AddVS:
 1954     case Op_SubVB:
 1955     case Op_SubVS:
 1956     case Op_MulVS:
 1957     case Op_LShiftVS:
 1958     case Op_RShiftVS:
 1959     case Op_URShiftVS:
 1960       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1961       if (!VM_Version::supports_avx512bw()) {
 1962         return false;  // Implementation limitation
 1963       }
 1964       return true;
 1965 
 1966     case Op_MulVL:
 1967       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1968       if (!VM_Version::supports_avx512dq()) {
 1969         return false;  // Implementation limitation
 1970       }
 1971       return true;
 1972 
 1973     case Op_AndV:
 1974     case Op_OrV:
 1975     case Op_XorV:
 1976     case Op_RotateRightV:
 1977     case Op_RotateLeftV:
 1978       if (bt != T_INT && bt != T_LONG) {
 1979         return false; // Implementation limitation
 1980       }
 1981       return true;
 1982 
 1983     case Op_VectorLoadMask:
 1984       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1985       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1986         return false;
 1987       }
 1988       return true;
 1989 
 1990     case Op_AddVI:
 1991     case Op_AddVL:
 1992     case Op_AddVF:
 1993     case Op_AddVD:
 1994     case Op_SubVI:
 1995     case Op_SubVL:
 1996     case Op_SubVF:
 1997     case Op_SubVD:
 1998     case Op_MulVI:
 1999     case Op_MulVF:
 2000     case Op_MulVD:
 2001     case Op_DivVF:
 2002     case Op_DivVD:
 2003     case Op_SqrtVF:
 2004     case Op_SqrtVD:
 2005     case Op_LShiftVI:
 2006     case Op_LShiftVL:
 2007     case Op_RShiftVI:
 2008     case Op_RShiftVL:
 2009     case Op_URShiftVI:
 2010     case Op_URShiftVL:
 2011     case Op_LoadVectorMasked:
 2012     case Op_StoreVectorMasked:
 2013     case Op_LoadVectorGatherMasked:
 2014     case Op_StoreVectorScatterMasked:
 2015       return true;
 2016 
 2017     case Op_UMinV:
 2018     case Op_UMaxV:
 2019       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2020         return false;
 2021       } // fallthrough
 2022     case Op_MaxV:
 2023     case Op_MinV:
 2024       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2025         return false; // Implementation limitation
 2026       }
 2027       if (is_floating_point_type(bt) && !VM_Version::supports_avx10_2()) {
 2028         return false; // Implementation limitation
 2029       }
 2030       return true;
 2031     case Op_SaturatingAddV:
 2032     case Op_SaturatingSubV:
 2033       if (!is_subword_type(bt)) {
 2034         return false;
 2035       }
 2036       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2037         return false; // Implementation limitation
 2038       }
 2039       return true;
 2040 
 2041     case Op_VectorMaskCmp:
 2042       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2043         return false; // Implementation limitation
 2044       }
 2045       return true;
 2046 
 2047     case Op_VectorRearrange:
 2048       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2049         return false; // Implementation limitation
 2050       }
 2051       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2052         return false; // Implementation limitation
 2053       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2054         return false; // Implementation limitation
 2055       }
 2056       return true;
 2057 
 2058     // Binary Logical operations
 2059     case Op_AndVMask:
 2060     case Op_OrVMask:
 2061     case Op_XorVMask:
 2062       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2063         return false; // Implementation limitation
 2064       }
 2065       return true;
 2066 
 2067     case Op_PopCountVI:
 2068     case Op_PopCountVL:
 2069       if (!is_pop_count_instr_target(bt)) {
 2070         return false;
 2071       }
 2072       return true;
 2073 
 2074     case Op_MaskAll:
 2075       return true;
 2076 
 2077     case Op_CountLeadingZerosV:
 2078       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2079         return true;
 2080       }
 2081     default:
 2082       return false;
 2083   }
 2084 }
 2085 
 2086 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2087   return false;
 2088 }
 2089 
 2090 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2091 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2092   switch (elem_bt) {
 2093     case T_BYTE:  return false;
 2094     case T_SHORT: return !VM_Version::supports_avx512bw();
 2095     case T_INT:   return !VM_Version::supports_avx();
 2096     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2097     default:
 2098       ShouldNotReachHere();
 2099       return false;
 2100   }
 2101 }
 2102 
 2103 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2104   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2105   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2106   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2107       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2108     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2109     return new legVecZOper();
 2110   }
 2111   if (legacy) {
 2112     switch (ideal_reg) {
 2113       case Op_VecS: return new legVecSOper();
 2114       case Op_VecD: return new legVecDOper();
 2115       case Op_VecX: return new legVecXOper();
 2116       case Op_VecY: return new legVecYOper();
 2117       case Op_VecZ: return new legVecZOper();
 2118     }
 2119   } else {
 2120     switch (ideal_reg) {
 2121       case Op_VecS: return new vecSOper();
 2122       case Op_VecD: return new vecDOper();
 2123       case Op_VecX: return new vecXOper();
 2124       case Op_VecY: return new vecYOper();
 2125       case Op_VecZ: return new vecZOper();
 2126     }
 2127   }
 2128   ShouldNotReachHere();
 2129   return nullptr;
 2130 }
 2131 
 2132 bool Matcher::is_reg2reg_move(MachNode* m) {
 2133   switch (m->rule()) {
 2134     case MoveVec2Leg_rule:
 2135     case MoveLeg2Vec_rule:
 2136     case MoveF2VL_rule:
 2137     case MoveF2LEG_rule:
 2138     case MoveVL2F_rule:
 2139     case MoveLEG2F_rule:
 2140     case MoveD2VL_rule:
 2141     case MoveD2LEG_rule:
 2142     case MoveVL2D_rule:
 2143     case MoveLEG2D_rule:
 2144       return true;
 2145     default:
 2146       return false;
 2147   }
 2148 }
 2149 
 2150 bool Matcher::is_generic_vector(MachOper* opnd) {
 2151   switch (opnd->opcode()) {
 2152     case VEC:
 2153     case LEGVEC:
 2154       return true;
 2155     default:
 2156       return false;
 2157   }
 2158 }
 2159 
 2160 //------------------------------------------------------------------------
 2161 
 2162 const RegMask* Matcher::predicate_reg_mask(void) {
 2163   return &_VECTMASK_REG_mask;
 2164 }
 2165 
 2166 // Max vector size in bytes. 0 if not supported.
 2167 int Matcher::vector_width_in_bytes(BasicType bt) {
 2168   assert(is_java_primitive(bt), "only primitive type vectors");
 2169   // SSE2 supports 128bit vectors for all types.
 2170   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
 2172   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
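  // e.g. UseAVX == 2 gives (1 << 2) * 8 == 32 bytes, UseAVX == 3 gives 64 bytes.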
 2173   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2174   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2175     size = (UseAVX > 2) ? 64 : 32;
 2176   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2177     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2178   // Use flag to limit vector size.
 2179   size = MIN2(size,(int)MaxVectorSize);
 2180   // Minimum 2 values in vector (or 4 for bytes).
 2181   switch (bt) {
 2182   case T_DOUBLE:
 2183   case T_LONG:
 2184     if (size < 16) return 0;
 2185     break;
 2186   case T_FLOAT:
 2187   case T_INT:
 2188     if (size < 8) return 0;
 2189     break;
 2190   case T_BOOLEAN:
 2191     if (size < 4) return 0;
 2192     break;
 2193   case T_CHAR:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_BYTE:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_SHORT:
 2200     if (size < 4) return 0;
 2201     break;
 2202   default:
 2203     ShouldNotReachHere();
 2204   }
 2205   return size;
 2206 }
 2207 
 2208 // Limits on vector size (number of elements) loaded into vector.
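// For example, 32-byte (AVX2) vectors of T_INT hold 32 / 4 == 8 elements.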
 2209 int Matcher::max_vector_size(const BasicType bt) {
 2210   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2211 }
 2212 int Matcher::min_vector_size(const BasicType bt) {
 2213   int max_size = max_vector_size(bt);
 2214   // Min size which can be loaded into vector is 4 bytes.
 2215   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2216   // Support for calling svml double64 vectors
 2217   if (bt == T_DOUBLE) {
 2218     size = 1;
 2219   }
 2220   return MIN2(size,max_size);
 2221 }
 2222 
 2223 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2224   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2225   // by default on Cascade Lake
 2226   if (VM_Version::is_default_intel_cascade_lake()) {
 2227     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2228   }
 2229   return Matcher::max_vector_size(bt);
 2230 }
 2231 
 2232 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2233   return -1;
 2234 }
 2235 
 2236 // Vector ideal reg corresponding to specified size in bytes
 2237 uint Matcher::vector_ideal_reg(int size) {
 2238   assert(MaxVectorSize >= size, "");
 2239   switch(size) {
 2240     case  4: return Op_VecS;
 2241     case  8: return Op_VecD;
 2242     case 16: return Op_VecX;
 2243     case 32: return Op_VecY;
 2244     case 64: return Op_VecZ;
 2245   }
 2246   ShouldNotReachHere();
 2247   return 0;
 2248 }
 2249 
 2250 // Check for shift by small constant as well
 2251 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2252   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2253       shift->in(2)->get_int() <= 3 &&
 2254       // Are there other uses besides address expressions?
 2255       !matcher->is_visited(shift)) {
 2256     address_visited.set(shift->_idx); // Flag as address_visited
 2257     mstack.push(shift->in(2), Matcher::Visit);
 2258     Node *conv = shift->in(1);
    // Allow the Matcher to match the rule that bypasses
    // the ConvI2L operation for an array index on LP64
    // if the index value is positive.
 2262     if (conv->Opcode() == Op_ConvI2L &&
 2263         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2264         // Are there other uses besides address expressions?
 2265         !matcher->is_visited(conv)) {
 2266       address_visited.set(conv->_idx); // Flag as address_visited
 2267       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2268     } else {
 2269       mstack.push(conv, Matcher::Pre_Visit);
 2270     }
 2271     return true;
 2272   }
 2273   return false;
 2274 }
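
// For illustration: a long[] element access typically yields an AddP whose
// offset input is (LShiftX (ConvI2L i) 3). Cloning that shift (and bypassing
// the ConvI2L when the index is known to be non-negative) lets the matcher
// fold it into a single [base + i*8 + disp] addressing mode instead of first
// computing the scaled index into a register.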
 2275 
// This function identifies sub-graphs in which a 'load' node is
// an input to two different nodes and can be matched with BMI
// instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2280 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2281 // refers to the same node.
 2282 //
 2283 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2284 // This is a temporary solution until we make DAGs expressible in ADL.
 2285 template<typename ConType>
 2286 class FusedPatternMatcher {
 2287   Node* _op1_node;
 2288   Node* _mop_node;
 2289   int _con_op;
 2290 
 2291   static int match_next(Node* n, int next_op, int next_op_idx) {
 2292     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2293       return -1;
 2294     }
 2295 
    if (next_op_idx == -1) { // n is commutative, try both input positions
 2297       if (n->in(1)->Opcode() == next_op) {
 2298         return 1;
 2299       } else if (n->in(2)->Opcode() == next_op) {
 2300         return 2;
 2301       }
 2302     } else {
 2303       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2304       if (n->in(next_op_idx)->Opcode() == next_op) {
 2305         return next_op_idx;
 2306       }
 2307     }
 2308     return -1;
 2309   }
 2310 
 2311  public:
 2312   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2313     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2314 
 2315   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2316              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2317              typename ConType::NativeType con_value) {
 2318     if (_op1_node->Opcode() != op1) {
 2319       return false;
 2320     }
 2321     if (_mop_node->outcnt() > 2) {
 2322       return false;
 2323     }
 2324     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2325     if (op1_op2_idx == -1) {
 2326       return false;
 2327     }
 2328     // Memory operation must be the other edge
 2329     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2330 
 2331     // Check that the mop node is really what we want
 2332     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2333       Node* op2_node = _op1_node->in(op1_op2_idx);
 2334       if (op2_node->outcnt() > 1) {
 2335         return false;
 2336       }
 2337       assert(op2_node->Opcode() == op2, "Should be");
 2338       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2339       if (op2_con_idx == -1) {
 2340         return false;
 2341       }
 2342       // Memory operation must be the other edge
 2343       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2344       // Check that the memory operation is the same node
 2345       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2346         // Now check the constant
 2347         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2348         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2349           return true;
 2350         }
 2351       }
 2352     }
 2353     return false;
 2354   }
 2355 };
 2356 
 2357 static bool is_bmi_pattern(Node* n, Node* m) {
 2358   assert(UseBMI1Instructions, "sanity");
 2359   if (n != nullptr && m != nullptr) {
 2360     if (m->Opcode() == Op_LoadI) {
 2361       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2362       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2363              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2364              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2365     } else if (m->Opcode() == Op_LoadL) {
 2366       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2367       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2368              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2369              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2370     }
 2371   }
 2372   return false;
 2373 }
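
// For illustration, the shapes accepted above correspond to the BMI1 idioms
//   x & -x       -> blsi   (isolate lowest set bit)
//   x & (x - 1)  -> blsr   (reset lowest set bit)
//   x ^ (x - 1)  -> blsmsk (mask up to and including the lowest set bit)
// where x is the loaded value, so the load can feed the instruction's memory
// operand directly.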
 2374 
 2375 // Should the matcher clone input 'm' of node 'n'?
 2376 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2377   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2378   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2379     mstack.push(m, Visit);
 2380     return true;
 2381   }
 2382   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2383     mstack.push(m, Visit);           // m = ShiftCntV
 2384     return true;
 2385   }
 2386   if (is_encode_and_store_pattern(n, m)) {
 2387     mstack.push(m, Visit);
 2388     return true;
 2389   }
 2390   return false;
 2391 }
 2392 
 2393 // Should the Matcher clone shifts on addressing modes, expecting them
 2394 // to be subsumed into complex addressing expressions or compute them
 2395 // into registers?
 2396 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2397   Node *off = m->in(AddPNode::Offset);
 2398   if (off->is_Con()) {
 2399     address_visited.test_set(m->_idx); // Flag as address_visited
 2400     Node *adr = m->in(AddPNode::Address);
 2401 
 2402     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2403     // AtomicAdd is not an addressing expression.
 2404     // Cheap to find it by looking for screwy base.
 2405     if (adr->is_AddP() &&
 2406         !adr->in(AddPNode::Base)->is_top() &&
 2407         !adr->in(AddPNode::Offset)->is_Con() &&
 2408         off->get_long() == (int) (off->get_long()) && // immL32
 2409         // Are there other uses besides address expressions?
 2410         !is_visited(adr)) {
 2411       address_visited.set(adr->_idx); // Flag as address_visited
 2412       Node *shift = adr->in(AddPNode::Offset);
 2413       if (!clone_shift(shift, this, mstack, address_visited)) {
 2414         mstack.push(shift, Pre_Visit);
 2415       }
 2416       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2417       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2418     } else {
 2419       mstack.push(adr, Pre_Visit);
 2420     }
 2421 
 2422     // Clone X+offset as it also folds into most addressing expressions
 2423     mstack.push(off, Visit);
 2424     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2425     return true;
 2426   } else if (clone_shift(off, this, mstack, address_visited)) {
 2427     address_visited.test_set(m->_idx); // Flag as address_visited
 2428     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2429     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2430     return true;
 2431   }
 2432   return false;
 2433 }
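
// For illustration: cloning both the constant offset and a nested AddP/shift
// lets a subtree of the shape (AddP base (AddP base shifted_index) #disp) be
// subsumed into one x86 addressing mode [base + index*scale + disp32],
// provided the displacement fits in 32 bits (the immL32 check above).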
 2434 
 2435 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2436   switch (bt) {
 2437     case BoolTest::eq:
 2438       return Assembler::eq;
 2439     case BoolTest::ne:
 2440       return Assembler::neq;
 2441     case BoolTest::le:
 2442     case BoolTest::ule:
 2443       return Assembler::le;
 2444     case BoolTest::ge:
 2445     case BoolTest::uge:
 2446       return Assembler::nlt;
 2447     case BoolTest::lt:
 2448     case BoolTest::ult:
 2449       return Assembler::lt;
 2450     case BoolTest::gt:
 2451     case BoolTest::ugt:
 2452       return Assembler::nle;
 2453     default : ShouldNotReachHere(); return Assembler::_false;
 2454   }
 2455 }
 2456 
 2457 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2458   switch (bt) {
 2459   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2460   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2461   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2462   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2463   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2464   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2465   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2466   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2467   }
 2468 }
 2469 
 2470 // Helper methods for MachSpillCopyNode::implementation().
 2471 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2472                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2473   assert(ireg == Op_VecS || // 32bit vector
 2474          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2475           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2476          "no non-adjacent vector moves" );
 2477   if (masm) {
 2478     switch (ireg) {
 2479     case Op_VecS: // copy whole register
 2480     case Op_VecD:
 2481     case Op_VecX:
 2482       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2483         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2484       } else {
 2485         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2487       break;
 2488     case Op_VecY:
 2489       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2490         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2491       } else {
 2492         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2494       break;
 2495     case Op_VecZ:
 2496       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2497       break;
 2498     default:
 2499       ShouldNotReachHere();
 2500     }
 2501 #ifndef PRODUCT
 2502   } else {
 2503     switch (ireg) {
 2504     case Op_VecS:
 2505     case Op_VecD:
 2506     case Op_VecX:
 2507       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2508       break;
 2509     case Op_VecY:
 2510     case Op_VecZ:
 2511       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2512       break;
 2513     default:
 2514       ShouldNotReachHere();
 2515     }
 2516 #endif
 2517   }
 2518 }
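
// For illustration: when UseAVX > 2 but AVX512VL is not available, the upper
// bank of XMM registers (xmm16-xmm31) cannot be encoded by the legacy/VEX
// 128- and 256-bit moves, so the copies above fall back to 512-bit EVEX
// extracts (vextractf32x4 / vextractf64x4) of lane 0.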
 2519 
 2520 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2521                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2522   if (masm) {
 2523     if (is_load) {
 2524       switch (ireg) {
 2525       case Op_VecS:
 2526         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2527         break;
 2528       case Op_VecD:
 2529         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecX:
 2532         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2533           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2534         } else {
 2535           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2537         }
 2538         break;
 2539       case Op_VecY:
 2540         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2541           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2542         } else {
 2543           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2545         }
 2546         break;
 2547       case Op_VecZ:
 2548         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2549         break;
 2550       default:
 2551         ShouldNotReachHere();
 2552       }
 2553     } else { // store
 2554       switch (ireg) {
 2555       case Op_VecS:
 2556         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2557         break;
 2558       case Op_VecD:
 2559         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecX:
 2562         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2563           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2564         }
 2565         else {
 2566           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2567         }
 2568         break;
 2569       case Op_VecY:
 2570         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2571           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2572         }
 2573         else {
 2574           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2575         }
 2576         break;
 2577       case Op_VecZ:
 2578         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2579         break;
 2580       default:
 2581         ShouldNotReachHere();
 2582       }
 2583     }
 2584 #ifndef PRODUCT
 2585   } else {
 2586     if (is_load) {
 2587       switch (ireg) {
 2588       case Op_VecS:
 2589         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2590         break;
 2591       case Op_VecD:
 2592         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
 2594        case Op_VecX:
 2595         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597       case Op_VecY:
 2598       case Op_VecZ:
 2599         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2600         break;
 2601       default:
 2602         ShouldNotReachHere();
 2603       }
 2604     } else { // store
 2605       switch (ireg) {
 2606       case Op_VecS:
 2607         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2608         break;
 2609       case Op_VecD:
 2610         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
 2612        case Op_VecX:
 2613         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615       case Op_VecY:
 2616       case Op_VecZ:
 2617         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2618         break;
 2619       default:
 2620         ShouldNotReachHere();
 2621       }
 2622     }
 2623 #endif
 2624   }
 2625 }
 2626 
 2627 template <class T>
 2628 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2629   int size = type2aelembytes(bt) * len;
 2630   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2631   for (int i = 0; i < len; i++) {
 2632     int offset = i * type2aelembytes(bt);
 2633     switch (bt) {
 2634       case T_BYTE: val->at(i) = con; break;
 2635       case T_SHORT: {
 2636         jshort c = con;
 2637         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2638         break;
 2639       }
 2640       case T_INT: {
 2641         jint c = con;
 2642         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2643         break;
 2644       }
 2645       case T_LONG: {
 2646         jlong c = con;
 2647         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2648         break;
 2649       }
 2650       case T_FLOAT: {
 2651         jfloat c = con;
 2652         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2653         break;
 2654       }
 2655       case T_DOUBLE: {
 2656         jdouble c = con;
 2657         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2658         break;
 2659       }
 2660       default: assert(false, "%s", type2name(bt));
 2661     }
 2662   }
 2663   return val;
 2664 }
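
// For illustration: vreplicate_imm(T_SHORT, (jshort)1, 4) fills an 8-byte
// array with the little-endian pattern 01 00 01 00 01 00 01 00, i.e. the
// immediate broadcast across all four lanes, which can then be emitted as a
// constant-table entry.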
 2665 
 2666 static inline jlong high_bit_set(BasicType bt) {
 2667   switch (bt) {
 2668     case T_BYTE:  return 0x8080808080808080;
 2669     case T_SHORT: return 0x8000800080008000;
 2670     case T_INT:   return 0x8000000080000000;
 2671     case T_LONG:  return 0x8000000000000000;
 2672     default:
 2673       ShouldNotReachHere();
 2674       return 0;
 2675   }
 2676 }
 2677 
 2678 #ifndef PRODUCT
 2679   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2680     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2681   }
 2682 #endif
 2683 
 2684   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2685     __ nop(_count);
 2686   }
 2687 
 2688   uint MachNopNode::size(PhaseRegAlloc*) const {
 2689     return _count;
 2690   }
 2691 
 2692 #ifndef PRODUCT
 2693   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2694     st->print("# breakpoint");
 2695   }
 2696 #endif
 2697 
 2698   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2699     __ int3();
 2700   }
 2701 
 2702   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2703     return MachNode::size(ra_);
 2704   }
 2705 
 2706 %}
 2707 
 2708 encode %{
 2709 
 2710   enc_class call_epilog %{
 2711     if (VerifyStackAtCalls) {
 2712       // Check that stack depth is unchanged: find majik cookie on stack
 2713       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2714       Label L;
 2715       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2716       __ jccb(Assembler::equal, L);
 2717       // Die if stack mismatch
 2718       __ int3();
 2719       __ bind(L);
 2720     }
 2721   %}
 2722 
 2723 %}
 2724 
// Operands for bound floating-point register arguments
 2726 operand rxmm0() %{
 2727   constraint(ALLOC_IN_RC(xmm0_reg));
 2728   match(VecX);
 2729   format%{%}
 2730   interface(REG_INTER);
 2731 %}
 2732 
 2733 //----------OPERANDS-----------------------------------------------------------
 2734 // Operand definitions must precede instruction definitions for correct parsing
 2735 // in the ADLC because operands constitute user defined types which are used in
 2736 // instruction definitions.
 2737 
 2738 // Vectors
 2739 
 2740 // Dummy generic vector class. Should be used for all vector operands.
 2741 // Replaced with vec[SDXYZ] during post-selection pass.
 2742 operand vec() %{
 2743   constraint(ALLOC_IN_RC(dynamic));
 2744   match(VecX);
 2745   match(VecY);
 2746   match(VecZ);
 2747   match(VecS);
 2748   match(VecD);
 2749 
 2750   format %{ %}
 2751   interface(REG_INTER);
 2752 %}
 2753 
 2754 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2755 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2756 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2757 // runtime code generation via reg_class_dynamic.
 2758 operand legVec() %{
 2759   constraint(ALLOC_IN_RC(dynamic));
 2760   match(VecX);
 2761   match(VecY);
 2762   match(VecZ);
 2763   match(VecS);
 2764   match(VecD);
 2765 
 2766   format %{ %}
 2767   interface(REG_INTER);
 2768 %}
 2769 
 2770 // Replaces vec during post-selection cleanup. See above.
 2771 operand vecS() %{
 2772   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2773   match(VecS);
 2774 
 2775   format %{ %}
 2776   interface(REG_INTER);
 2777 %}
 2778 
 2779 // Replaces legVec during post-selection cleanup. See above.
 2780 operand legVecS() %{
 2781   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2782   match(VecS);
 2783 
 2784   format %{ %}
 2785   interface(REG_INTER);
 2786 %}
 2787 
 2788 // Replaces vec during post-selection cleanup. See above.
 2789 operand vecD() %{
 2790   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2791   match(VecD);
 2792 
 2793   format %{ %}
 2794   interface(REG_INTER);
 2795 %}
 2796 
 2797 // Replaces legVec during post-selection cleanup. See above.
 2798 operand legVecD() %{
 2799   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2800   match(VecD);
 2801 
 2802   format %{ %}
 2803   interface(REG_INTER);
 2804 %}
 2805 
 2806 // Replaces vec during post-selection cleanup. See above.
 2807 operand vecX() %{
 2808   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2809   match(VecX);
 2810 
 2811   format %{ %}
 2812   interface(REG_INTER);
 2813 %}
 2814 
 2815 // Replaces legVec during post-selection cleanup. See above.
 2816 operand legVecX() %{
 2817   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2818   match(VecX);
 2819 
 2820   format %{ %}
 2821   interface(REG_INTER);
 2822 %}
 2823 
 2824 // Replaces vec during post-selection cleanup. See above.
 2825 operand vecY() %{
 2826   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2827   match(VecY);
 2828 
 2829   format %{ %}
 2830   interface(REG_INTER);
 2831 %}
 2832 
 2833 // Replaces legVec during post-selection cleanup. See above.
 2834 operand legVecY() %{
 2835   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2836   match(VecY);
 2837 
 2838   format %{ %}
 2839   interface(REG_INTER);
 2840 %}
 2841 
 2842 // Replaces vec during post-selection cleanup. See above.
 2843 operand vecZ() %{
 2844   constraint(ALLOC_IN_RC(vectorz_reg));
 2845   match(VecZ);
 2846 
 2847   format %{ %}
 2848   interface(REG_INTER);
 2849 %}
 2850 
 2851 // Replaces legVec during post-selection cleanup. See above.
 2852 operand legVecZ() %{
 2853   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2854   match(VecZ);
 2855 
 2856   format %{ %}
 2857   interface(REG_INTER);
 2858 %}
 2859 
 2860 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2861 
 2862 // ============================================================================
 2863 
 2864 instruct ShouldNotReachHere() %{
 2865   match(Halt);
 2866   format %{ "stop\t# ShouldNotReachHere" %}
 2867   ins_encode %{
 2868     if (is_reachable()) {
 2869       const char* str = __ code_string(_halt_reason);
 2870       __ stop(str);
 2871     }
 2872   %}
 2873   ins_pipe(pipe_slow);
 2874 %}
 2875 
 2876 // ============================================================================
 2877 
 2878 instruct addF_reg(regF dst, regF src) %{
 2879   predicate(UseAVX == 0);
 2880   match(Set dst (AddF dst src));
 2881 
 2882   format %{ "addss   $dst, $src" %}
 2883   ins_cost(150);
 2884   ins_encode %{
 2885     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2886   %}
 2887   ins_pipe(pipe_slow);
 2888 %}
 2889 
 2890 instruct addF_mem(regF dst, memory src) %{
 2891   predicate(UseAVX == 0);
 2892   match(Set dst (AddF dst (LoadF src)));
 2893 
 2894   format %{ "addss   $dst, $src" %}
 2895   ins_cost(150);
 2896   ins_encode %{
 2897     __ addss($dst$$XMMRegister, $src$$Address);
 2898   %}
 2899   ins_pipe(pipe_slow);
 2900 %}
 2901 
 2902 instruct addF_imm(regF dst, immF con) %{
 2903   predicate(UseAVX == 0);
 2904   match(Set dst (AddF dst con));
 2905   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2906   ins_cost(150);
 2907   ins_encode %{
 2908     __ addss($dst$$XMMRegister, $constantaddress($con));
 2909   %}
 2910   ins_pipe(pipe_slow);
 2911 %}
 2912 
 2913 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2914   predicate(UseAVX > 0);
 2915   match(Set dst (AddF src1 src2));
 2916 
 2917   format %{ "vaddss  $dst, $src1, $src2" %}
 2918   ins_cost(150);
 2919   ins_encode %{
 2920     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2921   %}
 2922   ins_pipe(pipe_slow);
 2923 %}
 2924 
 2925 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2926   predicate(UseAVX > 0);
 2927   match(Set dst (AddF src1 (LoadF src2)));
 2928 
 2929   format %{ "vaddss  $dst, $src1, $src2" %}
 2930   ins_cost(150);
 2931   ins_encode %{
 2932     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2933   %}
 2934   ins_pipe(pipe_slow);
 2935 %}
 2936 
 2937 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2938   predicate(UseAVX > 0);
 2939   match(Set dst (AddF src con));
 2940 
 2941   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2942   ins_cost(150);
 2943   ins_encode %{
 2944     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2945   %}
 2946   ins_pipe(pipe_slow);
 2947 %}
 2948 
 2949 instruct addD_reg(regD dst, regD src) %{
 2950   predicate(UseAVX == 0);
 2951   match(Set dst (AddD dst src));
 2952 
 2953   format %{ "addsd   $dst, $src" %}
 2954   ins_cost(150);
 2955   ins_encode %{
 2956     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2957   %}
 2958   ins_pipe(pipe_slow);
 2959 %}
 2960 
 2961 instruct addD_mem(regD dst, memory src) %{
 2962   predicate(UseAVX == 0);
 2963   match(Set dst (AddD dst (LoadD src)));
 2964 
 2965   format %{ "addsd   $dst, $src" %}
 2966   ins_cost(150);
 2967   ins_encode %{
 2968     __ addsd($dst$$XMMRegister, $src$$Address);
 2969   %}
 2970   ins_pipe(pipe_slow);
 2971 %}
 2972 
 2973 instruct addD_imm(regD dst, immD con) %{
 2974   predicate(UseAVX == 0);
 2975   match(Set dst (AddD dst con));
 2976   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 2977   ins_cost(150);
 2978   ins_encode %{
 2979     __ addsd($dst$$XMMRegister, $constantaddress($con));
 2980   %}
 2981   ins_pipe(pipe_slow);
 2982 %}
 2983 
 2984 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 2985   predicate(UseAVX > 0);
 2986   match(Set dst (AddD src1 src2));
 2987 
 2988   format %{ "vaddsd  $dst, $src1, $src2" %}
 2989   ins_cost(150);
 2990   ins_encode %{
 2991     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2992   %}
 2993   ins_pipe(pipe_slow);
 2994 %}
 2995 
 2996 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 2997   predicate(UseAVX > 0);
 2998   match(Set dst (AddD src1 (LoadD src2)));
 2999 
 3000   format %{ "vaddsd  $dst, $src1, $src2" %}
 3001   ins_cost(150);
 3002   ins_encode %{
 3003     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3004   %}
 3005   ins_pipe(pipe_slow);
 3006 %}
 3007 
 3008 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3009   predicate(UseAVX > 0);
 3010   match(Set dst (AddD src con));
 3011 
 3012   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3013   ins_cost(150);
 3014   ins_encode %{
 3015     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3016   %}
 3017   ins_pipe(pipe_slow);
 3018 %}
 3019 
 3020 instruct subF_reg(regF dst, regF src) %{
 3021   predicate(UseAVX == 0);
 3022   match(Set dst (SubF dst src));
 3023 
 3024   format %{ "subss   $dst, $src" %}
 3025   ins_cost(150);
 3026   ins_encode %{
 3027     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3028   %}
 3029   ins_pipe(pipe_slow);
 3030 %}
 3031 
 3032 instruct subF_mem(regF dst, memory src) %{
 3033   predicate(UseAVX == 0);
 3034   match(Set dst (SubF dst (LoadF src)));
 3035 
 3036   format %{ "subss   $dst, $src" %}
 3037   ins_cost(150);
 3038   ins_encode %{
 3039     __ subss($dst$$XMMRegister, $src$$Address);
 3040   %}
 3041   ins_pipe(pipe_slow);
 3042 %}
 3043 
 3044 instruct subF_imm(regF dst, immF con) %{
 3045   predicate(UseAVX == 0);
 3046   match(Set dst (SubF dst con));
 3047   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3048   ins_cost(150);
 3049   ins_encode %{
 3050     __ subss($dst$$XMMRegister, $constantaddress($con));
 3051   %}
 3052   ins_pipe(pipe_slow);
 3053 %}
 3054 
 3055 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3056   predicate(UseAVX > 0);
 3057   match(Set dst (SubF src1 src2));
 3058 
 3059   format %{ "vsubss  $dst, $src1, $src2" %}
 3060   ins_cost(150);
 3061   ins_encode %{
 3062     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3063   %}
 3064   ins_pipe(pipe_slow);
 3065 %}
 3066 
 3067 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3068   predicate(UseAVX > 0);
 3069   match(Set dst (SubF src1 (LoadF src2)));
 3070 
 3071   format %{ "vsubss  $dst, $src1, $src2" %}
 3072   ins_cost(150);
 3073   ins_encode %{
 3074     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3075   %}
 3076   ins_pipe(pipe_slow);
 3077 %}
 3078 
 3079 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3080   predicate(UseAVX > 0);
 3081   match(Set dst (SubF src con));
 3082 
 3083   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3084   ins_cost(150);
 3085   ins_encode %{
 3086     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3087   %}
 3088   ins_pipe(pipe_slow);
 3089 %}
 3090 
 3091 instruct subD_reg(regD dst, regD src) %{
 3092   predicate(UseAVX == 0);
 3093   match(Set dst (SubD dst src));
 3094 
 3095   format %{ "subsd   $dst, $src" %}
 3096   ins_cost(150);
 3097   ins_encode %{
 3098     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3099   %}
 3100   ins_pipe(pipe_slow);
 3101 %}
 3102 
 3103 instruct subD_mem(regD dst, memory src) %{
 3104   predicate(UseAVX == 0);
 3105   match(Set dst (SubD dst (LoadD src)));
 3106 
 3107   format %{ "subsd   $dst, $src" %}
 3108   ins_cost(150);
 3109   ins_encode %{
 3110     __ subsd($dst$$XMMRegister, $src$$Address);
 3111   %}
 3112   ins_pipe(pipe_slow);
 3113 %}
 3114 
 3115 instruct subD_imm(regD dst, immD con) %{
 3116   predicate(UseAVX == 0);
 3117   match(Set dst (SubD dst con));
 3118   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3119   ins_cost(150);
 3120   ins_encode %{
 3121     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3122   %}
 3123   ins_pipe(pipe_slow);
 3124 %}
 3125 
 3126 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3127   predicate(UseAVX > 0);
 3128   match(Set dst (SubD src1 src2));
 3129 
 3130   format %{ "vsubsd  $dst, $src1, $src2" %}
 3131   ins_cost(150);
 3132   ins_encode %{
 3133     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3134   %}
 3135   ins_pipe(pipe_slow);
 3136 %}
 3137 
 3138 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3139   predicate(UseAVX > 0);
 3140   match(Set dst (SubD src1 (LoadD src2)));
 3141 
 3142   format %{ "vsubsd  $dst, $src1, $src2" %}
 3143   ins_cost(150);
 3144   ins_encode %{
 3145     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3146   %}
 3147   ins_pipe(pipe_slow);
 3148 %}
 3149 
 3150 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3151   predicate(UseAVX > 0);
 3152   match(Set dst (SubD src con));
 3153 
 3154   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3155   ins_cost(150);
 3156   ins_encode %{
 3157     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3158   %}
 3159   ins_pipe(pipe_slow);
 3160 %}
 3161 
 3162 instruct mulF_reg(regF dst, regF src) %{
 3163   predicate(UseAVX == 0);
 3164   match(Set dst (MulF dst src));
 3165 
 3166   format %{ "mulss   $dst, $src" %}
 3167   ins_cost(150);
 3168   ins_encode %{
 3169     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3170   %}
 3171   ins_pipe(pipe_slow);
 3172 %}
 3173 
 3174 instruct mulF_mem(regF dst, memory src) %{
 3175   predicate(UseAVX == 0);
 3176   match(Set dst (MulF dst (LoadF src)));
 3177 
 3178   format %{ "mulss   $dst, $src" %}
 3179   ins_cost(150);
 3180   ins_encode %{
 3181     __ mulss($dst$$XMMRegister, $src$$Address);
 3182   %}
 3183   ins_pipe(pipe_slow);
 3184 %}
 3185 
 3186 instruct mulF_imm(regF dst, immF con) %{
 3187   predicate(UseAVX == 0);
 3188   match(Set dst (MulF dst con));
 3189   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3190   ins_cost(150);
 3191   ins_encode %{
 3192     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3193   %}
 3194   ins_pipe(pipe_slow);
 3195 %}
 3196 
 3197 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3198   predicate(UseAVX > 0);
 3199   match(Set dst (MulF src1 src2));
 3200 
 3201   format %{ "vmulss  $dst, $src1, $src2" %}
 3202   ins_cost(150);
 3203   ins_encode %{
 3204     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3205   %}
 3206   ins_pipe(pipe_slow);
 3207 %}
 3208 
 3209 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3210   predicate(UseAVX > 0);
 3211   match(Set dst (MulF src1 (LoadF src2)));
 3212 
 3213   format %{ "vmulss  $dst, $src1, $src2" %}
 3214   ins_cost(150);
 3215   ins_encode %{
 3216     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3217   %}
 3218   ins_pipe(pipe_slow);
 3219 %}
 3220 
 3221 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3222   predicate(UseAVX > 0);
 3223   match(Set dst (MulF src con));
 3224 
 3225   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3226   ins_cost(150);
 3227   ins_encode %{
 3228     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3229   %}
 3230   ins_pipe(pipe_slow);
 3231 %}
 3232 
 3233 instruct mulD_reg(regD dst, regD src) %{
 3234   predicate(UseAVX == 0);
 3235   match(Set dst (MulD dst src));
 3236 
 3237   format %{ "mulsd   $dst, $src" %}
 3238   ins_cost(150);
 3239   ins_encode %{
 3240     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3241   %}
 3242   ins_pipe(pipe_slow);
 3243 %}
 3244 
 3245 instruct mulD_mem(regD dst, memory src) %{
 3246   predicate(UseAVX == 0);
 3247   match(Set dst (MulD dst (LoadD src)));
 3248 
 3249   format %{ "mulsd   $dst, $src" %}
 3250   ins_cost(150);
 3251   ins_encode %{
 3252     __ mulsd($dst$$XMMRegister, $src$$Address);
 3253   %}
 3254   ins_pipe(pipe_slow);
 3255 %}
 3256 
 3257 instruct mulD_imm(regD dst, immD con) %{
 3258   predicate(UseAVX == 0);
 3259   match(Set dst (MulD dst con));
 3260   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3261   ins_cost(150);
 3262   ins_encode %{
 3263     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3264   %}
 3265   ins_pipe(pipe_slow);
 3266 %}
 3267 
 3268 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3269   predicate(UseAVX > 0);
 3270   match(Set dst (MulD src1 src2));
 3271 
 3272   format %{ "vmulsd  $dst, $src1, $src2" %}
 3273   ins_cost(150);
 3274   ins_encode %{
 3275     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3276   %}
 3277   ins_pipe(pipe_slow);
 3278 %}
 3279 
 3280 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3281   predicate(UseAVX > 0);
 3282   match(Set dst (MulD src1 (LoadD src2)));
 3283 
 3284   format %{ "vmulsd  $dst, $src1, $src2" %}
 3285   ins_cost(150);
 3286   ins_encode %{
 3287     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3288   %}
 3289   ins_pipe(pipe_slow);
 3290 %}
 3291 
 3292 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3293   predicate(UseAVX > 0);
 3294   match(Set dst (MulD src con));
 3295 
 3296   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3297   ins_cost(150);
 3298   ins_encode %{
 3299     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3300   %}
 3301   ins_pipe(pipe_slow);
 3302 %}
 3303 
 3304 instruct divF_reg(regF dst, regF src) %{
 3305   predicate(UseAVX == 0);
 3306   match(Set dst (DivF dst src));
 3307 
 3308   format %{ "divss   $dst, $src" %}
 3309   ins_cost(150);
 3310   ins_encode %{
 3311     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3312   %}
 3313   ins_pipe(pipe_slow);
 3314 %}
 3315 
 3316 instruct divF_mem(regF dst, memory src) %{
 3317   predicate(UseAVX == 0);
 3318   match(Set dst (DivF dst (LoadF src)));
 3319 
 3320   format %{ "divss   $dst, $src" %}
 3321   ins_cost(150);
 3322   ins_encode %{
 3323     __ divss($dst$$XMMRegister, $src$$Address);
 3324   %}
 3325   ins_pipe(pipe_slow);
 3326 %}
 3327 
 3328 instruct divF_imm(regF dst, immF con) %{
 3329   predicate(UseAVX == 0);
 3330   match(Set dst (DivF dst con));
 3331   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3332   ins_cost(150);
 3333   ins_encode %{
 3334     __ divss($dst$$XMMRegister, $constantaddress($con));
 3335   %}
 3336   ins_pipe(pipe_slow);
 3337 %}
 3338 
 3339 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3340   predicate(UseAVX > 0);
 3341   match(Set dst (DivF src1 src2));
 3342 
 3343   format %{ "vdivss  $dst, $src1, $src2" %}
 3344   ins_cost(150);
 3345   ins_encode %{
 3346     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3347   %}
 3348   ins_pipe(pipe_slow);
 3349 %}
 3350 
 3351 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3352   predicate(UseAVX > 0);
 3353   match(Set dst (DivF src1 (LoadF src2)));
 3354 
 3355   format %{ "vdivss  $dst, $src1, $src2" %}
 3356   ins_cost(150);
 3357   ins_encode %{
 3358     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3359   %}
 3360   ins_pipe(pipe_slow);
 3361 %}
 3362 
 3363 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3364   predicate(UseAVX > 0);
 3365   match(Set dst (DivF src con));
 3366 
 3367   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3368   ins_cost(150);
 3369   ins_encode %{
 3370     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3371   %}
 3372   ins_pipe(pipe_slow);
 3373 %}
 3374 
 3375 instruct divD_reg(regD dst, regD src) %{
 3376   predicate(UseAVX == 0);
 3377   match(Set dst (DivD dst src));
 3378 
 3379   format %{ "divsd   $dst, $src" %}
 3380   ins_cost(150);
 3381   ins_encode %{
 3382     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3383   %}
 3384   ins_pipe(pipe_slow);
 3385 %}
 3386 
 3387 instruct divD_mem(regD dst, memory src) %{
 3388   predicate(UseAVX == 0);
 3389   match(Set dst (DivD dst (LoadD src)));
 3390 
 3391   format %{ "divsd   $dst, $src" %}
 3392   ins_cost(150);
 3393   ins_encode %{
 3394     __ divsd($dst$$XMMRegister, $src$$Address);
 3395   %}
 3396   ins_pipe(pipe_slow);
 3397 %}
 3398 
 3399 instruct divD_imm(regD dst, immD con) %{
 3400   predicate(UseAVX == 0);
 3401   match(Set dst (DivD dst con));
 3402   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3403   ins_cost(150);
 3404   ins_encode %{
 3405     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3406   %}
 3407   ins_pipe(pipe_slow);
 3408 %}
 3409 
 3410 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3411   predicate(UseAVX > 0);
 3412   match(Set dst (DivD src1 src2));
 3413 
 3414   format %{ "vdivsd  $dst, $src1, $src2" %}
 3415   ins_cost(150);
 3416   ins_encode %{
 3417     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3418   %}
 3419   ins_pipe(pipe_slow);
 3420 %}
 3421 
 3422 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3423   predicate(UseAVX > 0);
 3424   match(Set dst (DivD src1 (LoadD src2)));
 3425 
 3426   format %{ "vdivsd  $dst, $src1, $src2" %}
 3427   ins_cost(150);
 3428   ins_encode %{
 3429     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3430   %}
 3431   ins_pipe(pipe_slow);
 3432 %}
 3433 
 3434 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3435   predicate(UseAVX > 0);
 3436   match(Set dst (DivD src con));
 3437 
 3438   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3439   ins_cost(150);
 3440   ins_encode %{
 3441     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3442   %}
 3443   ins_pipe(pipe_slow);
 3444 %}
 3445 
 3446 instruct absF_reg(regF dst) %{
 3447   predicate(UseAVX == 0);
 3448   match(Set dst (AbsF dst));
 3449   ins_cost(150);
 3450   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3451   ins_encode %{
 3452     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3453   %}
 3454   ins_pipe(pipe_slow);
 3455 %}
 3456 
 3457 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3458   predicate(UseAVX > 0);
 3459   match(Set dst (AbsF src));
 3460   ins_cost(150);
 3461   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3462   ins_encode %{
 3463     int vlen_enc = Assembler::AVX_128bit;
 3464     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3465               ExternalAddress(float_signmask()), vlen_enc);
 3466   %}
 3467   ins_pipe(pipe_slow);
 3468 %}
 3469 
 3470 instruct absD_reg(regD dst) %{
 3471   predicate(UseAVX == 0);
 3472   match(Set dst (AbsD dst));
 3473   ins_cost(150);
 3474   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3475             "# abs double by sign masking" %}
 3476   ins_encode %{
 3477     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3478   %}
 3479   ins_pipe(pipe_slow);
 3480 %}
 3481 
 3482 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3483   predicate(UseAVX > 0);
 3484   match(Set dst (AbsD src));
 3485   ins_cost(150);
 3486   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3487             "# abs double by sign masking" %}
 3488   ins_encode %{
 3489     int vlen_enc = Assembler::AVX_128bit;
 3490     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3491               ExternalAddress(double_signmask()), vlen_enc);
 3492   %}
 3493   ins_pipe(pipe_slow);
 3494 %}
 3495 
 3496 instruct negF_reg(regF dst) %{
 3497   predicate(UseAVX == 0);
 3498   match(Set dst (NegF dst));
 3499   ins_cost(150);
 3500   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3501   ins_encode %{
 3502     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3503   %}
 3504   ins_pipe(pipe_slow);
 3505 %}
 3506 
 3507 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3508   predicate(UseAVX > 0);
 3509   match(Set dst (NegF src));
 3510   ins_cost(150);
 3511   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3512   ins_encode %{
 3513     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3514                  ExternalAddress(float_signflip()));
 3515   %}
 3516   ins_pipe(pipe_slow);
 3517 %}
 3518 
 3519 instruct negD_reg(regD dst) %{
 3520   predicate(UseAVX == 0);
 3521   match(Set dst (NegD dst));
 3522   ins_cost(150);
 3523   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3524             "# neg double by sign flipping" %}
 3525   ins_encode %{
 3526     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3527   %}
 3528   ins_pipe(pipe_slow);
 3529 %}
 3530 
 3531 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3532   predicate(UseAVX > 0);
 3533   match(Set dst (NegD src));
 3534   ins_cost(150);
 3535   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3536             "# neg double by sign flipping" %}
 3537   ins_encode %{
 3538     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3539                  ExternalAddress(double_signflip()));
 3540   %}
 3541   ins_pipe(pipe_slow);
 3542 %}
 3543 
// The sqrtss instruction writes only the low 32 bits of its destination, so the destination
// register should be pre-initialized to avoid a false dependency and get the best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3546 instruct sqrtF_reg(regF dst) %{
 3547   match(Set dst (SqrtF dst));
 3548   format %{ "sqrtss  $dst, $dst" %}
 3549   ins_encode %{
 3550     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3551   %}
 3552   ins_pipe(pipe_slow);
 3553 %}
 3554 
// The sqrtsd instruction writes only the low 64 bits of its destination, so the destination
// register should be pre-initialized to avoid a false dependency and get the best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3557 instruct sqrtD_reg(regD dst) %{
 3558   match(Set dst (SqrtD dst));
 3559   format %{ "sqrtsd  $dst, $dst" %}
 3560   ins_encode %{
 3561     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3562   %}
 3563   ins_pipe(pipe_slow);
 3564 %}
 3565 
 3566 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3567   effect(TEMP tmp);
 3568   match(Set dst (ConvF2HF src));
 3569   ins_cost(125);
 3570   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3571   ins_encode %{
 3572     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3573   %}
 3574   ins_pipe( pipe_slow );
 3575 %}
 3576 
 3577 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3578   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3579   effect(TEMP ktmp, TEMP rtmp);
 3580   match(Set mem (StoreC mem (ConvF2HF src)));
 3581   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3582   ins_encode %{
 3583     __ movl($rtmp$$Register, 0x1);
 3584     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3585     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3586   %}
 3587   ins_pipe( pipe_slow );
 3588 %}
 3589 
 3590 instruct vconvF2HF(vec dst, vec src) %{
 3591   match(Set dst (VectorCastF2HF src));
 3592   format %{ "vector_conv_F2HF $dst $src" %}
 3593   ins_encode %{
 3594     int vlen_enc = vector_length_encoding(this, $src);
 3595     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3596   %}
 3597   ins_pipe( pipe_slow );
 3598 %}
 3599 
 3600 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3601   predicate(n->as_StoreVector()->memory_size() >= 16);
 3602   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3603   format %{ "vcvtps2ph $mem,$src" %}
 3604   ins_encode %{
 3605     int vlen_enc = vector_length_encoding(this, $src);
 3606     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3607   %}
 3608   ins_pipe( pipe_slow );
 3609 %}
 3610 
 3611 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3612   match(Set dst (ConvHF2F src));
 3613   format %{ "vcvtph2ps $dst,$src" %}
 3614   ins_encode %{
 3615     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3616   %}
 3617   ins_pipe( pipe_slow );
 3618 %}
 3619 
 3620 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3621   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3622   format %{ "vcvtph2ps $dst,$mem" %}
 3623   ins_encode %{
 3624     int vlen_enc = vector_length_encoding(this);
 3625     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3626   %}
 3627   ins_pipe( pipe_slow );
 3628 %}
 3629 
 3630 instruct vconvHF2F(vec dst, vec src) %{
 3631   match(Set dst (VectorCastHF2F src));
 3632   ins_cost(125);
 3633   format %{ "vector_conv_HF2F $dst,$src" %}
 3634   ins_encode %{
 3635     int vlen_enc = vector_length_encoding(this);
 3636     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3637   %}
 3638   ins_pipe( pipe_slow );
 3639 %}
 3640 
 3641 // ---------------------------------------- VectorReinterpret ------------------------------------
 3642 instruct reinterpret_mask(kReg dst) %{
 3643   predicate(n->bottom_type()->isa_vectmask() &&
 3644             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3645   match(Set dst (VectorReinterpret dst));
 3646   ins_cost(125);
 3647   format %{ "vector_reinterpret $dst\t!" %}
 3648   ins_encode %{
 3649     // empty
 3650   %}
 3651   ins_pipe( pipe_slow );
 3652 %}
 3653 
 3654 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3655   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3656             n->bottom_type()->isa_vectmask() &&
 3657             n->in(1)->bottom_type()->isa_vectmask() &&
 3658             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3659             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3660   match(Set dst (VectorReinterpret src));
 3661   effect(TEMP xtmp);
 3662   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3663   ins_encode %{
 3664      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3665      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3666      assert(src_sz == dst_sz , "src and dst size mismatch");
 3667      int vlen_enc = vector_length_encoding(src_sz);
 3668      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3669      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3670   %}
 3671   ins_pipe( pipe_slow );
 3672 %}
 3673 
 3674 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3675   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3676             n->bottom_type()->isa_vectmask() &&
 3677             n->in(1)->bottom_type()->isa_vectmask() &&
 3678             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3679              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3680             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3681   match(Set dst (VectorReinterpret src));
 3682   effect(TEMP xtmp);
 3683   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3684   ins_encode %{
 3685      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3686      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3687      assert(src_sz == dst_sz , "src and dst size mismatch");
 3688      int vlen_enc = vector_length_encoding(src_sz);
 3689      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3690      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3691   %}
 3692   ins_pipe( pipe_slow );
 3693 %}
 3694 
 3695 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3696   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3697             n->bottom_type()->isa_vectmask() &&
 3698             n->in(1)->bottom_type()->isa_vectmask() &&
 3699             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3700              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3701             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3702   match(Set dst (VectorReinterpret src));
 3703   effect(TEMP xtmp);
 3704   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3705   ins_encode %{
 3706      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3707      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3708      assert(src_sz == dst_sz , "src and dst size mismatch");
 3709      int vlen_enc = vector_length_encoding(src_sz);
 3710      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3711      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3712   %}
 3713   ins_pipe( pipe_slow );
 3714 %}
 3715 
 3716 instruct reinterpret(vec dst) %{
 3717   predicate(!n->bottom_type()->isa_vectmask() &&
 3718             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3719   match(Set dst (VectorReinterpret dst));
 3720   ins_cost(125);
 3721   format %{ "vector_reinterpret $dst\t!" %}
 3722   ins_encode %{
 3723     // empty
 3724   %}
 3725   ins_pipe( pipe_slow );
 3726 %}
 3727 
 3728 instruct reinterpret_expand(vec dst, vec src) %{
 3729   predicate(UseAVX == 0 &&
 3730             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3731   match(Set dst (VectorReinterpret src));
 3732   ins_cost(125);
 3733   effect(TEMP dst);
 3734   format %{ "vector_reinterpret_expand $dst,$src" %}
 3735   ins_encode %{
 3736     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3737     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3738 
 3739     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3740     if (src_vlen_in_bytes == 4) {
 3741       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3742     } else {
 3743       assert(src_vlen_in_bytes == 8, "");
 3744       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3745     }
 3746     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3747   %}
 3748   ins_pipe( pipe_slow );
 3749 %}
 3750 
 3751 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3752   predicate(UseAVX > 0 &&
 3753             !n->bottom_type()->isa_vectmask() &&
 3754             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3755             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3756   match(Set dst (VectorReinterpret src));
 3757   ins_cost(125);
 3758   format %{ "vector_reinterpret_expand $dst,$src" %}
 3759   ins_encode %{
 3760     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3761   %}
 3762   ins_pipe( pipe_slow );
 3763 %}
 3764 
 3765 
 3766 instruct vreinterpret_expand(legVec dst, vec src) %{
 3767   predicate(UseAVX > 0 &&
 3768             !n->bottom_type()->isa_vectmask() &&
 3769             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3770             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3771   match(Set dst (VectorReinterpret src));
 3772   ins_cost(125);
 3773   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3774   ins_encode %{
 3775     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3776       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3777       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3778       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3779       default: ShouldNotReachHere();
 3780     }
 3781   %}
 3782   ins_pipe( pipe_slow );
 3783 %}
 3784 
 3785 instruct reinterpret_shrink(vec dst, legVec src) %{
 3786   predicate(!n->bottom_type()->isa_vectmask() &&
 3787             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3788   match(Set dst (VectorReinterpret src));
 3789   ins_cost(125);
 3790   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3791   ins_encode %{
 3792     switch (Matcher::vector_length_in_bytes(this)) {
 3793       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3794       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3795       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3796       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3797       default: ShouldNotReachHere();
 3798     }
 3799   %}
 3800   ins_pipe( pipe_slow );
 3801 %}
 3802 
 3803 // ----------------------------------------------------------------------------------------------------
 3804 
 3805 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3806   match(Set dst (RoundDoubleMode src rmode));
 3807   format %{ "roundsd $dst,$src" %}
 3808   ins_cost(150);
 3809   ins_encode %{
 3810     assert(UseSSE >= 4, "required");
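          // roundsd writes only the low 64 bits of dst; zeroing dst first avoids
          // a false dependency on its previous contents when dst != src.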
 3811     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3812       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3813     }
 3814     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3815   %}
 3816   ins_pipe(pipe_slow);
 3817 %}
 3818 
 3819 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3820   match(Set dst (RoundDoubleMode con rmode));
 3821   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3822   ins_cost(150);
 3823   ins_encode %{
 3824     assert(UseSSE >= 4, "required");
 3825     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3826   %}
 3827   ins_pipe(pipe_slow);
 3828 %}
 3829 
 3830 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3831   predicate(Matcher::vector_length(n) < 8);
 3832   match(Set dst (RoundDoubleModeV src rmode));
 3833   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3834   ins_encode %{
 3835     assert(UseAVX > 0, "required");
 3836     int vlen_enc = vector_length_encoding(this);
 3837     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3838   %}
 3839   ins_pipe( pipe_slow );
 3840 %}
 3841 
 3842 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3843   predicate(Matcher::vector_length(n) == 8);
 3844   match(Set dst (RoundDoubleModeV src rmode));
 3845   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3846   ins_encode %{
 3847     assert(UseAVX > 2, "required");
 3848     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3849   %}
 3850   ins_pipe( pipe_slow );
 3851 %}
 3852 
 3853 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3854   predicate(Matcher::vector_length(n) < 8);
 3855   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3856   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3857   ins_encode %{
 3858     assert(UseAVX > 0, "required");
 3859     int vlen_enc = vector_length_encoding(this);
 3860     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3861   %}
 3862   ins_pipe( pipe_slow );
 3863 %}
 3864 
 3865 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3866   predicate(Matcher::vector_length(n) == 8);
 3867   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3868   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3869   ins_encode %{
 3870     assert(UseAVX > 2, "required");
 3871     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3872   %}
 3873   ins_pipe( pipe_slow );
 3874 %}
 3875 
 3876 instruct onspinwait() %{
 3877   match(OnSpinWait);
 3878   ins_cost(200);
 3879 
 3880   format %{
 3881     $$template
 3882     $$emit$$"pause\t! membar_onspinwait"
 3883   %}
 3884   ins_encode %{
 3885     __ pause();
 3886   %}
 3887   ins_pipe(pipe_slow);
 3888 %}
 3889 
 3890 // a * b + c
 3891 instruct fmaD_reg(regD a, regD b, regD c) %{
 3892   match(Set c (FmaD  c (Binary a b)));
 3893   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3894   ins_cost(150);
 3895   ins_encode %{
 3896     assert(UseFMA, "Needs FMA instructions support.");
 3897     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3898   %}
 3899   ins_pipe( pipe_slow );
 3900 %}
 3901 
 3902 // a * b + c
 3903 instruct fmaF_reg(regF a, regF b, regF c) %{
 3904   match(Set c (FmaF  c (Binary a b)));
 3905   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3906   ins_cost(150);
 3907   ins_encode %{
 3908     assert(UseFMA, "Needs FMA instructions support.");
 3909     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3910   %}
 3911   ins_pipe( pipe_slow );
 3912 %}
 3913 
 3914 // ====================VECTOR INSTRUCTIONS=====================================
 3915 
 3916 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3917 instruct MoveVec2Leg(legVec dst, vec src) %{
 3918   match(Set dst src);
 3919   format %{ "" %}
 3920   ins_encode %{
 3921     ShouldNotReachHere();
 3922   %}
 3923   ins_pipe( fpu_reg_reg );
 3924 %}
 3925 
 3926 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3927   match(Set dst src);
 3928   format %{ "" %}
 3929   ins_encode %{
 3930     ShouldNotReachHere();
 3931   %}
 3932   ins_pipe( fpu_reg_reg );
 3933 %}
 3934 
 3935 // ============================================================================
 3936 
 3937 // Load vectors generic operand pattern
 3938 instruct loadV(vec dst, memory mem) %{
 3939   match(Set dst (LoadVector mem));
 3940   ins_cost(125);
 3941   format %{ "load_vector $dst,$mem" %}
 3942   ins_encode %{
 3943     BasicType bt = Matcher::vector_element_basic_type(this);
 3944     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3945   %}
 3946   ins_pipe( pipe_slow );
 3947 %}
 3948 
 3949 // Store vectors generic operand pattern.
 3950 instruct storeV(memory mem, vec src) %{
 3951   match(Set mem (StoreVector mem src));
 3952   ins_cost(145);
 3953   format %{ "store_vector $mem,$src\n\t" %}
 3954   ins_encode %{
 3955     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3956       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3957       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3958       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3959       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3960       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3961       default: ShouldNotReachHere();
 3962     }
 3963   %}
 3964   ins_pipe( pipe_slow );
 3965 %}
 3966 
 3967 // ---------------------------------------- Gather ------------------------------------
 3968 
 3969 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 3970 
 3971 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 3972   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 3973             Matcher::vector_length_in_bytes(n) <= 32);
 3974   match(Set dst (LoadVectorGather mem idx));
 3975   effect(TEMP dst, TEMP tmp, TEMP mask);
 3976   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 3977   ins_encode %{
 3978     int vlen_enc = vector_length_encoding(this);
 3979     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3980     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
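          // The AVX2 gather needs an explicit element mask: vpcmpeqd of a register
          // with itself produces all ones, i.e. every lane active.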
 3981     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3982     __ lea($tmp$$Register, $mem$$Address);
 3983     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3984   %}
 3985   ins_pipe( pipe_slow );
 3986 %}
 3987 
 3988 
 3989 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 3990   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 3991             !is_subword_type(Matcher::vector_element_basic_type(n)));
 3992   match(Set dst (LoadVectorGather mem idx));
 3993   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 3994   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 3995   ins_encode %{
 3996     int vlen_enc = vector_length_encoding(this);
 3997     BasicType elem_bt = Matcher::vector_element_basic_type(this);
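          // kxnor of an opmask with itself yields all ones, selecting every lane.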
 3998     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 3999     __ lea($tmp$$Register, $mem$$Address);
 4000     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4001   %}
 4002   ins_pipe( pipe_slow );
 4003 %}
 4004 
 4005 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4006   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4007             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4008   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4009   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4010   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4011   ins_encode %{
 4012     assert(UseAVX > 2, "sanity");
 4013     int vlen_enc = vector_length_encoding(this);
 4014     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4015     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4016     // Note: the gather instruction partially updates the opmask register used
 4017     // for predication, hence the mask operand is first copied to a temporary.
 4018     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4019     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4020     __ lea($tmp$$Register, $mem$$Address);
 4021     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4022   %}
 4023   ins_pipe( pipe_slow );
 4024 %}
 4025 
 4026 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4027   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4028   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4029   effect(TEMP tmp, TEMP rtmp);
 4030   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4031   ins_encode %{
 4032     int vlen_enc = vector_length_encoding(this);
 4033     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4034     __ lea($tmp$$Register, $mem$$Address);
 4035     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4036   %}
 4037   ins_pipe( pipe_slow );
 4038 %}
 4039 
 4040 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4041                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4042   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4043   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4044   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4045   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4046   ins_encode %{
 4047     int vlen_enc = vector_length_encoding(this);
 4048     int vector_len = Matcher::vector_length(this);
 4049     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4050     __ lea($tmp$$Register, $mem$$Address);
 4051     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4052     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4053                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4054   %}
 4055   ins_pipe( pipe_slow );
 4056 %}
 4057 
 4058 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4059   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4060   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4061   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4062   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4063   ins_encode %{
 4064     int vlen_enc = vector_length_encoding(this);
 4065     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4066     __ lea($tmp$$Register, $mem$$Address);
 4067     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4068   %}
 4069   ins_pipe( pipe_slow );
 4070 %}
 4071 
 4072 
 4073 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4074                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4075   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4076   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4077   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4078   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4079   ins_encode %{
 4080     int vlen_enc = vector_length_encoding(this);
 4081     int vector_len = Matcher::vector_length(this);
 4082     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4083     __ lea($tmp$$Register, $mem$$Address);
 4084     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4085     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4086                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4087   %}
 4088   ins_pipe( pipe_slow );
 4089 %}
 4090 
 4091 
 4092 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4093   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4094   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4095   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4096   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4097   ins_encode %{
 4098     int vlen_enc = vector_length_encoding(this);
 4099     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4100     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4101     __ lea($tmp$$Register, $mem$$Address);
 4102     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4103     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4104   %}
 4105   ins_pipe( pipe_slow );
 4106 %}
 4107 
 4108 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4109                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4110   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4111   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4112   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4113   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4114   ins_encode %{
 4115     int vlen_enc = vector_length_encoding(this);
 4116     int vector_len = Matcher::vector_length(this);
 4117     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4118     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4119     __ lea($tmp$$Register, $mem$$Address);
 4120     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4121     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4122     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4123                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4124   %}
 4125   ins_pipe( pipe_slow );
 4126 %}
 4127 
 4128 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4129   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4130   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4131   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4132   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4133   ins_encode %{
 4134     int vlen_enc = vector_length_encoding(this);
 4135     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4136     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4137     __ lea($tmp$$Register, $mem$$Address);
 4138     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4139     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4140                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4141   %}
 4142   ins_pipe( pipe_slow );
 4143 %}
 4144 
 4145 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4146                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4147   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4148   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4149   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4150   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4151   ins_encode %{
 4152     int vlen_enc = vector_length_encoding(this);
 4153     int vector_len = Matcher::vector_length(this);
 4154     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4155     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4156     __ lea($tmp$$Register, $mem$$Address);
 4157     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4158     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4159     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4160                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4161   %}
 4162   ins_pipe( pipe_slow );
 4163 %}
 4164 
 4165 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4166   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4167   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4168   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4169   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4170   ins_encode %{
 4171     int vlen_enc = vector_length_encoding(this);
 4172     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4173     __ lea($tmp$$Register, $mem$$Address);
 4174     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
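          // vpmovmskb extracts one bit per byte; a short mask lane contributes two
          // identical bits, so compress the even bits (pattern 0x55555555) into
          // one bit per element with pext.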
 4175     if (elem_bt == T_SHORT) {
 4176       __ movl($mask_idx$$Register, 0x55555555);
 4177       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4178     }
 4179     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4180     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4181   %}
 4182   ins_pipe( pipe_slow );
 4183 %}
 4184 
 4185 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4186                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4187   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4188   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4189   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4190   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4191   ins_encode %{
 4192     int vlen_enc = vector_length_encoding(this);
 4193     int vector_len = Matcher::vector_length(this);
 4194     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4195     __ lea($tmp$$Register, $mem$$Address);
 4196     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4197     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4198     if (elem_bt == T_SHORT) {
 4199       __ movl($mask_idx$$Register, 0x55555555);
 4200       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4201     }
 4202     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4203     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4204                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4205   %}
 4206   ins_pipe( pipe_slow );
 4207 %}
 4208 
 4209 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4210   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4211   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4212   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4213   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4214   ins_encode %{
 4215     int vlen_enc = vector_length_encoding(this);
 4216     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4217     __ lea($tmp$$Register, $mem$$Address);
 4218     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4219     if (elem_bt == T_SHORT) {
 4220       __ movl($mask_idx$$Register, 0x55555555);
 4221       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4222     }
 4223     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4224     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4225                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4226   %}
 4227   ins_pipe( pipe_slow );
 4228 %}
 4229 
 4230 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4231                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4232   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4233   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4234   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4235   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4236   ins_encode %{
 4237     int vlen_enc = vector_length_encoding(this);
 4238     int vector_len = Matcher::vector_length(this);
 4239     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4241     __ lea($tmp$$Register, $mem$$Address);
 4242     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4243     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4244     if (elem_bt == T_SHORT) {
 4245       __ movl($mask_idx$$Register, 0x55555555);
 4246       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4247     }
 4248     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4249     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4250                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4251   %}
 4252   ins_pipe( pipe_slow );
 4253 %}
 4254 
 4255 // ====================Scatter=======================================
 4256 
 4257 // Scatter INT, LONG, FLOAT, DOUBLE
 4258 
 4259 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4260   predicate(UseAVX > 2);
 4261   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4262   effect(TEMP tmp, TEMP ktmp);
 4263   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4264   ins_encode %{
 4265     int vlen_enc = vector_length_encoding(this, $src);
 4266     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4267 
 4268     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4269     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4270 
 4271     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4272     __ lea($tmp$$Register, $mem$$Address);
 4273     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4274   %}
 4275   ins_pipe( pipe_slow );
 4276 %}
 4277 
 4278 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4279   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4280   effect(TEMP tmp, TEMP ktmp);
 4281   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4282   ins_encode %{
 4283     int vlen_enc = vector_length_encoding(this, $src);
 4284     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4285     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4286     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4287     // Note: the scatter instruction partially updates the opmask register used
 4288     // for predication, hence the mask operand is first copied to a temporary.
 4289     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4290     __ lea($tmp$$Register, $mem$$Address);
 4291     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4292   %}
 4293   ins_pipe( pipe_slow );
 4294 %}
 4295 
 4296 // ====================REPLICATE=======================================
 4297 
 4298 // Replicate byte scalar to be vector
 4299 instruct vReplB_reg(vec dst, rRegI src) %{
 4300   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4301   match(Set dst (Replicate src));
 4302   format %{ "replicateB $dst,$src" %}
 4303   ins_encode %{
 4304     uint vlen = Matcher::vector_length(this);
 4305     if (UseAVX >= 2) {
 4306       int vlen_enc = vector_length_encoding(this);
 4307       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4308         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4309         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4310       } else {
 4311         __ movdl($dst$$XMMRegister, $src$$Register);
 4312         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4313       }
 4314     } else {
 4315       assert(UseAVX < 2, "");
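            // SSE fallback: duplicate the byte into both halves of a word
            // (punpcklbw), broadcast that word across the low quadword (pshuflw),
            // then mirror it into the high quadword for 16-byte vectors.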
 4316       __ movdl($dst$$XMMRegister, $src$$Register);
 4317       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4318       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4319       if (vlen >= 16) {
 4320         assert(vlen == 16, "");
 4321         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4322       }
 4323     }
 4324   %}
 4325   ins_pipe( pipe_slow );
 4326 %}
 4327 
 4328 instruct ReplB_mem(vec dst, memory mem) %{
 4329   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4330   match(Set dst (Replicate (LoadB mem)));
 4331   format %{ "replicateB $dst,$mem" %}
 4332   ins_encode %{
 4333     int vlen_enc = vector_length_encoding(this);
 4334     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4335   %}
 4336   ins_pipe( pipe_slow );
 4337 %}
 4338 
 4339 // ====================ReplicateS=======================================
 4340 
 4341 instruct vReplS_reg(vec dst, rRegI src) %{
 4342   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4343   match(Set dst (Replicate src));
 4344   format %{ "replicateS $dst,$src" %}
 4345   ins_encode %{
 4346     uint vlen = Matcher::vector_length(this);
 4347     int vlen_enc = vector_length_encoding(this);
 4348     if (UseAVX >= 2) {
 4349       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4350         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4351         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4352       } else {
 4353         __ movdl($dst$$XMMRegister, $src$$Register);
 4354         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4355       }
 4356     } else {
 4357       assert(UseAVX < 2, "");
 4358       __ movdl($dst$$XMMRegister, $src$$Register);
 4359       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4360       if (vlen >= 8) {
 4361         assert(vlen == 8, "");
 4362         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4363       }
 4364     }
 4365   %}
 4366   ins_pipe( pipe_slow );
 4367 %}
 4368 
 4369 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4370   match(Set dst (Replicate con));
 4371   effect(TEMP rtmp);
 4372   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4373   ins_encode %{
 4374     int vlen_enc = vector_length_encoding(this);
 4375     BasicType bt = Matcher::vector_element_basic_type(this);
 4376     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4377     __ movl($rtmp$$Register, $con$$constant);
 4378     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4379   %}
 4380   ins_pipe( pipe_slow );
 4381 %}
 4382 
 4383 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4384   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4385   match(Set dst (Replicate src));
 4386   effect(TEMP rtmp);
 4387   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4388   ins_encode %{
 4389     int vlen_enc = vector_length_encoding(this);
 4390     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4391     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4392   %}
 4393   ins_pipe( pipe_slow );
 4394 %}
 4395 
 4396 instruct ReplS_mem(vec dst, memory mem) %{
 4397   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4398   match(Set dst (Replicate (LoadS mem)));
 4399   format %{ "replicateS $dst,$mem" %}
 4400   ins_encode %{
 4401     int vlen_enc = vector_length_encoding(this);
 4402     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4403   %}
 4404   ins_pipe( pipe_slow );
 4405 %}
 4406 
 4407 // ====================ReplicateI=======================================
 4408 
 4409 instruct ReplI_reg(vec dst, rRegI src) %{
 4410   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4411   match(Set dst (Replicate src));
 4412   format %{ "replicateI $dst,$src" %}
 4413   ins_encode %{
 4414     uint vlen = Matcher::vector_length(this);
 4415     int vlen_enc = vector_length_encoding(this);
 4416     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4417       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4418     } else if (VM_Version::supports_avx2()) {
 4419       __ movdl($dst$$XMMRegister, $src$$Register);
 4420       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4421     } else {
 4422       __ movdl($dst$$XMMRegister, $src$$Register);
 4423       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4424     }
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 instruct ReplI_mem(vec dst, memory mem) %{
 4430   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4431   match(Set dst (Replicate (LoadI mem)));
 4432   format %{ "replicateI $dst,$mem" %}
 4433   ins_encode %{
 4434     int vlen_enc = vector_length_encoding(this);
 4435     if (VM_Version::supports_avx2()) {
 4436       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4437     } else if (VM_Version::supports_avx()) {
 4438       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4439     } else {
 4440       __ movdl($dst$$XMMRegister, $mem$$Address);
 4441       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4442     }
 4443   %}
 4444   ins_pipe( pipe_slow );
 4445 %}
 4446 
 4447 instruct ReplI_imm(vec dst, immI con) %{
 4448   predicate(Matcher::is_non_long_integral_vector(n));
 4449   match(Set dst (Replicate con));
 4450   format %{ "replicateI $dst,$con" %}
 4451   ins_encode %{
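          // Replicate the immediate into the constant table just enough times to
          // fill the smallest chunk the available broadcast form can load (4 bytes
          // with AVX, 8 with SSE3, otherwise 16), then expand it to the full
          // vector length with load_constant_vector.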
 4452     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4453                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4454                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4455     BasicType bt = Matcher::vector_element_basic_type(this);
 4456     int vlen = Matcher::vector_length_in_bytes(this);
 4457     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 // Replicate scalar zero to be vector
 4463 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4464   predicate(Matcher::is_non_long_integral_vector(n));
 4465   match(Set dst (Replicate zero));
 4466   format %{ "replicateI $dst,$zero" %}
 4467   ins_encode %{
 4468     int vlen_enc = vector_length_encoding(this);
 4469     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4470       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4471     } else {
 4472       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4473     }
 4474   %}
 4475   ins_pipe( fpu_reg_reg );
 4476 %}
 4477 
 4478 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4479   predicate(Matcher::is_non_long_integral_vector(n));
 4480   match(Set dst (Replicate con));
 4481   format %{ "vallones $dst" %}
 4482   ins_encode %{
 4483     int vector_len = vector_length_encoding(this);
 4484     __ vallones($dst$$XMMRegister, vector_len);
 4485   %}
 4486   ins_pipe( pipe_slow );
 4487 %}
 4488 
 4489 // ====================ReplicateL=======================================
 4490 
 4491 // Replicate long (8 byte) scalar to be vector
 4492 instruct ReplL_reg(vec dst, rRegL src) %{
 4493   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4494   match(Set dst (Replicate src));
 4495   format %{ "replicateL $dst,$src" %}
 4496   ins_encode %{
 4497     int vlen = Matcher::vector_length(this);
 4498     int vlen_enc = vector_length_encoding(this);
 4499     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4500       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4501     } else if (VM_Version::supports_avx2()) {
 4502       __ movdq($dst$$XMMRegister, $src$$Register);
 4503       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4504     } else {
 4505       __ movdq($dst$$XMMRegister, $src$$Register);
 4506       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4507     }
 4508   %}
 4509   ins_pipe( pipe_slow );
 4510 %}
 4511 
 4512 instruct ReplL_mem(vec dst, memory mem) %{
 4513   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4514   match(Set dst (Replicate (LoadL mem)));
 4515   format %{ "replicateL $dst,$mem" %}
 4516   ins_encode %{
 4517     int vlen_enc = vector_length_encoding(this);
 4518     if (VM_Version::supports_avx2()) {
 4519       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4520     } else if (VM_Version::supports_sse3()) {
 4521       __ movddup($dst$$XMMRegister, $mem$$Address);
 4522     } else {
 4523       __ movq($dst$$XMMRegister, $mem$$Address);
 4524       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4525     }
 4526   %}
 4527   ins_pipe( pipe_slow );
 4528 %}
 4529 
 4530 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4531 instruct ReplL_imm(vec dst, immL con) %{
 4532   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4533   match(Set dst (Replicate con));
 4534   format %{ "replicateL $dst,$con" %}
 4535   ins_encode %{
 4536     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4537     int vlen = Matcher::vector_length_in_bytes(this);
 4538     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4539   %}
 4540   ins_pipe( pipe_slow );
 4541 %}
 4542 
 4543 instruct ReplL_zero(vec dst, immL0 zero) %{
 4544   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4545   match(Set dst (Replicate zero));
 4546   format %{ "replicateL $dst,$zero" %}
 4547   ins_encode %{
 4548     int vlen_enc = vector_length_encoding(this);
 4549     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4550       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4551     } else {
 4552       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4553     }
 4554   %}
 4555   ins_pipe( fpu_reg_reg );
 4556 %}
 4557 
 4558 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4559   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4560   match(Set dst (Replicate con));
 4561   format %{ "vallones $dst" %}
 4562   ins_encode %{
 4563     int vector_len = vector_length_encoding(this);
 4564     __ vallones($dst$$XMMRegister, vector_len);
 4565   %}
 4566   ins_pipe( pipe_slow );
 4567 %}
 4568 
 4569 // ====================ReplicateF=======================================
 4570 
 4571 instruct vReplF_reg(vec dst, vlRegF src) %{
 4572   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4573   match(Set dst (Replicate src));
 4574   format %{ "replicateF $dst,$src" %}
 4575   ins_encode %{
 4576     uint vlen = Matcher::vector_length(this);
 4577     int vlen_enc = vector_length_encoding(this);
 4578     if (vlen <= 4) {
 4579       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4580     } else if (VM_Version::supports_avx2()) {
 4581       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4582     } else {
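            // No AVX2 reg-to-reg broadcast: splat within the low 128 bits, then
            // copy that half into the upper 128 bits of the 256-bit register.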
 4583       assert(vlen == 8, "sanity");
 4584       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4585       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4586     }
 4587   %}
 4588   ins_pipe( pipe_slow );
 4589 %}
 4590 
 4591 instruct ReplF_reg(vec dst, vlRegF src) %{
 4592   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4593   match(Set dst (Replicate src));
 4594   format %{ "replicateF $dst,$src" %}
 4595   ins_encode %{
 4596     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4597   %}
 4598   ins_pipe( pipe_slow );
 4599 %}
 4600 
 4601 instruct ReplF_mem(vec dst, memory mem) %{
 4602   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4603   match(Set dst (Replicate (LoadF mem)));
 4604   format %{ "replicateF $dst,$mem" %}
 4605   ins_encode %{
 4606     int vlen_enc = vector_length_encoding(this);
 4607     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4608   %}
 4609   ins_pipe( pipe_slow );
 4610 %}
 4611 
 4612 // Replicate float scalar immediate to be vector by loading from const table.
 4613 instruct ReplF_imm(vec dst, immF con) %{
 4614   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4615   match(Set dst (Replicate con));
 4616   format %{ "replicateF $dst,$con" %}
 4617   ins_encode %{
 4618     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4619                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4620     int vlen = Matcher::vector_length_in_bytes(this);
 4621     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4622   %}
 4623   ins_pipe( pipe_slow );
 4624 %}
 4625 
 4626 instruct ReplF_zero(vec dst, immF0 zero) %{
 4627   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4628   match(Set dst (Replicate zero));
 4629   format %{ "replicateF $dst,$zero" %}
 4630   ins_encode %{
 4631     int vlen_enc = vector_length_encoding(this);
 4632     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4633       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4634     } else {
 4635       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4636     }
 4637   %}
 4638   ins_pipe( fpu_reg_reg );
 4639 %}
 4640 
 4641 // ====================ReplicateD=======================================
 4642 
 4643 // Replicate double (8 bytes) scalar to be vector
 4644 instruct vReplD_reg(vec dst, vlRegD src) %{
 4645   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4646   match(Set dst (Replicate src));
 4647   format %{ "replicateD $dst,$src" %}
 4648   ins_encode %{
 4649     uint vlen = Matcher::vector_length(this);
 4650     int vlen_enc = vector_length_encoding(this);
 4651     if (vlen <= 2) {
 4652       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4653     } else if (VM_Version::supports_avx2()) {
 4654       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4655     } else {
 4656       assert(vlen == 4, "sanity");
 4657       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4658       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4659     }
 4660   %}
 4661   ins_pipe( pipe_slow );
 4662 %}
 4663 
 4664 instruct ReplD_reg(vec dst, vlRegD src) %{
 4665   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4666   match(Set dst (Replicate src));
 4667   format %{ "replicateD $dst,$src" %}
 4668   ins_encode %{
 4669     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4670   %}
 4671   ins_pipe( pipe_slow );
 4672 %}
 4673 
 4674 instruct ReplD_mem(vec dst, memory mem) %{
 4675   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4676   match(Set dst (Replicate (LoadD mem)));
 4677   format %{ "replicateD $dst,$mem" %}
 4678   ins_encode %{
 4679     if (Matcher::vector_length(this) >= 4) {
 4680       int vlen_enc = vector_length_encoding(this);
 4681       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4682     } else {
 4683       __ movddup($dst$$XMMRegister, $mem$$Address);
 4684     }
 4685   %}
 4686   ins_pipe( pipe_slow );
 4687 %}
 4688 
 4689 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4690 instruct ReplD_imm(vec dst, immD con) %{
 4691   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4692   match(Set dst (Replicate con));
 4693   format %{ "replicateD $dst,$con" %}
 4694   ins_encode %{
 4695     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4696     int vlen = Matcher::vector_length_in_bytes(this);
 4697     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4698   %}
 4699   ins_pipe( pipe_slow );
 4700 %}
 4701 
 4702 instruct ReplD_zero(vec dst, immD0 zero) %{
 4703   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4704   match(Set dst (Replicate zero));
 4705   format %{ "replicateD $dst,$zero" %}
 4706   ins_encode %{
 4707     int vlen_enc = vector_length_encoding(this);
 4708     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4709       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4710     } else {
 4711       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4712     }
 4713   %}
 4714   ins_pipe( fpu_reg_reg );
 4715 %}
 4716 
 4717 // ====================VECTOR INSERT=======================================
 4718 
 4719 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4720   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4721   match(Set dst (VectorInsert (Binary dst val) idx));
 4722   format %{ "vector_insert $dst,$val,$idx" %}
 4723   ins_encode %{
 4724     assert(UseSSE >= 4, "required");
 4725     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4726 
 4727     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4728 
 4729     assert(is_integral_type(elem_bt), "");
 4730     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4731 
 4732     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4733   %}
 4734   ins_pipe( pipe_slow );
 4735 %}
 4736 
 4737 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4738   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4739   match(Set dst (VectorInsert (Binary src val) idx));
 4740   effect(TEMP vtmp);
 4741   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4742   ins_encode %{
 4743     int vlen_enc = Assembler::AVX_256bit;
 4744     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4745     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4746     int log2epr = log2(elem_per_lane);
 4747 
 4748     assert(is_integral_type(elem_bt), "sanity");
 4749     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4750 
 4751     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4752     uint y_idx = ($idx$$constant >> log2epr) & 1;
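          // Treat the 256-bit vector as two 128-bit lanes: y_idx selects the lane,
          // x_idx the element within it. Extract the lane, insert the scalar, then
          // write the lane back.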
 4753     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4754     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4755     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4756   %}
 4757   ins_pipe( pipe_slow );
 4758 %}
 4759 
 4760 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4761   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4762   match(Set dst (VectorInsert (Binary src val) idx));
 4763   effect(TEMP vtmp);
 4764   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4765   ins_encode %{
 4766     assert(UseAVX > 2, "sanity");
 4767 
 4768     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4769     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4770     int log2epr = log2(elem_per_lane);
 4771 
 4772     assert(is_integral_type(elem_bt), "");
 4773     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4774 
 4775     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4776     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4777     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4778     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4779     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4780   %}
 4781   ins_pipe( pipe_slow );
 4782 %}
 4783 
 4784 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4785   predicate(Matcher::vector_length(n) == 2);
 4786   match(Set dst (VectorInsert (Binary dst val) idx));
 4787   format %{ "vector_insert $dst,$val,$idx" %}
 4788   ins_encode %{
 4789     assert(UseSSE >= 4, "required");
 4790     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4791     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4792 
 4793     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4794   %}
 4795   ins_pipe( pipe_slow );
 4796 %}
 4797 
 4798 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4799   predicate(Matcher::vector_length(n) == 4);
 4800   match(Set dst (VectorInsert (Binary src val) idx));
 4801   effect(TEMP vtmp);
 4802   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4803   ins_encode %{
 4804     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4805     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4806 
 4807     uint x_idx = $idx$$constant & right_n_bits(1);
 4808     uint y_idx = ($idx$$constant >> 1) & 1;
 4809     int vlen_enc = Assembler::AVX_256bit;
 4810     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4811     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4812     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4813   %}
 4814   ins_pipe( pipe_slow );
 4815 %}
 4816 
 4817 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4818   predicate(Matcher::vector_length(n) == 8);
 4819   match(Set dst (VectorInsert (Binary src val) idx));
 4820   effect(TEMP vtmp);
 4821   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4822   ins_encode %{
 4823     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4824     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4825 
 4826     uint x_idx = $idx$$constant & right_n_bits(1);
 4827     uint y_idx = ($idx$$constant >> 1) & 3;
 4828     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4829     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4830     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4831   %}
 4832   ins_pipe( pipe_slow );
 4833 %}
 4834 
 4835 instruct insertF(vec dst, regF val, immU8 idx) %{
 4836   predicate(Matcher::vector_length(n) < 8);
 4837   match(Set dst (VectorInsert (Binary dst val) idx));
 4838   format %{ "vector_insert $dst,$val,$idx" %}
 4839   ins_encode %{
 4840     assert(UseSSE >= 4, "sanity");
 4841 
 4842     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4843     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4844 
 4845     uint x_idx = $idx$$constant & right_n_bits(2);
 4846     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4847   %}
 4848   ins_pipe( pipe_slow );
 4849 %}
 4850 
 4851 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4852   predicate(Matcher::vector_length(n) >= 8);
 4853   match(Set dst (VectorInsert (Binary src val) idx));
 4854   effect(TEMP vtmp);
 4855   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4856   ins_encode %{
 4857     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4858     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4859 
 4860     int vlen = Matcher::vector_length(this);
 4861     uint x_idx = $idx$$constant & right_n_bits(2);
 4862     if (vlen == 8) {
 4863       uint y_idx = ($idx$$constant >> 2) & 1;
 4864       int vlen_enc = Assembler::AVX_256bit;
 4865       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4866       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4867       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4868     } else {
 4869       assert(vlen == 16, "sanity");
 4870       uint y_idx = ($idx$$constant >> 2) & 3;
 4871       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4872       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4873       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4874     }
 4875   %}
 4876   ins_pipe( pipe_slow );
 4877 %}
 4878 
 4879 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4880   predicate(Matcher::vector_length(n) == 2);
 4881   match(Set dst (VectorInsert (Binary dst val) idx));
 4882   effect(TEMP tmp);
 4883   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4884   ins_encode %{
 4885     assert(UseSSE >= 4, "sanity");
 4886     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4887     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4888 
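          // There is no direct double element-insert instruction, so move the raw
          // bits through a general register and place them with pinsrq.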
 4889     __ movq($tmp$$Register, $val$$XMMRegister);
 4890     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4891   %}
 4892   ins_pipe( pipe_slow );
 4893 %}
 4894 
 4895 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4896   predicate(Matcher::vector_length(n) == 4);
 4897   match(Set dst (VectorInsert (Binary src val) idx));
 4898   effect(TEMP vtmp, TEMP tmp);
 4899   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4900   ins_encode %{
 4901     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4902     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4903 
 4904     uint x_idx = $idx$$constant & right_n_bits(1);
 4905     uint y_idx = ($idx$$constant >> 1) & 1;
 4906     int vlen_enc = Assembler::AVX_256bit;
 4907     __ movq($tmp$$Register, $val$$XMMRegister);
 4908     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4909     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4910     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4911   %}
 4912   ins_pipe( pipe_slow );
 4913 %}
 4914 
 4915 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4916   predicate(Matcher::vector_length(n) == 8);
 4917   match(Set dst (VectorInsert (Binary src val) idx));
 4918   effect(TEMP tmp, TEMP vtmp);
 4919   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4920   ins_encode %{
 4921     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4922     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4923 
 4924     uint x_idx = $idx$$constant & right_n_bits(1);
 4925     uint y_idx = ($idx$$constant >> 1) & 3;
 4926     __ movq($tmp$$Register, $val$$XMMRegister);
 4927     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4928     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4929     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4930   %}
 4931   ins_pipe( pipe_slow );
 4932 %}
 4933 
 4934 // ====================REDUCTION ARITHMETIC=======================================
 4935 
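// Reduction nodes fold all lanes of the vector input with the node's operation and
// combine the result with the scalar input, roughly:
//   dst = src1 OP (src2[0] OP src2[1] OP ... OP src2[vlen-1])
// The lane folding itself is done by macro-assembler helpers (reduceI, reduceL,
// reduce_fp, ...), typically by repeatedly combining the upper half of the vector with
// the lower half until a single element remains.
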
 4936 // =======================Int Reduction==========================================
 4937 
 4938 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4939   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4940   match(Set dst (AddReductionVI src1 src2));
 4941   match(Set dst (MulReductionVI src1 src2));
 4942   match(Set dst (AndReductionV  src1 src2));
 4943   match(Set dst ( OrReductionV  src1 src2));
 4944   match(Set dst (XorReductionV  src1 src2));
 4945   match(Set dst (MinReductionV  src1 src2));
 4946   match(Set dst (MaxReductionV  src1 src2));
 4947   effect(TEMP vtmp1, TEMP vtmp2);
 4948   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4949   ins_encode %{
 4950     int opcode = this->ideal_Opcode();
 4951     int vlen = Matcher::vector_length(this, $src2);
 4952     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4953   %}
 4954   ins_pipe( pipe_slow );
 4955 %}
 4956 
 4957 // =======================Long Reduction==========================================
 4958 
 4959 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4960   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4961   match(Set dst (AddReductionVL src1 src2));
 4962   match(Set dst (MulReductionVL src1 src2));
 4963   match(Set dst (AndReductionV  src1 src2));
 4964   match(Set dst ( OrReductionV  src1 src2));
 4965   match(Set dst (XorReductionV  src1 src2));
 4966   match(Set dst (MinReductionV  src1 src2));
 4967   match(Set dst (MaxReductionV  src1 src2));
 4968   effect(TEMP vtmp1, TEMP vtmp2);
 4969   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4970   ins_encode %{
 4971     int opcode = this->ideal_Opcode();
 4972     int vlen = Matcher::vector_length(this, $src2);
 4973     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4974   %}
 4975   ins_pipe( pipe_slow );
 4976 %}
 4977 
 4978 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4979   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4980   match(Set dst (AddReductionVL src1 src2));
 4981   match(Set dst (MulReductionVL src1 src2));
 4982   match(Set dst (AndReductionV  src1 src2));
 4983   match(Set dst ( OrReductionV  src1 src2));
 4984   match(Set dst (XorReductionV  src1 src2));
 4985   match(Set dst (MinReductionV  src1 src2));
 4986   match(Set dst (MaxReductionV  src1 src2));
 4987   effect(TEMP vtmp1, TEMP vtmp2);
 4988   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4989   ins_encode %{
 4990     int opcode = this->ideal_Opcode();
 4991     int vlen = Matcher::vector_length(this, $src2);
 4992     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4993   %}
 4994   ins_pipe( pipe_slow );
 4995 %}
 4996 
 4997 // =======================Float Reduction==========================================
 4998 
 4999 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5000   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5001   match(Set dst (AddReductionVF dst src));
 5002   match(Set dst (MulReductionVF dst src));
 5003   effect(TEMP dst, TEMP vtmp);
 5004   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5005   ins_encode %{
 5006     int opcode = this->ideal_Opcode();
 5007     int vlen = Matcher::vector_length(this, $src);
 5008     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5009   %}
 5010   ins_pipe( pipe_slow );
 5011 %}
 5012 
 5013 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5014   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5015   match(Set dst (AddReductionVF dst src));
 5016   match(Set dst (MulReductionVF dst src));
 5017   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5018   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5019   ins_encode %{
 5020     int opcode = this->ideal_Opcode();
 5021     int vlen = Matcher::vector_length(this, $src);
 5022     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5023   %}
 5024   ins_pipe( pipe_slow );
 5025 %}
 5026 
 5027 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5028   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5029   match(Set dst (AddReductionVF dst src));
 5030   match(Set dst (MulReductionVF dst src));
 5031   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5032   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5033   ins_encode %{
 5034     int opcode = this->ideal_Opcode();
 5035     int vlen = Matcher::vector_length(this, $src);
 5036     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5037   %}
 5038   ins_pipe( pipe_slow );
 5039 %}
 5040 
 5041 
 5042 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5043   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5044   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5045   // src1 contains reduction identity
 5046   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5047   match(Set dst (AddReductionVF src1 src2));
 5048   match(Set dst (MulReductionVF src1 src2));
 5049   effect(TEMP dst);
 5050   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5051   ins_encode %{
 5052     int opcode = this->ideal_Opcode();
 5053     int vlen = Matcher::vector_length(this, $src2);
 5054     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5055   %}
 5056   ins_pipe( pipe_slow );
 5057 %}
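// Note on the "reduction identity" above: for these unordered rules the scalar input is
// expected to be the identity of the operation (e.g. zero for add, one for mul), so the
// generated code ignores it and is free to reassociate the lanes, e.g. computing
// (a0 + a1) + (a2 + a3) rather than ((a0 + a1) + a2) + a3. The strictly ordered rules
// above instead accumulate into dst lane by lane to preserve sequential floating-point
// semantics outside the Vector API.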
 5058 
 5059 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5060   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5061   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5062   // src1 contains reduction identity
 5063   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5064   match(Set dst (AddReductionVF src1 src2));
 5065   match(Set dst (MulReductionVF src1 src2));
 5066   effect(TEMP dst, TEMP vtmp);
 5067   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5068   ins_encode %{
 5069     int opcode = this->ideal_Opcode();
 5070     int vlen = Matcher::vector_length(this, $src2);
 5071     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5072   %}
 5073   ins_pipe( pipe_slow );
 5074 %}
 5075 
 5076 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5077   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5078   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5079   // src1 contains reduction identity
 5080   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5081   match(Set dst (AddReductionVF src1 src2));
 5082   match(Set dst (MulReductionVF src1 src2));
 5083   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5084   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5085   ins_encode %{
 5086     int opcode = this->ideal_Opcode();
 5087     int vlen = Matcher::vector_length(this, $src2);
 5088     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5089   %}
 5090   ins_pipe( pipe_slow );
 5091 %}
 5092 
 5093 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5094   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5095   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5096   // src1 contains reduction identity
 5097   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5098   match(Set dst (AddReductionVF src1 src2));
 5099   match(Set dst (MulReductionVF src1 src2));
 5100   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5101   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5102   ins_encode %{
 5103     int opcode = this->ideal_Opcode();
 5104     int vlen = Matcher::vector_length(this, $src2);
 5105     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5106   %}
 5107   ins_pipe( pipe_slow );
 5108 %}
 5109 
 5110 // =======================Double Reduction==========================================
 5111 
 5112 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5113   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5114   match(Set dst (AddReductionVD dst src));
 5115   match(Set dst (MulReductionVD dst src));
 5116   effect(TEMP dst, TEMP vtmp);
 5117   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5118   ins_encode %{
 5119     int opcode = this->ideal_Opcode();
 5120     int vlen = Matcher::vector_length(this, $src);
 5121     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5122   %}
 5123   ins_pipe( pipe_slow );
 5124 %}
 5125 
 5126 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5127   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5128   match(Set dst (AddReductionVD dst src));
 5129   match(Set dst (MulReductionVD dst src));
 5130   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5131   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5132   ins_encode %{
 5133     int opcode = this->ideal_Opcode();
 5134     int vlen = Matcher::vector_length(this, $src);
 5135     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5136   %}
 5137   ins_pipe( pipe_slow );
 5138 %}
 5139 
 5140 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5141   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5142   match(Set dst (AddReductionVD dst src));
 5143   match(Set dst (MulReductionVD dst src));
 5144   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5145   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5146   ins_encode %{
 5147     int opcode = this->ideal_Opcode();
 5148     int vlen = Matcher::vector_length(this, $src);
 5149     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5150   %}
 5151   ins_pipe( pipe_slow );
 5152 %}
 5153 
 5154 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5155   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5156   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5157   // src1 contains reduction identity
 5158   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5159   match(Set dst (AddReductionVD src1 src2));
 5160   match(Set dst (MulReductionVD src1 src2));
 5161   effect(TEMP dst);
 5162   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5163   ins_encode %{
 5164     int opcode = this->ideal_Opcode();
 5165     int vlen = Matcher::vector_length(this, $src2);
 5166     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5167   %}
 5168   ins_pipe( pipe_slow );
 5169 %}
 5170 
 5171 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5172   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5173   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5174   // src1 contains reduction identity
 5175   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5176   match(Set dst (AddReductionVD src1 src2));
 5177   match(Set dst (MulReductionVD src1 src2));
 5178   effect(TEMP dst, TEMP vtmp);
 5179   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5180   ins_encode %{
 5181     int opcode = this->ideal_Opcode();
 5182     int vlen = Matcher::vector_length(this, $src2);
 5183     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5184   %}
 5185   ins_pipe( pipe_slow );
 5186 %}
 5187 
 5188 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5189   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5190   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5191   // src1 contains reduction identity
 5192   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5193   match(Set dst (AddReductionVD src1 src2));
 5194   match(Set dst (MulReductionVD src1 src2));
 5195   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5196   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5197   ins_encode %{
 5198     int opcode = this->ideal_Opcode();
 5199     int vlen = Matcher::vector_length(this, $src2);
 5200     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5201   %}
 5202   ins_pipe( pipe_slow );
 5203 %}
 5204 
 5205 // =======================Byte Reduction==========================================
 5206 
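// The two byte-reduction rules below differ only in operand classes: without AVX-512BW
// the macro-assembler falls back to AVX2-style byte/word instructions, which cannot be
// encoded against the EVEX-only registers XMM16-31, hence the legacy-register (legVec)
// operands in that case.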
 5207 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5208   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5209   match(Set dst (AddReductionVI src1 src2));
 5210   match(Set dst (AndReductionV  src1 src2));
 5211   match(Set dst ( OrReductionV  src1 src2));
 5212   match(Set dst (XorReductionV  src1 src2));
 5213   match(Set dst (MinReductionV  src1 src2));
 5214   match(Set dst (MaxReductionV  src1 src2));
 5215   effect(TEMP vtmp1, TEMP vtmp2);
 5216   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5217   ins_encode %{
 5218     int opcode = this->ideal_Opcode();
 5219     int vlen = Matcher::vector_length(this, $src2);
 5220     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5221   %}
 5222   ins_pipe( pipe_slow );
 5223 %}
 5224 
 5225 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5226   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5227   match(Set dst (AddReductionVI src1 src2));
 5228   match(Set dst (AndReductionV  src1 src2));
 5229   match(Set dst ( OrReductionV  src1 src2));
 5230   match(Set dst (XorReductionV  src1 src2));
 5231   match(Set dst (MinReductionV  src1 src2));
 5232   match(Set dst (MaxReductionV  src1 src2));
 5233   effect(TEMP vtmp1, TEMP vtmp2);
 5234   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5235   ins_encode %{
 5236     int opcode = this->ideal_Opcode();
 5237     int vlen = Matcher::vector_length(this, $src2);
 5238     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5239   %}
 5240   ins_pipe( pipe_slow );
 5241 %}
 5242 
 5243 // =======================Short Reduction==========================================
 5244 
 5245 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5246   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5247   match(Set dst (AddReductionVI src1 src2));
 5248   match(Set dst (MulReductionVI src1 src2));
 5249   match(Set dst (AndReductionV  src1 src2));
 5250   match(Set dst ( OrReductionV  src1 src2));
 5251   match(Set dst (XorReductionV  src1 src2));
 5252   match(Set dst (MinReductionV  src1 src2));
 5253   match(Set dst (MaxReductionV  src1 src2));
 5254   effect(TEMP vtmp1, TEMP vtmp2);
 5255   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5256   ins_encode %{
 5257     int opcode = this->ideal_Opcode();
 5258     int vlen = Matcher::vector_length(this, $src2);
 5259     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5260   %}
 5261   ins_pipe( pipe_slow );
 5262 %}
 5263 
 5264 // =======================Mul Reduction==========================================
 5265 
 5266 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5267   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5268             Matcher::vector_length(n->in(2)) <= 32); // src2
 5269   match(Set dst (MulReductionVI src1 src2));
 5270   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5271   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5272   ins_encode %{
 5273     int opcode = this->ideal_Opcode();
 5274     int vlen = Matcher::vector_length(this, $src2);
 5275     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5276   %}
 5277   ins_pipe( pipe_slow );
 5278 %}
 5279 
 5280 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5281   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5282             Matcher::vector_length(n->in(2)) == 64); // src2
 5283   match(Set dst (MulReductionVI src1 src2));
 5284   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5285   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5286   ins_encode %{
 5287     int opcode = this->ideal_Opcode();
 5288     int vlen = Matcher::vector_length(this, $src2);
 5289     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5290   %}
 5291   ins_pipe( pipe_slow );
 5292 %}
 5293 
 5294 //--------------------Min/Max Float Reduction --------------------
 5295 // Float Min/Max Reduction
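// Unlike add/mul, float min/max must implement Java's Math.min/max semantics: NaN is
// propagated and -0.0 orders below +0.0. The pre-AVX10.2 helpers therefore compare and
// blend rather than rely on a bare minps/maxps (whose NaN and signed-zero handling
// differs), which is why these rules need several vector temporaries and clobber the
// flags register. The AVX10.2 variants further below need fewer temporaries and do not
// touch the flags, as the newer min/max instructions handle these cases directly.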
 5296 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5297                             legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5298   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5299             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5300              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5301             Matcher::vector_length(n->in(2)) == 2);
 5302   match(Set dst (MinReductionV src1 src2));
 5303   match(Set dst (MaxReductionV src1 src2));
 5304   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5305   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5306   ins_encode %{
 5307     assert(UseAVX > 0, "sanity");
 5308 
 5309     int opcode = this->ideal_Opcode();
 5310     int vlen = Matcher::vector_length(this, $src2);
 5311     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5312                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5313   %}
 5314   ins_pipe( pipe_slow );
 5315 %}
 5316 
 5317 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5318                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5319   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5320             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5321              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5322             Matcher::vector_length(n->in(2)) >= 4);
 5323   match(Set dst (MinReductionV src1 src2));
 5324   match(Set dst (MaxReductionV src1 src2));
 5325   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5326   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5327   ins_encode %{
 5328     assert(UseAVX > 0, "sanity");
 5329 
 5330     int opcode = this->ideal_Opcode();
 5331     int vlen = Matcher::vector_length(this, $src2);
 5332     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5333                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5334   %}
 5335   ins_pipe( pipe_slow );
 5336 %}
 5337 
 5338 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, legVec atmp,
 5339                                legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5340   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5341             Matcher::vector_length(n->in(2)) == 2);
 5342   match(Set dst (MinReductionV dst src));
 5343   match(Set dst (MaxReductionV dst src));
 5344   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5345   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5346   ins_encode %{
 5347     assert(UseAVX > 0, "sanity");
 5348 
 5349     int opcode = this->ideal_Opcode();
 5350     int vlen = Matcher::vector_length(this, $src);
 5351     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5352                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5353   %}
 5354   ins_pipe( pipe_slow );
 5355 %}
 5356 
 5357 
 5358 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, legVec atmp, legVec btmp,
 5359                               legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5360   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5361             Matcher::vector_length(n->in(2)) >= 4);
 5362   match(Set dst (MinReductionV dst src));
 5363   match(Set dst (MaxReductionV dst src));
 5364   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5365   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5366   ins_encode %{
 5367     assert(UseAVX > 0, "sanity");
 5368 
 5369     int opcode = this->ideal_Opcode();
 5370     int vlen = Matcher::vector_length(this, $src);
 5371     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5372                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5373   %}
 5374   ins_pipe( pipe_slow );
 5375 %}
 5376 
 5377 instruct minmax_reduction2F_avx10(regF dst, immF src1, vec src2, vec xtmp1) %{
 5378   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5379             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5380              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5381             Matcher::vector_length(n->in(2)) == 2);
 5382   match(Set dst (MinReductionV src1 src2));
 5383   match(Set dst (MaxReductionV src1 src2));
 5384   effect(TEMP dst, TEMP xtmp1);
 5385   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 as TEMP" %}
 5386   ins_encode %{
 5387     int opcode = this->ideal_Opcode();
 5388     int vlen = Matcher::vector_length(this, $src2);
 5389     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5390                          xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5391   %}
 5392   ins_pipe( pipe_slow );
 5393 %}
 5394 
 5395 instruct minmax_reductionF_avx10(regF dst, immF src1, vec src2, vec xtmp1, vec xtmp2) %{
 5396   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5397             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5398              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5399             Matcher::vector_length(n->in(2)) >= 4);
 5400   match(Set dst (MinReductionV src1 src2));
 5401   match(Set dst (MaxReductionV src1 src2));
 5402   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5403   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5404   ins_encode %{
 5405     int opcode = this->ideal_Opcode();
 5406     int vlen = Matcher::vector_length(this, $src2);
 5407     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5408                          xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5409   %}
 5410   ins_pipe( pipe_slow );
 5411 %}
 5412 
 5413 instruct minmax_reduction2F_avx10_av(regF dst, vec src, vec xtmp1) %{
 5414   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5415             Matcher::vector_length(n->in(2)) == 2);
 5416   match(Set dst (MinReductionV dst src));
 5417   match(Set dst (MaxReductionV dst src));
 5418   effect(TEMP dst, TEMP xtmp1);
 5419   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 as TEMP" %}
 5420   ins_encode %{
 5421     int opcode = this->ideal_Opcode();
 5422     int vlen = Matcher::vector_length(this, $src);
 5423     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5424                          $xtmp1$$XMMRegister);
 5425   %}
 5426   ins_pipe( pipe_slow );
 5427 %}
 5428 
 5429 instruct minmax_reductionF_avx10_av(regF dst, vec src, vec xtmp1, vec xtmp2) %{
 5430   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5431             Matcher::vector_length(n->in(2)) >= 4);
 5432   match(Set dst (MinReductionV dst src));
 5433   match(Set dst (MaxReductionV dst src));
 5434   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5435   format %{ "vector_minmaxF_reduction $dst, $src \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5436   ins_encode %{
 5437     int opcode = this->ideal_Opcode();
 5438     int vlen = Matcher::vector_length(this, $src);
 5439     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5440                          $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5441   %}
 5442   ins_pipe( pipe_slow );
 5443 %}
 5444 
 5445 //--------------------Min/Max Double Reduction --------------------
 5446 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5447                             legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5448   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5449             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5450              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5451             Matcher::vector_length(n->in(2)) == 2);
 5452   match(Set dst (MinReductionV src1 src2));
 5453   match(Set dst (MaxReductionV src1 src2));
 5454   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5455   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5456   ins_encode %{
 5457     assert(UseAVX > 0, "sanity");
 5458 
 5459     int opcode = this->ideal_Opcode();
 5460     int vlen = Matcher::vector_length(this, $src2);
 5461     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5462                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5463   %}
 5464   ins_pipe( pipe_slow );
 5465 %}
 5466 
 5467 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5468                            legVec tmp3, legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5469   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5470             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5471              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5472             Matcher::vector_length(n->in(2)) >= 4);
 5473   match(Set dst (MinReductionV src1 src2));
 5474   match(Set dst (MaxReductionV src1 src2));
 5475   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5476   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5477   ins_encode %{
 5478     assert(UseAVX > 0, "sanity");
 5479 
 5480     int opcode = this->ideal_Opcode();
 5481     int vlen = Matcher::vector_length(this, $src2);
 5482     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5483                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5484   %}
 5485   ins_pipe( pipe_slow );
 5486 %}
 5487 
 5488 
 5489 instruct minmax_reduction2D_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2,
 5490                                legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5491   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5492             Matcher::vector_length(n->in(2)) == 2);
 5493   match(Set dst (MinReductionV dst src));
 5494   match(Set dst (MaxReductionV dst src));
 5495   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5496   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5497   ins_encode %{
 5498     assert(UseAVX > 0, "sanity");
 5499 
 5500     int opcode = this->ideal_Opcode();
 5501     int vlen = Matcher::vector_length(this, $src);
 5502     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5503                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 instruct minmax_reductionD_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2, legVec tmp3,
 5509                               legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5510   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5511             Matcher::vector_length(n->in(2)) >= 4);
 5512   match(Set dst (MinReductionV dst src));
 5513   match(Set dst (MaxReductionV dst src));
 5514   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5515   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5516   ins_encode %{
 5517     assert(UseAVX > 0, "sanity");
 5518 
 5519     int opcode = this->ideal_Opcode();
 5520     int vlen = Matcher::vector_length(this, $src);
 5521     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5522                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5523   %}
 5524   ins_pipe( pipe_slow );
 5525 %}
 5526 
 5527 instruct minmax_reduction2D_avx10(regD dst, immD src1, vec src2, vec xtmp1) %{
 5528   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5529             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5530              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5531             Matcher::vector_length(n->in(2)) == 2);
 5532   match(Set dst (MinReductionV src1 src2));
 5533   match(Set dst (MaxReductionV src1 src2));
 5534   effect(TEMP dst, TEMP xtmp1);
 5535   format %{ "vector_minmax2D_reduction $dst, $src1, $src2 ; using $xtmp1 as TEMP" %}
 5536   ins_encode %{
 5537     int opcode = this->ideal_Opcode();
 5538     int vlen = Matcher::vector_length(this, $src2);
 5539     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg,
 5540                           xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5541   %}
 5542   ins_pipe( pipe_slow );
 5543 %}
 5544 
 5545 instruct minmax_reductionD_avx10(regD dst, immD src1, vec src2, vec xtmp1, vec xtmp2) %{
 5546   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5547             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5548              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5549             Matcher::vector_length(n->in(2)) >= 4);
 5550   match(Set dst (MinReductionV src1 src2));
 5551   match(Set dst (MaxReductionV src1 src2));
 5552   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5553   format %{ "vector_minmaxD_reduction $dst, $src1, $src2 ; using $xtmp1 and $xtmp2 as TEMP" %}
 5554   ins_encode %{
 5555     int opcode = this->ideal_Opcode();
 5556     int vlen = Matcher::vector_length(this, $src2);
 5557     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5558                           xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5559   %}
 5560   ins_pipe( pipe_slow );
 5561 %}
 5562 
 5563 
 5564 instruct minmax_reduction2D_av_avx10(regD dst, vec src, vec xtmp1) %{
 5565   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5566             Matcher::vector_length(n->in(2)) == 2);
 5567   match(Set dst (MinReductionV dst src));
 5568   match(Set dst (MaxReductionV dst src));
 5569   effect(TEMP dst, TEMP xtmp1);
 5570   format %{ "vector_minmax2D_reduction $dst, $src ; using $xtmp1 as TEMP" %}
 5571   ins_encode %{
 5572     int opcode = this->ideal_Opcode();
 5573     int vlen = Matcher::vector_length(this, $src);
 5574     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5575                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5576   %}
 5577   ins_pipe( pipe_slow );
 5578 %}
 5579 
 5580 instruct minmax_reductionD_av_avx10(regD dst, vec src, vec xtmp1, vec xtmp2) %{
 5581   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5582             Matcher::vector_length(n->in(2)) >= 4);
 5583   match(Set dst (MinReductionV dst src));
 5584   match(Set dst (MaxReductionV dst src));
 5585   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5586   format %{ "vector_minmaxD_reduction $dst, $src ; using $xtmp1 and $xtmp2 as TEMP" %}
 5587   ins_encode %{
 5588     int opcode = this->ideal_Opcode();
 5589     int vlen = Matcher::vector_length(this, $src);
 5590     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5591                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5592   %}
 5593   ins_pipe( pipe_slow );
 5594 %}
 5595 
 5596 // ====================VECTOR ARITHMETIC=======================================
 5597 
 5598 // --------------------------------- ADD --------------------------------------
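// The arithmetic rules below come in up to three flavors per operation and element type:
//   - a two-operand SSE form (UseAVX == 0) that updates dst in place,
//   - a three-operand AVX form (UseAVX > 0) with both sources in registers, and
//   - an AVX form that folds the vector load into the instruction; it is limited to
//     vectors wider than 8 bytes so the (at least 16-byte) memory operand never reads
//     past the data the vector actually holds.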
 5599 
 5600 // Bytes vector add
 5601 instruct vaddB(vec dst, vec src) %{
 5602   predicate(UseAVX == 0);
 5603   match(Set dst (AddVB dst src));
 5604   format %{ "paddb   $dst,$src\t! add packedB" %}
 5605   ins_encode %{
 5606     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5607   %}
 5608   ins_pipe( pipe_slow );
 5609 %}
 5610 
 5611 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5612   predicate(UseAVX > 0);
 5613   match(Set dst (AddVB src1 src2));
 5614   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5615   ins_encode %{
 5616     int vlen_enc = vector_length_encoding(this);
 5617     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5618   %}
 5619   ins_pipe( pipe_slow );
 5620 %}
 5621 
 5622 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5623   predicate((UseAVX > 0) &&
 5624             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5625   match(Set dst (AddVB src (LoadVector mem)));
 5626   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5627   ins_encode %{
 5628     int vlen_enc = vector_length_encoding(this);
 5629     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5630   %}
 5631   ins_pipe( pipe_slow );
 5632 %}
 5633 
 5634 // Shorts/Chars vector add
 5635 instruct vaddS(vec dst, vec src) %{
 5636   predicate(UseAVX == 0);
 5637   match(Set dst (AddVS dst src));
 5638   format %{ "paddw   $dst,$src\t! add packedS" %}
 5639   ins_encode %{
 5640     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5641   %}
 5642   ins_pipe( pipe_slow );
 5643 %}
 5644 
 5645 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5646   predicate(UseAVX > 0);
 5647   match(Set dst (AddVS src1 src2));
 5648   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5649   ins_encode %{
 5650     int vlen_enc = vector_length_encoding(this);
 5651     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5652   %}
 5653   ins_pipe( pipe_slow );
 5654 %}
 5655 
 5656 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5657   predicate((UseAVX > 0) &&
 5658             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5659   match(Set dst (AddVS src (LoadVector mem)));
 5660   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5661   ins_encode %{
 5662     int vlen_enc = vector_length_encoding(this);
 5663     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5664   %}
 5665   ins_pipe( pipe_slow );
 5666 %}
 5667 
 5668 // Integers vector add
 5669 instruct vaddI(vec dst, vec src) %{
 5670   predicate(UseAVX == 0);
 5671   match(Set dst (AddVI dst src));
 5672   format %{ "paddd   $dst,$src\t! add packedI" %}
 5673   ins_encode %{
 5674     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5675   %}
 5676   ins_pipe( pipe_slow );
 5677 %}
 5678 
 5679 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5680   predicate(UseAVX > 0);
 5681   match(Set dst (AddVI src1 src2));
 5682   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5683   ins_encode %{
 5684     int vlen_enc = vector_length_encoding(this);
 5685     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5686   %}
 5687   ins_pipe( pipe_slow );
 5688 %}
 5689 
 5690 
 5691 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5692   predicate((UseAVX > 0) &&
 5693             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5694   match(Set dst (AddVI src (LoadVector mem)));
 5695   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5696   ins_encode %{
 5697     int vlen_enc = vector_length_encoding(this);
 5698     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5699   %}
 5700   ins_pipe( pipe_slow );
 5701 %}
 5702 
 5703 // Longs vector add
 5704 instruct vaddL(vec dst, vec src) %{
 5705   predicate(UseAVX == 0);
 5706   match(Set dst (AddVL dst src));
 5707   format %{ "paddq   $dst,$src\t! add packedL" %}
 5708   ins_encode %{
 5709     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5710   %}
 5711   ins_pipe( pipe_slow );
 5712 %}
 5713 
 5714 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5715   predicate(UseAVX > 0);
 5716   match(Set dst (AddVL src1 src2));
 5717   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5718   ins_encode %{
 5719     int vlen_enc = vector_length_encoding(this);
 5720     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5721   %}
 5722   ins_pipe( pipe_slow );
 5723 %}
 5724 
 5725 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5726   predicate((UseAVX > 0) &&
 5727             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5728   match(Set dst (AddVL src (LoadVector mem)));
 5729   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5730   ins_encode %{
 5731     int vlen_enc = vector_length_encoding(this);
 5732     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5733   %}
 5734   ins_pipe( pipe_slow );
 5735 %}
 5736 
 5737 // Floats vector add
 5738 instruct vaddF(vec dst, vec src) %{
 5739   predicate(UseAVX == 0);
 5740   match(Set dst (AddVF dst src));
 5741   format %{ "addps   $dst,$src\t! add packedF" %}
 5742   ins_encode %{
 5743     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5744   %}
 5745   ins_pipe( pipe_slow );
 5746 %}
 5747 
 5748 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5749   predicate(UseAVX > 0);
 5750   match(Set dst (AddVF src1 src2));
 5751   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5752   ins_encode %{
 5753     int vlen_enc = vector_length_encoding(this);
 5754     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5755   %}
 5756   ins_pipe( pipe_slow );
 5757 %}
 5758 
 5759 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5760   predicate((UseAVX > 0) &&
 5761             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5762   match(Set dst (AddVF src (LoadVector mem)));
 5763   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5764   ins_encode %{
 5765     int vlen_enc = vector_length_encoding(this);
 5766     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5767   %}
 5768   ins_pipe( pipe_slow );
 5769 %}
 5770 
 5771 // Doubles vector add
 5772 instruct vaddD(vec dst, vec src) %{
 5773   predicate(UseAVX == 0);
 5774   match(Set dst (AddVD dst src));
 5775   format %{ "addpd   $dst,$src\t! add packedD" %}
 5776   ins_encode %{
 5777     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5778   %}
 5779   ins_pipe( pipe_slow );
 5780 %}
 5781 
 5782 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5783   predicate(UseAVX > 0);
 5784   match(Set dst (AddVD src1 src2));
 5785   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5786   ins_encode %{
 5787     int vlen_enc = vector_length_encoding(this);
 5788     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5789   %}
 5790   ins_pipe( pipe_slow );
 5791 %}
 5792 
 5793 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5794   predicate((UseAVX > 0) &&
 5795             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5796   match(Set dst (AddVD src (LoadVector mem)));
 5797   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5798   ins_encode %{
 5799     int vlen_enc = vector_length_encoding(this);
 5800     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5801   %}
 5802   ins_pipe( pipe_slow );
 5803 %}
 5804 
 5805 // --------------------------------- SUB --------------------------------------
 5806 
 5807 // Bytes vector sub
 5808 instruct vsubB(vec dst, vec src) %{
 5809   predicate(UseAVX == 0);
 5810   match(Set dst (SubVB dst src));
 5811   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5812   ins_encode %{
 5813     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5814   %}
 5815   ins_pipe( pipe_slow );
 5816 %}
 5817 
 5818 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5819   predicate(UseAVX > 0);
 5820   match(Set dst (SubVB src1 src2));
 5821   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5822   ins_encode %{
 5823     int vlen_enc = vector_length_encoding(this);
 5824     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5825   %}
 5826   ins_pipe( pipe_slow );
 5827 %}
 5828 
 5829 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5830   predicate((UseAVX > 0) &&
 5831             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5832   match(Set dst (SubVB src (LoadVector mem)));
 5833   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5834   ins_encode %{
 5835     int vlen_enc = vector_length_encoding(this);
 5836     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5837   %}
 5838   ins_pipe( pipe_slow );
 5839 %}
 5840 
 5841 // Shorts/Chars vector sub
 5842 instruct vsubS(vec dst, vec src) %{
 5843   predicate(UseAVX == 0);
 5844   match(Set dst (SubVS dst src));
 5845   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5846   ins_encode %{
 5847     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5848   %}
 5849   ins_pipe( pipe_slow );
 5850 %}
 5851 
 5852 
 5853 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5854   predicate(UseAVX > 0);
 5855   match(Set dst (SubVS src1 src2));
 5856   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5857   ins_encode %{
 5858     int vlen_enc = vector_length_encoding(this);
 5859     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5860   %}
 5861   ins_pipe( pipe_slow );
 5862 %}
 5863 
 5864 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5865   predicate((UseAVX > 0) &&
 5866             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5867   match(Set dst (SubVS src (LoadVector mem)));
 5868   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5869   ins_encode %{
 5870     int vlen_enc = vector_length_encoding(this);
 5871     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5872   %}
 5873   ins_pipe( pipe_slow );
 5874 %}
 5875 
 5876 // Integers vector sub
 5877 instruct vsubI(vec dst, vec src) %{
 5878   predicate(UseAVX == 0);
 5879   match(Set dst (SubVI dst src));
 5880   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5881   ins_encode %{
 5882     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5883   %}
 5884   ins_pipe( pipe_slow );
 5885 %}
 5886 
 5887 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5888   predicate(UseAVX > 0);
 5889   match(Set dst (SubVI src1 src2));
 5890   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5891   ins_encode %{
 5892     int vlen_enc = vector_length_encoding(this);
 5893     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5894   %}
 5895   ins_pipe( pipe_slow );
 5896 %}
 5897 
 5898 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5899   predicate((UseAVX > 0) &&
 5900             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5901   match(Set dst (SubVI src (LoadVector mem)));
 5902   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5903   ins_encode %{
 5904     int vlen_enc = vector_length_encoding(this);
 5905     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5906   %}
 5907   ins_pipe( pipe_slow );
 5908 %}
 5909 
 5910 // Longs vector sub
 5911 instruct vsubL(vec dst, vec src) %{
 5912   predicate(UseAVX == 0);
 5913   match(Set dst (SubVL dst src));
 5914   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5915   ins_encode %{
 5916     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5917   %}
 5918   ins_pipe( pipe_slow );
 5919 %}
 5920 
 5921 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5922   predicate(UseAVX > 0);
 5923   match(Set dst (SubVL src1 src2));
 5924   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5925   ins_encode %{
 5926     int vlen_enc = vector_length_encoding(this);
 5927     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5928   %}
 5929   ins_pipe( pipe_slow );
 5930 %}
 5931 
 5932 
 5933 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5934   predicate((UseAVX > 0) &&
 5935             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5936   match(Set dst (SubVL src (LoadVector mem)));
 5937   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5938   ins_encode %{
 5939     int vlen_enc = vector_length_encoding(this);
 5940     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5941   %}
 5942   ins_pipe( pipe_slow );
 5943 %}
 5944 
 5945 // Floats vector sub
 5946 instruct vsubF(vec dst, vec src) %{
 5947   predicate(UseAVX == 0);
 5948   match(Set dst (SubVF dst src));
 5949   format %{ "subps   $dst,$src\t! sub packedF" %}
 5950   ins_encode %{
 5951     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5952   %}
 5953   ins_pipe( pipe_slow );
 5954 %}
 5955 
 5956 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5957   predicate(UseAVX > 0);
 5958   match(Set dst (SubVF src1 src2));
 5959   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5960   ins_encode %{
 5961     int vlen_enc = vector_length_encoding(this);
 5962     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5963   %}
 5964   ins_pipe( pipe_slow );
 5965 %}
 5966 
 5967 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5968   predicate((UseAVX > 0) &&
 5969             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5970   match(Set dst (SubVF src (LoadVector mem)));
 5971   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5972   ins_encode %{
 5973     int vlen_enc = vector_length_encoding(this);
 5974     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5975   %}
 5976   ins_pipe( pipe_slow );
 5977 %}
 5978 
 5979 // Doubles vector sub
 5980 instruct vsubD(vec dst, vec src) %{
 5981   predicate(UseAVX == 0);
 5982   match(Set dst (SubVD dst src));
 5983   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5984   ins_encode %{
 5985     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5986   %}
 5987   ins_pipe( pipe_slow );
 5988 %}
 5989 
 5990 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5991   predicate(UseAVX > 0);
 5992   match(Set dst (SubVD src1 src2));
 5993   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5994   ins_encode %{
 5995     int vlen_enc = vector_length_encoding(this);
 5996     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5997   %}
 5998   ins_pipe( pipe_slow );
 5999 %}
 6000 
 6001 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 6002   predicate((UseAVX > 0) &&
 6003             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6004   match(Set dst (SubVD src (LoadVector mem)));
 6005   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6006   ins_encode %{
 6007     int vlen_enc = vector_length_encoding(this);
 6008     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6009   %}
 6010   ins_pipe( pipe_slow );
 6011 %}
 6012 
 6013 // --------------------------------- MUL --------------------------------------
 6014 
 6015 // Byte vector mul
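// There is no packed byte multiply in SSE/AVX, so MulVB is synthesized from 16-bit
// multiplies: vectors of at most 8 bytes are sign-extended to words (pmovsxbw) and
// multiplied directly, while wider vectors multiply the even- and odd-indexed bytes as
// separate word vectors, mask/shift each product back into its byte position, and OR the
// two halves together.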
 6016 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6017   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6018   match(Set dst (MulVB src1 src2));
 6019   effect(TEMP dst, TEMP xtmp);
 6020   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6021   ins_encode %{
 6022     assert(UseSSE > 3, "required");
 6023     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6024     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6025     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6026     __ psllw($dst$$XMMRegister, 8);
 6027     __ psrlw($dst$$XMMRegister, 8);
 6028     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6029   %}
 6030   ins_pipe( pipe_slow );
 6031 %}
 6032 
 6033 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6034   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6035   match(Set dst (MulVB src1 src2));
 6036   effect(TEMP dst, TEMP xtmp);
 6037   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6038   ins_encode %{
 6039     assert(UseSSE > 3, "required");
 6040     // Odd-index elements
 6041     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6042     __ psrlw($dst$$XMMRegister, 8);
 6043     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6044     __ psrlw($xtmp$$XMMRegister, 8);
 6045     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6046     __ psllw($dst$$XMMRegister, 8);
 6047     // Even-index elements
 6048     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6049     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6050     __ psllw($xtmp$$XMMRegister, 8);
 6051     __ psrlw($xtmp$$XMMRegister, 8);
 6052     // Combine
 6053     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6054   %}
 6055   ins_pipe( pipe_slow );
 6056 %}
 6057 
 6058 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6059   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6060   match(Set dst (MulVB src1 src2));
 6061   effect(TEMP xtmp1, TEMP xtmp2);
 6062   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6063   ins_encode %{
 6064     int vlen_enc = vector_length_encoding(this);
 6065     // Odd-index elements
 6066     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6067     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6068     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6069     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6070     // Even-index elements
 6071     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6072     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6073     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6074     // Combine
 6075     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6076   %}
 6077   ins_pipe( pipe_slow );
 6078 %}
 6079 
 6080 // Shorts/Chars vector mul
 6081 instruct vmulS(vec dst, vec src) %{
 6082   predicate(UseAVX == 0);
 6083   match(Set dst (MulVS dst src));
 6084   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6085   ins_encode %{
 6086     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6087   %}
 6088   ins_pipe( pipe_slow );
 6089 %}
 6090 
 6091 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6092   predicate(UseAVX > 0);
 6093   match(Set dst (MulVS src1 src2));
 6094   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6095   ins_encode %{
 6096     int vlen_enc = vector_length_encoding(this);
 6097     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6098   %}
 6099   ins_pipe( pipe_slow );
 6100 %}
 6101 
 6102 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6103   predicate((UseAVX > 0) &&
 6104             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6105   match(Set dst (MulVS src (LoadVector mem)));
 6106   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6107   ins_encode %{
 6108     int vlen_enc = vector_length_encoding(this);
 6109     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6110   %}
 6111   ins_pipe( pipe_slow );
 6112 %}
 6113 
 6114 // Integers vector mul
 6115 instruct vmulI(vec dst, vec src) %{
 6116   predicate(UseAVX == 0);
 6117   match(Set dst (MulVI dst src));
 6118   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6119   ins_encode %{
 6120     assert(UseSSE > 3, "required");
 6121     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6122   %}
 6123   ins_pipe( pipe_slow );
 6124 %}
 6125 
 6126 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6127   predicate(UseAVX > 0);
 6128   match(Set dst (MulVI src1 src2));
 6129   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6130   ins_encode %{
 6131     int vlen_enc = vector_length_encoding(this);
 6132     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6133   %}
 6134   ins_pipe( pipe_slow );
 6135 %}
 6136 
 6137 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6138   predicate((UseAVX > 0) &&
 6139             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6140   match(Set dst (MulVI src (LoadVector mem)));
 6141   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6142   ins_encode %{
 6143     int vlen_enc = vector_length_encoding(this);
 6144     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6145   %}
 6146   ins_pipe( pipe_slow );
 6147 %}
 6148 
 6149 // Longs vector mul
 6150 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6151   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6152              VM_Version::supports_avx512dq()) ||
 6153             VM_Version::supports_avx512vldq());
 6154   match(Set dst (MulVL src1 src2));
 6155   ins_cost(500);
 6156   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6157   ins_encode %{
 6158     assert(UseAVX > 2, "required");
 6159     int vlen_enc = vector_length_encoding(this);
 6160     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6161   %}
 6162   ins_pipe( pipe_slow );
 6163 %}
 6164 
 6165 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6166   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6167              VM_Version::supports_avx512dq()) ||
 6168             (Matcher::vector_length_in_bytes(n) > 8 &&
 6169              VM_Version::supports_avx512vldq()));
 6170   match(Set dst (MulVL src (LoadVector mem)));
 6171   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6172   ins_cost(500);
 6173   ins_encode %{
 6174     assert(UseAVX > 2, "required");
 6175     int vlen_enc = vector_length_encoding(this);
 6176     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6177   %}
 6178   ins_pipe( pipe_slow );
 6179 %}
 6180 
 6181 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6182   predicate(UseAVX == 0);
 6183   match(Set dst (MulVL src1 src2));
 6184   ins_cost(500);
 6185   effect(TEMP dst, TEMP xtmp);
 6186   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6187   ins_encode %{
 6188     assert(VM_Version::supports_sse4_1(), "required");
 6189     // Get the lo-hi cross products; only their lower 32 bits are of interest
 6190     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6191     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6192     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6193     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6194     __ psllq($dst$$XMMRegister, 32);
 6195     // Get the lo-lo products
 6196     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6197     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6198     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6199   %}
 6200   ins_pipe( pipe_slow );
 6201 %}
 6202 
 6203 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6204   predicate(UseAVX > 0 &&
 6205             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6206               !VM_Version::supports_avx512dq()) ||
 6207              (Matcher::vector_length_in_bytes(n) < 64 &&
 6208               !VM_Version::supports_avx512vldq())));
 6209   match(Set dst (MulVL src1 src2));
 6210   effect(TEMP xtmp1, TEMP xtmp2);
 6211   ins_cost(500);
 6212   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6213   ins_encode %{
 6214     int vlen_enc = vector_length_encoding(this);
 6215     // Get the lo-hi cross products; only their lower 32 bits are of interest
 6216     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6217     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6218     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6219     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6220     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6221     // Get the lo-lo products
 6222     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6223     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6224   %}
 6225   ins_pipe( pipe_slow );
 6226 %}
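
// Both MulVL expansions above split each 64-bit lane into 32-bit halves:
// the product modulo 2^64 is the lo*lo product plus the two "lo-hi" cross
// products shifted left by 32. A scalar sketch (plain C++, illustration only):
//
//   #include <cstdint>
//   static uint64_t mul64(uint64_t a, uint64_t b) {
//     uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
//     uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
//     uint64_t cross = (a_lo * b_hi + a_hi * b_lo) << 32;  // pshufd/pmulld/paddd/psllq
//     return a_lo * b_lo + cross;                          // pmuludq/paddq
//   }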
 6227 
 6228 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6229   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6230   match(Set dst (MulVL src1 src2));
 6231   ins_cost(100);
 6232   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6233   ins_encode %{
 6234     int vlen_enc = vector_length_encoding(this);
 6235     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6236   %}
 6237   ins_pipe( pipe_slow );
 6238 %}
 6239 
 6240 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6241   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6242   match(Set dst (MulVL src1 src2));
 6243   ins_cost(100);
 6244   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6245   ins_encode %{
 6246     int vlen_enc = vector_length_encoding(this);
 6247     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6248   %}
 6249   ins_pipe( pipe_slow );
 6250 %}
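
// When C2 knows that both MulVL inputs are zero-extended (has_uint_inputs) or
// sign-extended (has_int_inputs) 32-bit values, the cross products above are
// not needed: a single widening 32x32->64 multiply already equals the full
// 64-bit product, hence the cheaper rules above. A scalar sketch (illustration only):
//
//   uint64_t u = (uint64_t)(uint32_t)a * (uint32_t)b;  // vpmuludq: unsigned 32x32 -> 64
//   int64_t  s = (int64_t)(int32_t)a * (int32_t)b;     // vpmuldq:  signed   32x32 -> 64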
 6251 
 6252 // Floats vector mul
 6253 instruct vmulF(vec dst, vec src) %{
 6254   predicate(UseAVX == 0);
 6255   match(Set dst (MulVF dst src));
 6256   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6257   ins_encode %{
 6258     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6259   %}
 6260   ins_pipe( pipe_slow );
 6261 %}
 6262 
 6263 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6264   predicate(UseAVX > 0);
 6265   match(Set dst (MulVF src1 src2));
 6266   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6267   ins_encode %{
 6268     int vlen_enc = vector_length_encoding(this);
 6269     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6270   %}
 6271   ins_pipe( pipe_slow );
 6272 %}
 6273 
 6274 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6275   predicate((UseAVX > 0) &&
 6276             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6277   match(Set dst (MulVF src (LoadVector mem)));
 6278   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6279   ins_encode %{
 6280     int vlen_enc = vector_length_encoding(this);
 6281     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6282   %}
 6283   ins_pipe( pipe_slow );
 6284 %}
 6285 
 6286 // Doubles vector mul
 6287 instruct vmulD(vec dst, vec src) %{
 6288   predicate(UseAVX == 0);
 6289   match(Set dst (MulVD dst src));
 6290   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6291   ins_encode %{
 6292     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6293   %}
 6294   ins_pipe( pipe_slow );
 6295 %}
 6296 
 6297 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6298   predicate(UseAVX > 0);
 6299   match(Set dst (MulVD src1 src2));
 6300   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6301   ins_encode %{
 6302     int vlen_enc = vector_length_encoding(this);
 6303     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6304   %}
 6305   ins_pipe( pipe_slow );
 6306 %}
 6307 
 6308 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6309   predicate((UseAVX > 0) &&
 6310             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6311   match(Set dst (MulVD src (LoadVector mem)));
 6312   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6313   ins_encode %{
 6314     int vlen_enc = vector_length_encoding(this);
 6315     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6316   %}
 6317   ins_pipe( pipe_slow );
 6318 %}
 6319 
 6320 // --------------------------------- DIV --------------------------------------
 6321 
 6322 // Floats vector div
 6323 instruct vdivF(vec dst, vec src) %{
 6324   predicate(UseAVX == 0);
 6325   match(Set dst (DivVF dst src));
 6326   format %{ "divps   $dst,$src\t! div packedF" %}
 6327   ins_encode %{
 6328     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6329   %}
 6330   ins_pipe( pipe_slow );
 6331 %}
 6332 
 6333 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6334   predicate(UseAVX > 0);
 6335   match(Set dst (DivVF src1 src2));
 6336   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6337   ins_encode %{
 6338     int vlen_enc = vector_length_encoding(this);
 6339     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6340   %}
 6341   ins_pipe( pipe_slow );
 6342 %}
 6343 
 6344 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6345   predicate((UseAVX > 0) &&
 6346             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6347   match(Set dst (DivVF src (LoadVector mem)));
 6348   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6349   ins_encode %{
 6350     int vlen_enc = vector_length_encoding(this);
 6351     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6352   %}
 6353   ins_pipe( pipe_slow );
 6354 %}
 6355 
 6356 // Doubles vector div
 6357 instruct vdivD(vec dst, vec src) %{
 6358   predicate(UseAVX == 0);
 6359   match(Set dst (DivVD dst src));
 6360   format %{ "divpd   $dst,$src\t! div packedD" %}
 6361   ins_encode %{
 6362     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6363   %}
 6364   ins_pipe( pipe_slow );
 6365 %}
 6366 
 6367 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6368   predicate(UseAVX > 0);
 6369   match(Set dst (DivVD src1 src2));
 6370   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6371   ins_encode %{
 6372     int vlen_enc = vector_length_encoding(this);
 6373     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6374   %}
 6375   ins_pipe( pipe_slow );
 6376 %}
 6377 
 6378 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6379   predicate((UseAVX > 0) &&
 6380             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6381   match(Set dst (DivVD src (LoadVector mem)));
 6382   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6383   ins_encode %{
 6384     int vlen_enc = vector_length_encoding(this);
 6385     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6386   %}
 6387   ins_pipe( pipe_slow );
 6388 %}
 6389 
 6390 // ------------------------------ MinMax ---------------------------------------
 6391 
 6392 // Byte, Short, Int vector Min/Max
 6393 instruct minmax_reg_sse(vec dst, vec src) %{
 6394   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6395             UseAVX == 0);
 6396   match(Set dst (MinV dst src));
 6397   match(Set dst (MaxV dst src));
 6398   format %{ "vector_minmax  $dst,$src\t!  " %}
 6399   ins_encode %{
 6400     assert(UseSSE >= 4, "required");
 6401 
 6402     int opcode = this->ideal_Opcode();
 6403     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6404     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6405   %}
 6406   ins_pipe( pipe_slow );
 6407 %}
 6408 
 6409 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6410   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6411             UseAVX > 0);
 6412   match(Set dst (MinV src1 src2));
 6413   match(Set dst (MaxV src1 src2));
 6414   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6415   ins_encode %{
 6416     int opcode = this->ideal_Opcode();
 6417     int vlen_enc = vector_length_encoding(this);
 6418     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6419 
 6420     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6421   %}
 6422   ins_pipe( pipe_slow );
 6423 %}
 6424 
 6425 // Long vector Min/Max
 6426 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6427   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6428             UseAVX == 0);
 6429   match(Set dst (MinV dst src));
 6430   match(Set dst (MaxV src dst));
 6431   effect(TEMP dst, TEMP tmp);
 6432   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6433   ins_encode %{
 6434     assert(UseSSE >= 4, "required");
 6435 
 6436     int opcode = this->ideal_Opcode();
 6437     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6438     assert(elem_bt == T_LONG, "sanity");
 6439 
 6440     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6441   %}
 6442   ins_pipe( pipe_slow );
 6443 %}
 6444 
 6445 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6446   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6447             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6448   match(Set dst (MinV src1 src2));
 6449   match(Set dst (MaxV src1 src2));
 6450   effect(TEMP dst);
 6451   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6452   ins_encode %{
 6453     int vlen_enc = vector_length_encoding(this);
 6454     int opcode = this->ideal_Opcode();
 6455     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6456     assert(elem_bt == T_LONG, "sanity");
 6457 
 6458     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6459   %}
 6460   ins_pipe( pipe_slow );
 6461 %}
 6462 
 6463 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6464   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6465             Matcher::vector_element_basic_type(n) == T_LONG);
 6466   match(Set dst (MinV src1 src2));
 6467   match(Set dst (MaxV src1 src2));
 6468   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6469   ins_encode %{
 6470     assert(UseAVX > 2, "required");
 6471 
 6472     int vlen_enc = vector_length_encoding(this);
 6473     int opcode = this->ideal_Opcode();
 6474     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6475     assert(elem_bt == T_LONG, "sanity");
 6476 
 6477     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6478   %}
 6479   ins_pipe( pipe_slow );
 6480 %}
 6481 
 6482 // Float/Double vector Min/Max
 6483 instruct minmaxFP_avx10_reg(vec dst, vec a, vec b) %{
 6484   predicate(VM_Version::supports_avx10_2() &&
 6485             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6486   match(Set dst (MinV a b));
 6487   match(Set dst (MaxV a b));
 6488   format %{ "vector_minmaxFP  $dst, $a, $b" %}
 6489   ins_encode %{
 6490     int vlen_enc = vector_length_encoding(this);
 6491     int opcode = this->ideal_Opcode();
 6492     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6493     __ vminmax_fp(opcode, elem_bt, $dst$$XMMRegister, k0, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6494   %}
 6495   ins_pipe( pipe_slow );
 6496 %}
 6497 
 6498 // Float/Double vector Min/Max
 6499 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6500   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) <= 32 &&
 6501             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6502             UseAVX > 0);
 6503   match(Set dst (MinV a b));
 6504   match(Set dst (MaxV a b));
 6505   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6506   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6507   ins_encode %{
 6508     assert(UseAVX > 0, "required");
 6509 
 6510     int opcode = this->ideal_Opcode();
 6511     int vlen_enc = vector_length_encoding(this);
 6512     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6513 
 6514     __ vminmax_fp(opcode, elem_bt,
 6515                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6516                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6517   %}
 6518   ins_pipe( pipe_slow );
 6519 %}
 6520 
 6521 instruct evminmaxFP_reg_evex(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6522   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) == 64 &&
 6523             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6524   match(Set dst (MinV a b));
 6525   match(Set dst (MaxV a b));
 6526   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6527   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6528   ins_encode %{
 6529     assert(UseAVX > 2, "required");
 6530 
 6531     int opcode = this->ideal_Opcode();
 6532     int vlen_enc = vector_length_encoding(this);
 6533     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6534 
 6535     __ evminmax_fp(opcode, elem_bt,
 6536                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6537                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6538   %}
 6539   ins_pipe( pipe_slow );
 6540 %}
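
// The temporaries and mask register in the two rules above exist because
// Java's Math.min/max semantics differ from raw minps/maxps: a NaN in either
// input must produce NaN, and -0.0 must order below +0.0, whereas the raw
// instructions simply return the second operand in those cases. A scalar
// sketch of the required semantics (illustration only; the vminmax_fp /
// evminmax_fp macros implement this with compares and blends):
//
//   #include <cmath>
//   static float java_min(float a, float b) {
//     if (a != a) return a;                      // NaN propagates
//     if (b != b) return b;
//     if (a == 0.0f && b == 0.0f)                // order -0.0f below +0.0f
//       return std::signbit(a) ? a : b;
//     return (a < b) ? a : b;
//   }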
 6541 
 6542 // ------------------------------ Unsigned vector Min/Max ----------------------
 6543 
 6544 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6545   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6546   match(Set dst (UMinV a b));
 6547   match(Set dst (UMaxV a b));
 6548   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6549   ins_encode %{
 6550     int opcode = this->ideal_Opcode();
 6551     int vlen_enc = vector_length_encoding(this);
 6552     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6553     assert(is_integral_type(elem_bt), "");
 6554     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6555   %}
 6556   ins_pipe( pipe_slow );
 6557 %}
 6558 
 6559 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6560   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6561   match(Set dst (UMinV a (LoadVector b)));
 6562   match(Set dst (UMaxV a (LoadVector b)));
 6563   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6564   ins_encode %{
 6565     int opcode = this->ideal_Opcode();
 6566     int vlen_enc = vector_length_encoding(this);
 6567     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6568     assert(is_integral_type(elem_bt), "");
 6569     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6570   %}
 6571   ins_pipe( pipe_slow );
 6572 %}
 6573 
 6574 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6575   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6576   match(Set dst (UMinV a b));
 6577   match(Set dst (UMaxV a b));
 6578   effect(TEMP xtmp1, TEMP xtmp2);
 6579   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6580   ins_encode %{
 6581     int opcode = this->ideal_Opcode();
 6582     int vlen_enc = vector_length_encoding(this);
 6583     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6584   %}
 6585   ins_pipe( pipe_slow );
 6586 %}
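
// Without AVX512VL there is no packed unsigned 64-bit compare, so unsigned
// long min/max is derived from a signed compare by biasing both operands by
// 2^63 (a sketch of the idea only; the vpuminmaxq macro expansion may differ
// in detail):
//
//   #include <cstdint>
//   static uint64_t umin64(uint64_t a, uint64_t b) {
//     const uint64_t bias = 0x8000000000000000ull;
//     int64_t sa = (int64_t)(a ^ bias);          // flip the sign bit
//     int64_t sb = (int64_t)(b ^ bias);
//     return (sa < sb) ? a : b;                  // signed compare selects the original value
//   }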
 6587 
 6588 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6589   match(Set dst (UMinV (Binary dst src2) mask));
 6590   match(Set dst (UMaxV (Binary dst src2) mask));
 6591   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6592   ins_encode %{
 6593     int vlen_enc = vector_length_encoding(this);
 6594     BasicType bt = Matcher::vector_element_basic_type(this);
 6595     int opc = this->ideal_Opcode();
 6596     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6597                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6598   %}
 6599   ins_pipe( pipe_slow );
 6600 %}
 6601 
 6602 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6603   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6604   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6605   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6606   ins_encode %{
 6607     int vlen_enc = vector_length_encoding(this);
 6608     BasicType bt = Matcher::vector_element_basic_type(this);
 6609     int opc = this->ideal_Opcode();
 6610     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6611                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6612   %}
 6613   ins_pipe( pipe_slow );
 6614 %}
 6615 
 6616 // --------------------------------- Signum/CopySign ---------------------------
 6617 
 6618 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6619   match(Set dst (SignumF dst (Binary zero one)));
 6620   effect(KILL cr);
 6621   format %{ "signumF $dst, $dst" %}
 6622   ins_encode %{
 6623     int opcode = this->ideal_Opcode();
 6624     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6625   %}
 6626   ins_pipe( pipe_slow );
 6627 %}
 6628 
 6629 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6630   match(Set dst (SignumD dst (Binary zero one)));
 6631   effect(KILL cr);
 6632   format %{ "signumD $dst, $dst" %}
 6633   ins_encode %{
 6634     int opcode = this->ideal_Opcode();
 6635     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6636   %}
 6637   ins_pipe( pipe_slow );
 6638 %}
 6639 
 6640 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6641   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6642   match(Set dst (SignumVF src (Binary zero one)));
 6643   match(Set dst (SignumVD src (Binary zero one)));
 6644   effect(TEMP dst, TEMP xtmp1);
 6645   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6646   ins_encode %{
 6647     int opcode = this->ideal_Opcode();
 6648     int vec_enc = vector_length_encoding(this);
 6649     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6650                          $xtmp1$$XMMRegister, vec_enc);
 6651   %}
 6652   ins_pipe( pipe_slow );
 6653 %}
 6654 
 6655 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6656   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6657   match(Set dst (SignumVF src (Binary zero one)));
 6658   match(Set dst (SignumVD src (Binary zero one)));
 6659   effect(TEMP dst, TEMP ktmp1);
 6660   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6661   ins_encode %{
 6662     int opcode = this->ideal_Opcode();
 6663     int vec_enc = vector_length_encoding(this);
 6664     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6665                           $ktmp1$$KRegister, vec_enc);
 6666   %}
 6667   ins_pipe( pipe_slow );
 6668 %}
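
// Both signum rules above blend between the original value and +/-1.0 because
// Java's Math.signum must return NaN and signed zeros unchanged. A scalar
// sketch of the semantics (illustration only):
//
//   static double signum(double x) {
//     if (x != x || x == 0.0) return x;          // NaN and +/-0.0 pass through
//     return (x > 0.0) ? 1.0 : -1.0;
//   }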
 6669 
 6670 // ---------------------------------------
 6671 // For copySign use 0xE4 as writemask for vpternlog
 6672 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6673 // C (xmm2) is set to 0x7FFFFFFF
 6674 // Wherever xmm2 is 0, we want to pick from B (sign)
 6675 // Wherever xmm2 is 1, we want to pick from A (src)
 6676 //
 6677 // A B C Result
 6678 // 0 0 0 0
 6679 // 0 0 1 0
 6680 // 0 1 0 1
 6681 // 0 1 1 0
 6682 // 1 0 0 0
 6683 // 1 0 1 1
 6684 // 1 1 0 1
 6685 // 1 1 1 1
 6686 //
 6687 // Reading the Result column from the high bit (A,B,C = 1,1,1) down to the low bit (0,0,0) gives 11100100 in binary = 0xE4
 6688 // ---------------------------------------
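
// The 8-bit vpternlog immediate is this truth table packed into a byte: the
// output bit for inputs (A, B, C) is bit (A*4 + B*2 + C) of the immediate.
// A small sketch of deriving 0xE4 (illustration only):
//
//   int imm = 0;
//   for (int a = 0; a <= 1; a++)
//     for (int b = 0; b <= 1; b++)
//       for (int c = 0; c <= 1; c++)
//         if (c ? a : b)                         // pick A where C is 1, else B
//           imm |= 1 << (a * 4 + b * 2 + c);
//   // imm == 0xE4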
 6689 
 6690 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6691   match(Set dst (CopySignF dst src));
 6692   effect(TEMP tmp1, TEMP tmp2);
 6693   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6694   ins_encode %{
 6695     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6696     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6697     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6698   %}
 6699   ins_pipe( pipe_slow );
 6700 %}
 6701 
 6702 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6703   match(Set dst (CopySignD dst (Binary src zero)));
 6704   ins_cost(100);
 6705   effect(TEMP tmp1, TEMP tmp2);
 6706   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6707   ins_encode %{
 6708     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6709     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6710     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6711   %}
 6712   ins_pipe( pipe_slow );
 6713 %}
 6714 
 6715 //----------------------------- CompressBits/ExpandBits ------------------------
 6716 
 6717 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6718   predicate(n->bottom_type()->isa_int());
 6719   match(Set dst (CompressBits src mask));
 6720   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6721   ins_encode %{
 6722     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6723   %}
 6724   ins_pipe( pipe_slow );
 6725 %}
 6726 
 6727 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6728   predicate(n->bottom_type()->isa_int());
 6729   match(Set dst (ExpandBits src mask));
 6730   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6731   ins_encode %{
 6732     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6733   %}
 6734   ins_pipe( pipe_slow );
 6735 %}
 6736 
 6737 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6738   predicate(n->bottom_type()->isa_int());
 6739   match(Set dst (CompressBits src (LoadI mask)));
 6740   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6741   ins_encode %{
 6742     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6743   %}
 6744   ins_pipe( pipe_slow );
 6745 %}
 6746 
 6747 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6748   predicate(n->bottom_type()->isa_int());
 6749   match(Set dst (ExpandBits src (LoadI mask)));
 6750   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6751   ins_encode %{
 6752     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6753   %}
 6754   ins_pipe( pipe_slow );
 6755 %}
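
// For reference, the scalar semantics of pext (parallel bit extract, used for
// CompressBits) and pdep (parallel bit deposit, used for ExpandBits), as a
// C++ sketch (illustration only):
//
//   #include <cstdint>
//   static uint32_t pext32(uint32_t src, uint32_t mask) {
//     uint32_t res = 0;
//     for (int i = 0, j = 0; i < 32; i++)
//       if (mask & (1u << i)) { if (src & (1u << i)) res |= 1u << j; j++; }
//     return res;
//   }
//   static uint32_t pdep32(uint32_t src, uint32_t mask) {
//     uint32_t res = 0;
//     for (int i = 0, j = 0; i < 32; i++)
//       if (mask & (1u << i)) { if (src & (1u << j)) res |= 1u << i; j++; }
//     return res;
//   }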
 6756 
 6757 // --------------------------------- Sqrt --------------------------------------
 6758 
 6759 instruct vsqrtF_reg(vec dst, vec src) %{
 6760   match(Set dst (SqrtVF src));
 6761   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6762   ins_encode %{
 6763     assert(UseAVX > 0, "required");
 6764     int vlen_enc = vector_length_encoding(this);
 6765     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6766   %}
 6767   ins_pipe( pipe_slow );
 6768 %}
 6769 
 6770 instruct vsqrtF_mem(vec dst, memory mem) %{
 6771   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6772   match(Set dst (SqrtVF (LoadVector mem)));
 6773   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6774   ins_encode %{
 6775     assert(UseAVX > 0, "required");
 6776     int vlen_enc = vector_length_encoding(this);
 6777     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6778   %}
 6779   ins_pipe( pipe_slow );
 6780 %}
 6781 
 6782 // Floating point vector sqrt
 6783 instruct vsqrtD_reg(vec dst, vec src) %{
 6784   match(Set dst (SqrtVD src));
 6785   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6786   ins_encode %{
 6787     assert(UseAVX > 0, "required");
 6788     int vlen_enc = vector_length_encoding(this);
 6789     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6790   %}
 6791   ins_pipe( pipe_slow );
 6792 %}
 6793 
 6794 instruct vsqrtD_mem(vec dst, memory mem) %{
 6795   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6796   match(Set dst (SqrtVD (LoadVector mem)));
 6797   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6798   ins_encode %{
 6799     assert(UseAVX > 0, "required");
 6800     int vlen_enc = vector_length_encoding(this);
 6801     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6802   %}
 6803   ins_pipe( pipe_slow );
 6804 %}
 6805 
 6806 // ------------------------------ Shift ---------------------------------------
 6807 
 6808 // Left and right shift count vectors are the same on x86
 6809 // (only lowest bits of xmm reg are used for count).
 6810 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6811   match(Set dst (LShiftCntV cnt));
 6812   match(Set dst (RShiftCntV cnt));
 6813   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6814   ins_encode %{
 6815     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6816   %}
 6817   ins_pipe( pipe_slow );
 6818 %}
 6819 
 6820 // Byte vector shift
 6821 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6822   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6823   match(Set dst ( LShiftVB src shift));
 6824   match(Set dst ( RShiftVB src shift));
 6825   match(Set dst (URShiftVB src shift));
 6826   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6827   format %{"vector_byte_shift $dst,$src,$shift" %}
 6828   ins_encode %{
 6829     assert(UseSSE > 3, "required");
 6830     int opcode = this->ideal_Opcode();
 6831     bool sign = (opcode != Op_URShiftVB);
 6832     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6833     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6834     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6835     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6836     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6837   %}
 6838   ins_pipe( pipe_slow );
 6839 %}
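
// The byte shift rules here (8/16/32/64-byte variants) all use the same
// scheme, since x86 has no packed byte shift: widen each byte to a 16-bit
// word (sign- or zero-extended depending on the shift kind), shift the words,
// then keep only the low byte of every word when packing back down. A scalar
// sketch for the logical right shift case (illustration only):
//
//   static uint8_t urshift_byte(uint8_t v, int s) {
//     uint16_t w = v;                    // vextendbw (zero-extend for URShiftVB)
//     uint16_t r = (uint16_t)(w >> s);   // vshiftw
//     return (uint8_t)(r & 0xFF);        // vector_short_to_byte_mask + packuswb
//   }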
 6840 
 6841 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6842   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6843             UseAVX <= 1);
 6844   match(Set dst ( LShiftVB src shift));
 6845   match(Set dst ( RShiftVB src shift));
 6846   match(Set dst (URShiftVB src shift));
 6847   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6848   format %{"vector_byte_shift $dst,$src,$shift" %}
 6849   ins_encode %{
 6850     assert(UseSSE > 3, "required");
 6851     int opcode = this->ideal_Opcode();
 6852     bool sign = (opcode != Op_URShiftVB);
 6853     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6854     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6855     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6856     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6857     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6858     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6859     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6860     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6861     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6862   %}
 6863   ins_pipe( pipe_slow );
 6864 %}
 6865 
 6866 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6867   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6868             UseAVX > 1);
 6869   match(Set dst ( LShiftVB src shift));
 6870   match(Set dst ( RShiftVB src shift));
 6871   match(Set dst (URShiftVB src shift));
 6872   effect(TEMP dst, TEMP tmp);
 6873   format %{"vector_byte_shift $dst,$src,$shift" %}
 6874   ins_encode %{
 6875     int opcode = this->ideal_Opcode();
 6876     bool sign = (opcode != Op_URShiftVB);
 6877     int vlen_enc = Assembler::AVX_256bit;
 6878     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6879     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6880     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6881     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6882     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6883   %}
 6884   ins_pipe( pipe_slow );
 6885 %}
 6886 
 6887 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6888   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6889   match(Set dst ( LShiftVB src shift));
 6890   match(Set dst ( RShiftVB src shift));
 6891   match(Set dst (URShiftVB src shift));
 6892   effect(TEMP dst, TEMP tmp);
 6893   format %{"vector_byte_shift $dst,$src,$shift" %}
 6894   ins_encode %{
 6895     assert(UseAVX > 1, "required");
 6896     int opcode = this->ideal_Opcode();
 6897     bool sign = (opcode != Op_URShiftVB);
 6898     int vlen_enc = Assembler::AVX_256bit;
 6899     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6900     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6901     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6902     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6903     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6904     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6905     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6906     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6907     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6908   %}
 6909   ins_pipe( pipe_slow );
 6910 %}
 6911 
 6912 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6913   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6914   match(Set dst ( LShiftVB src shift));
 6915   match(Set dst  (RShiftVB src shift));
 6916   match(Set dst (URShiftVB src shift));
 6917   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6918   format %{"vector_byte_shift $dst,$src,$shift" %}
 6919   ins_encode %{
 6920     assert(UseAVX > 2, "required");
 6921     int opcode = this->ideal_Opcode();
 6922     bool sign = (opcode != Op_URShiftVB);
 6923     int vlen_enc = Assembler::AVX_512bit;
 6924     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6925     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6926     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6927     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6928     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6929     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6930     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6931     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6932     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6933     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6934     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6935     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6936   %}
 6937   ins_pipe( pipe_slow );
 6938 %}
 6939 
 6940 // A packed logical right shift on shorts produces an incorrect Java result
 6941 // for negative data, because Java converts the short value to an int with
 6942 // sign extension before shifting (see the scalar sketch below). Char vectors
 6943 // are fine, since chars are unsigned values.
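//
// A scalar illustration of the discrepancy (C++ mimicking the Java promotion,
// sketch only):
//
//   int16_t s      = -1;                                    // 0xFFFF
//   int16_t java_r = (int16_t)((uint32_t)(int32_t)s >> 2);  // Java (short)(s >>> 2) == -1
//   int16_t lane_r = (int16_t)((uint16_t)s >> 2);           // packed 16-bit shift: 0x3FFF
//   // The two disagree for negative shorts; for chars (zero-extended) they agree.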
 6944 // Shorts/Chars vector left shift
 6945 instruct vshiftS(vec dst, vec src, vec shift) %{
 6946   predicate(!n->as_ShiftV()->is_var_shift());
 6947   match(Set dst ( LShiftVS src shift));
 6948   match(Set dst ( RShiftVS src shift));
 6949   match(Set dst (URShiftVS src shift));
 6950   effect(TEMP dst, USE src, USE shift);
 6951   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6952   ins_encode %{
 6953     int opcode = this->ideal_Opcode();
 6954     if (UseAVX > 0) {
 6955       int vlen_enc = vector_length_encoding(this);
 6956       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6957     } else {
 6958       int vlen = Matcher::vector_length(this);
 6959       if (vlen == 2) {
 6960         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6961         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6962       } else if (vlen == 4) {
 6963         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6964         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6965       } else {
 6966         assert (vlen == 8, "sanity");
 6967         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6968         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6969       }
 6970     }
 6971   %}
 6972   ins_pipe( pipe_slow );
 6973 %}
 6974 
 6975 // Integers vector left shift
 6976 instruct vshiftI(vec dst, vec src, vec shift) %{
 6977   predicate(!n->as_ShiftV()->is_var_shift());
 6978   match(Set dst ( LShiftVI src shift));
 6979   match(Set dst ( RShiftVI src shift));
 6980   match(Set dst (URShiftVI src shift));
 6981   effect(TEMP dst, USE src, USE shift);
 6982   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6983   ins_encode %{
 6984     int opcode = this->ideal_Opcode();
 6985     if (UseAVX > 0) {
 6986       int vlen_enc = vector_length_encoding(this);
 6987       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6988     } else {
 6989       int vlen = Matcher::vector_length(this);
 6990       if (vlen == 2) {
 6991         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6992         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6993       } else {
 6994         assert(vlen == 4, "sanity");
 6995         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6996         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6997       }
 6998     }
 6999   %}
 7000   ins_pipe( pipe_slow );
 7001 %}
 7002 
 7003 // Integers vector left constant shift
 7004 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 7005   match(Set dst (LShiftVI src (LShiftCntV shift)));
 7006   match(Set dst (RShiftVI src (RShiftCntV shift)));
 7007   match(Set dst (URShiftVI src (RShiftCntV shift)));
 7008   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 7009   ins_encode %{
 7010     int opcode = this->ideal_Opcode();
 7011     if (UseAVX > 0) {
 7012       int vector_len = vector_length_encoding(this);
 7013       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7014     } else {
 7015       int vlen = Matcher::vector_length(this);
 7016       if (vlen == 2) {
 7017         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7018         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7019       } else {
 7020         assert(vlen == 4, "sanity");
 7021         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7022         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7023       }
 7024     }
 7025   %}
 7026   ins_pipe( pipe_slow );
 7027 %}
 7028 
 7029 // Longs vector shift
 7030 instruct vshiftL(vec dst, vec src, vec shift) %{
 7031   predicate(!n->as_ShiftV()->is_var_shift());
 7032   match(Set dst ( LShiftVL src shift));
 7033   match(Set dst (URShiftVL src shift));
 7034   effect(TEMP dst, USE src, USE shift);
 7035   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 7036   ins_encode %{
 7037     int opcode = this->ideal_Opcode();
 7038     if (UseAVX > 0) {
 7039       int vlen_enc = vector_length_encoding(this);
 7040       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7041     } else {
 7042       assert(Matcher::vector_length(this) == 2, "");
 7043       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7044       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7045     }
 7046   %}
 7047   ins_pipe( pipe_slow );
 7048 %}
 7049 
 7050 // Longs vector constant shift
 7051 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 7052   match(Set dst (LShiftVL src (LShiftCntV shift)));
 7053   match(Set dst (URShiftVL src (RShiftCntV shift)));
 7054   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 7055   ins_encode %{
 7056     int opcode = this->ideal_Opcode();
 7057     if (UseAVX > 0) {
 7058       int vector_len = vector_length_encoding(this);
 7059       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7060     } else {
 7061       assert(Matcher::vector_length(this) == 2, "");
 7062       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7063       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7064     }
 7065   %}
 7066   ins_pipe( pipe_slow );
 7067 %}
 7068 
 7069 // -------------------ArithmeticRightShift -----------------------------------
 7070 // Long vector arithmetic right shift
 7071 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 7072   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 7073   match(Set dst (RShiftVL src shift));
 7074   effect(TEMP dst, TEMP tmp);
 7075   format %{ "vshiftq $dst,$src,$shift" %}
 7076   ins_encode %{
 7077     uint vlen = Matcher::vector_length(this);
 7078     if (vlen == 2) {
 7079       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7080       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7081       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7082       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7083       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7084       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7085     } else {
 7086       assert(vlen == 4, "sanity");
 7087       assert(UseAVX > 1, "required");
 7088       int vlen_enc = Assembler::AVX_256bit;
 7089       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7090       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7091       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7092       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7093       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7094     }
 7095   %}
 7096   ins_pipe( pipe_slow );
 7097 %}
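
// Before AVX-512 there is no packed 64-bit arithmetic right shift, so the rule
// above synthesizes it from logical shifts using the identity
// sra(x, s) == ((x >>> s) ^ t) - t, where t = (0x8000000000000000 >>> s).
// A scalar sketch (illustration only):
//
//   #include <cstdint>
//   static int64_t sra64(int64_t x, int s) {
//     uint64_t t = 0x8000000000000000ull >> s;   // vector_long_sign_mask >>> s
//     uint64_t u = (uint64_t)x >> s;             // psrlq / vpsrlq
//     return (int64_t)((u ^ t) - t);             // pxor + psubq restore the sign bits
//   }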
 7098 
 7099 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7100   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7101   match(Set dst (RShiftVL src shift));
 7102   format %{ "vshiftq $dst,$src,$shift" %}
 7103   ins_encode %{
 7104     int vlen_enc = vector_length_encoding(this);
 7105     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7106   %}
 7107   ins_pipe( pipe_slow );
 7108 %}
 7109 
 7110 // ------------------- Variable Shift -----------------------------
 7111 // Byte variable shift
 7112 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7113   predicate(Matcher::vector_length(n) <= 8 &&
 7114             n->as_ShiftV()->is_var_shift() &&
 7115             !VM_Version::supports_avx512bw());
 7116   match(Set dst ( LShiftVB src shift));
 7117   match(Set dst ( RShiftVB src shift));
 7118   match(Set dst (URShiftVB src shift));
 7119   effect(TEMP dst, TEMP vtmp);
 7120   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7121   ins_encode %{
 7122     assert(UseAVX >= 2, "required");
 7123 
 7124     int opcode = this->ideal_Opcode();
 7125     int vlen_enc = Assembler::AVX_128bit;
 7126     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7127     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7128   %}
 7129   ins_pipe( pipe_slow );
 7130 %}
 7131 
 7132 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7133   predicate(Matcher::vector_length(n) == 16 &&
 7134             n->as_ShiftV()->is_var_shift() &&
 7135             !VM_Version::supports_avx512bw());
 7136   match(Set dst ( LShiftVB src shift));
 7137   match(Set dst ( RShiftVB src shift));
 7138   match(Set dst (URShiftVB src shift));
 7139   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7140   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7141   ins_encode %{
 7142     assert(UseAVX >= 2, "required");
 7143 
 7144     int opcode = this->ideal_Opcode();
 7145     int vlen_enc = Assembler::AVX_128bit;
 7146     // Shift lower half and get word result in dst
 7147     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7148 
 7149     // Shift upper half and get word result in vtmp1
 7150     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7151     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7152     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7153 
 7154     // Merge and down convert the two word results to byte in dst
 7155     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7156   %}
 7157   ins_pipe( pipe_slow );
 7158 %}
 7159 
 7160 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7161   predicate(Matcher::vector_length(n) == 32 &&
 7162             n->as_ShiftV()->is_var_shift() &&
 7163             !VM_Version::supports_avx512bw());
 7164   match(Set dst ( LShiftVB src shift));
 7165   match(Set dst ( RShiftVB src shift));
 7166   match(Set dst (URShiftVB src shift));
 7167   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7168   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7169   ins_encode %{
 7170     assert(UseAVX >= 2, "required");
 7171 
 7172     int opcode = this->ideal_Opcode();
 7173     int vlen_enc = Assembler::AVX_128bit;
 7174     // Process lower 128 bits and get result in dst
 7175     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7176     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7177     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7178     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7179     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7180 
 7181     // Process higher 128 bits and get result in vtmp3
 7182     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7183     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7184     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7185     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7186     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7187     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7188     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7189 
 7190     // Merge the two results in dst
 7191     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7192   %}
 7193   ins_pipe( pipe_slow );
 7194 %}
 7195 
 7196 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7197   predicate(Matcher::vector_length(n) <= 32 &&
 7198             n->as_ShiftV()->is_var_shift() &&
 7199             VM_Version::supports_avx512bw());
 7200   match(Set dst ( LShiftVB src shift));
 7201   match(Set dst ( RShiftVB src shift));
 7202   match(Set dst (URShiftVB src shift));
 7203   effect(TEMP dst, TEMP vtmp);
 7204   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7205   ins_encode %{
 7206     assert(UseAVX > 2, "required");
 7207 
 7208     int opcode = this->ideal_Opcode();
 7209     int vlen_enc = vector_length_encoding(this);
 7210     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7211   %}
 7212   ins_pipe( pipe_slow );
 7213 %}
 7214 
 7215 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7216   predicate(Matcher::vector_length(n) == 64 &&
 7217             n->as_ShiftV()->is_var_shift() &&
 7218             VM_Version::supports_avx512bw());
 7219   match(Set dst ( LShiftVB src shift));
 7220   match(Set dst ( RShiftVB src shift));
 7221   match(Set dst (URShiftVB src shift));
 7222   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7223   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7224   ins_encode %{
 7225     assert(UseAVX > 2, "required");
 7226 
 7227     int opcode = this->ideal_Opcode();
 7228     int vlen_enc = Assembler::AVX_256bit;
 7229     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7230     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7231     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7232     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7233     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7234   %}
 7235   ins_pipe( pipe_slow );
 7236 %}
 7237 
 7238 // Short variable shift
 7239 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7240   predicate(Matcher::vector_length(n) <= 8 &&
 7241             n->as_ShiftV()->is_var_shift() &&
 7242             !VM_Version::supports_avx512bw());
 7243   match(Set dst ( LShiftVS src shift));
 7244   match(Set dst ( RShiftVS src shift));
 7245   match(Set dst (URShiftVS src shift));
 7246   effect(TEMP dst, TEMP vtmp);
 7247   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp as TEMP" %}
 7248   ins_encode %{
 7249     assert(UseAVX >= 2, "required");
 7250 
 7251     int opcode = this->ideal_Opcode();
 7252     bool sign = (opcode != Op_URShiftVS);
 7253     int vlen_enc = Assembler::AVX_256bit;
 7254     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7255     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7256     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7257     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7258     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7259     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7260   %}
 7261   ins_pipe( pipe_slow );
 7262 %}
 7263 
 7264 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7265   predicate(Matcher::vector_length(n) == 16 &&
 7266             n->as_ShiftV()->is_var_shift() &&
 7267             !VM_Version::supports_avx512bw());
 7268   match(Set dst ( LShiftVS src shift));
 7269   match(Set dst ( RShiftVS src shift));
 7270   match(Set dst (URShiftVS src shift));
 7271   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7272   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp1, $vtmp2 as TEMP" %}
 7273   ins_encode %{
 7274     assert(UseAVX >= 2, "required");
 7275 
 7276     int opcode = this->ideal_Opcode();
 7277     bool sign = (opcode != Op_URShiftVS);
 7278     int vlen_enc = Assembler::AVX_256bit;
 7279     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7280     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7281     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7282     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7283     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7284 
 7285     // Shift upper half, with result in dst using vtmp1 as TEMP
 7286     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7287     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7288     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7289     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7290     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7291     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7292 
 7293     // Merge lower and upper half result into dst
 7294     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7295     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7296   %}
 7297   ins_pipe( pipe_slow );
 7298 %}
 7299 
 7300 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7301   predicate(n->as_ShiftV()->is_var_shift() &&
 7302             VM_Version::supports_avx512bw());
 7303   match(Set dst ( LShiftVS src shift));
 7304   match(Set dst ( RShiftVS src shift));
 7305   match(Set dst (URShiftVS src shift));
 7306   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7307   ins_encode %{
 7308     assert(UseAVX > 2, "required");
 7309 
 7310     int opcode = this->ideal_Opcode();
 7311     int vlen_enc = vector_length_encoding(this);
 7312     if (!VM_Version::supports_avx512vl()) {
 7313       vlen_enc = Assembler::AVX_512bit;
 7314     }
 7315     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7316   %}
 7317   ins_pipe( pipe_slow );
 7318 %}
 7319 
 7320 //Integer variable shift
 7321 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7322   predicate(n->as_ShiftV()->is_var_shift());
 7323   match(Set dst ( LShiftVI src shift));
 7324   match(Set dst ( RShiftVI src shift));
 7325   match(Set dst (URShiftVI src shift));
 7326   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7327   ins_encode %{
 7328     assert(UseAVX >= 2, "required");
 7329 
 7330     int opcode = this->ideal_Opcode();
 7331     int vlen_enc = vector_length_encoding(this);
 7332     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7333   %}
 7334   ins_pipe( pipe_slow );
 7335 %}
 7336 
 7337 //Long variable shift
 7338 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7339   predicate(n->as_ShiftV()->is_var_shift());
 7340   match(Set dst ( LShiftVL src shift));
 7341   match(Set dst (URShiftVL src shift));
 7342   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7343   ins_encode %{
 7344     assert(UseAVX >= 2, "required");
 7345 
 7346     int opcode = this->ideal_Opcode();
 7347     int vlen_enc = vector_length_encoding(this);
 7348     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7349   %}
 7350   ins_pipe( pipe_slow );
 7351 %}
 7352 
 7353 //Long variable right shift arithmetic
 7354 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7355   predicate(Matcher::vector_length(n) <= 4 &&
 7356             n->as_ShiftV()->is_var_shift() &&
 7357             UseAVX == 2);
 7358   match(Set dst (RShiftVL src shift));
 7359   effect(TEMP dst, TEMP vtmp);
 7360   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7361   ins_encode %{
 7362     int opcode = this->ideal_Opcode();
 7363     int vlen_enc = vector_length_encoding(this);
 7364     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7365                  $vtmp$$XMMRegister);
 7366   %}
 7367   ins_pipe( pipe_slow );
 7368 %}
 7369 
 7370 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7371   predicate(n->as_ShiftV()->is_var_shift() &&
 7372             UseAVX > 2);
 7373   match(Set dst (RShiftVL src shift));
 7374   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7375   ins_encode %{
 7376     int opcode = this->ideal_Opcode();
 7377     int vlen_enc = vector_length_encoding(this);
 7378     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7379   %}
 7380   ins_pipe( pipe_slow );
 7381 %}
 7382 
 7383 // --------------------------------- AND --------------------------------------
 7384 
 7385 instruct vand(vec dst, vec src) %{
 7386   predicate(UseAVX == 0);
 7387   match(Set dst (AndV dst src));
 7388   format %{ "pand    $dst,$src\t! and vectors" %}
 7389   ins_encode %{
 7390     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7391   %}
 7392   ins_pipe( pipe_slow );
 7393 %}
 7394 
 7395 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7396   predicate(UseAVX > 0);
 7397   match(Set dst (AndV src1 src2));
 7398   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7399   ins_encode %{
 7400     int vlen_enc = vector_length_encoding(this);
 7401     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7402   %}
 7403   ins_pipe( pipe_slow );
 7404 %}
 7405 
 7406 instruct vand_mem(vec dst, vec src, memory mem) %{
 7407   predicate((UseAVX > 0) &&
 7408             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7409   match(Set dst (AndV src (LoadVector mem)));
 7410   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7411   ins_encode %{
 7412     int vlen_enc = vector_length_encoding(this);
 7413     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7414   %}
 7415   ins_pipe( pipe_slow );
 7416 %}
 7417 
 7418 // --------------------------------- OR ---------------------------------------
 7419 
 7420 instruct vor(vec dst, vec src) %{
 7421   predicate(UseAVX == 0);
 7422   match(Set dst (OrV dst src));
 7423   format %{ "por     $dst,$src\t! or vectors" %}
 7424   ins_encode %{
 7425     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7426   %}
 7427   ins_pipe( pipe_slow );
 7428 %}
 7429 
 7430 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7431   predicate(UseAVX > 0);
 7432   match(Set dst (OrV src1 src2));
 7433   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7434   ins_encode %{
 7435     int vlen_enc = vector_length_encoding(this);
 7436     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7437   %}
 7438   ins_pipe( pipe_slow );
 7439 %}
 7440 
 7441 instruct vor_mem(vec dst, vec src, memory mem) %{
 7442   predicate((UseAVX > 0) &&
 7443             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7444   match(Set dst (OrV src (LoadVector mem)));
 7445   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7446   ins_encode %{
 7447     int vlen_enc = vector_length_encoding(this);
 7448     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7449   %}
 7450   ins_pipe( pipe_slow );
 7451 %}
 7452 
 7453 // --------------------------------- XOR --------------------------------------
 7454 
 7455 instruct vxor(vec dst, vec src) %{
 7456   predicate(UseAVX == 0);
 7457   match(Set dst (XorV dst src));
 7458   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7459   ins_encode %{
 7460     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7461   %}
 7462   ins_pipe( pipe_slow );
 7463 %}
 7464 
 7465 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7466   predicate(UseAVX > 0);
 7467   match(Set dst (XorV src1 src2));
 7468   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7469   ins_encode %{
 7470     int vlen_enc = vector_length_encoding(this);
 7471     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7472   %}
 7473   ins_pipe( pipe_slow );
 7474 %}
 7475 
 7476 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7477   predicate((UseAVX > 0) &&
 7478             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7479   match(Set dst (XorV src (LoadVector mem)));
 7480   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7481   ins_encode %{
 7482     int vlen_enc = vector_length_encoding(this);
 7483     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7484   %}
 7485   ins_pipe( pipe_slow );
 7486 %}
 7487 
 7488 // --------------------------------- VectorCast --------------------------------------
 7489 
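      // Casts between vector element types. On targets without AVX512VL the
      // narrowing casts mask off the upper bits of each lane and pack the lanes
      // down (vpand + vpackus*), while EVEX-capable targets use the
      // down-converting evpmov* instructions. Widening casts sign-extend with
      // vpmovsx* and use the integer-to-FP converts for float/double results.
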
 7490 instruct vcastBtoX(vec dst, vec src) %{
 7491   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7492   match(Set dst (VectorCastB2X src));
 7493   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7494   ins_encode %{
 7495     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7496     int vlen_enc = vector_length_encoding(this);
 7497     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7498   %}
 7499   ins_pipe( pipe_slow );
 7500 %}
 7501 
 7502 instruct vcastBtoD(legVec dst, legVec src) %{
 7503   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7504   match(Set dst (VectorCastB2X src));
 7505   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7506   ins_encode %{
 7507     int vlen_enc = vector_length_encoding(this);
 7508     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7509   %}
 7510   ins_pipe( pipe_slow );
 7511 %}
 7512 
 7513 instruct castStoX(vec dst, vec src) %{
 7514   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7515             Matcher::vector_length(n->in(1)) <= 8 && // src
 7516             Matcher::vector_element_basic_type(n) == T_BYTE);
 7517   match(Set dst (VectorCastS2X src));
 7518   format %{ "vector_cast_s2x $dst,$src" %}
 7519   ins_encode %{
 7520     assert(UseAVX > 0, "required");
 7521 
 7522     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7523     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7524   %}
 7525   ins_pipe( pipe_slow );
 7526 %}
 7527 
 7528 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7529   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7530             Matcher::vector_length(n->in(1)) == 16 && // src
 7531             Matcher::vector_element_basic_type(n) == T_BYTE);
 7532   effect(TEMP dst, TEMP vtmp);
 7533   match(Set dst (VectorCastS2X src));
 7534   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7535   ins_encode %{
 7536     assert(UseAVX > 0, "required");
 7537 
 7538     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7539     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7540     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7541     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7542   %}
 7543   ins_pipe( pipe_slow );
 7544 %}
 7545 
 7546 instruct vcastStoX_evex(vec dst, vec src) %{
 7547   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7548             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7549   match(Set dst (VectorCastS2X src));
 7550   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7551   ins_encode %{
 7552     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7553     int src_vlen_enc = vector_length_encoding(this, $src);
 7554     int vlen_enc = vector_length_encoding(this);
 7555     switch (to_elem_bt) {
 7556       case T_BYTE:
 7557         if (!VM_Version::supports_avx512vl()) {
 7558           vlen_enc = Assembler::AVX_512bit;
 7559         }
 7560         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7561         break;
 7562       case T_INT:
 7563         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7564         break;
 7565       case T_FLOAT:
 7566         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7567         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7568         break;
 7569       case T_LONG:
 7570         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7571         break;
 7572       case T_DOUBLE: {
 7573         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7574         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7575         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7576         break;
 7577       }
 7578       default:
 7579         ShouldNotReachHere();
 7580     }
 7581   %}
 7582   ins_pipe( pipe_slow );
 7583 %}
 7584 
 7585 instruct castItoX(vec dst, vec src) %{
 7586   predicate(UseAVX <= 2 &&
 7587             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7588             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7589   match(Set dst (VectorCastI2X src));
 7590   format %{ "vector_cast_i2x $dst,$src" %}
 7591   ins_encode %{
 7592     assert(UseAVX > 0, "required");
 7593 
 7594     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7595     int vlen_enc = vector_length_encoding(this, $src);
 7596 
 7597     if (to_elem_bt == T_BYTE) {
 7598       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7599       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7600       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7601     } else {
 7602       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7603       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7604       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7605     }
 7606   %}
 7607   ins_pipe( pipe_slow );
 7608 %}
 7609 
 7610 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7611   predicate(UseAVX <= 2 &&
 7612             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7613             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7614   match(Set dst (VectorCastI2X src));
 7615   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7616   effect(TEMP dst, TEMP vtmp);
 7617   ins_encode %{
 7618     assert(UseAVX > 0, "required");
 7619 
 7620     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7621     int vlen_enc = vector_length_encoding(this, $src);
 7622 
 7623     if (to_elem_bt == T_BYTE) {
 7624       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7625       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7626       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7627       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7628     } else {
 7629       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7630       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7631       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7632       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7633     }
 7634   %}
 7635   ins_pipe( pipe_slow );
 7636 %}
 7637 
 7638 instruct vcastItoX_evex(vec dst, vec src) %{
 7639   predicate(UseAVX > 2 ||
 7640             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7641   match(Set dst (VectorCastI2X src));
 7642   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7643   ins_encode %{
 7644     assert(UseAVX > 0, "required");
 7645 
 7646     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7647     int src_vlen_enc = vector_length_encoding(this, $src);
 7648     int dst_vlen_enc = vector_length_encoding(this);
 7649     switch (dst_elem_bt) {
 7650       case T_BYTE:
 7651         if (!VM_Version::supports_avx512vl()) {
 7652           src_vlen_enc = Assembler::AVX_512bit;
 7653         }
 7654         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7655         break;
 7656       case T_SHORT:
 7657         if (!VM_Version::supports_avx512vl()) {
 7658           src_vlen_enc = Assembler::AVX_512bit;
 7659         }
 7660         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7661         break;
 7662       case T_FLOAT:
 7663         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7664         break;
 7665       case T_LONG:
 7666         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7667         break;
 7668       case T_DOUBLE:
 7669         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7670         break;
 7671       default:
 7672         ShouldNotReachHere();
 7673     }
 7674   %}
 7675   ins_pipe( pipe_slow );
 7676 %}
 7677 
 7678 instruct vcastLtoBS(vec dst, vec src) %{
 7679   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7680             UseAVX <= 2);
 7681   match(Set dst (VectorCastL2X src));
 7682   format %{ "vector_cast_l2x  $dst,$src" %}
 7683   ins_encode %{
 7684     assert(UseAVX > 0, "required");
 7685 
 7686     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7687     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7688     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7689                                                       : ExternalAddress(vector_int_to_short_mask());
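          // Gather the low 32 bits of each long into the low lanes, mask to the
          // target element width, and pack down to short (and to byte below).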
 7690     if (vlen <= 16) {
 7691       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7692       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7693       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7694     } else {
 7695       assert(vlen <= 32, "required");
 7696       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7697       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7698       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7699       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7700     }
 7701     if (to_elem_bt == T_BYTE) {
 7702       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7703     }
 7704   %}
 7705   ins_pipe( pipe_slow );
 7706 %}
 7707 
 7708 instruct vcastLtoX_evex(vec dst, vec src) %{
 7709   predicate(UseAVX > 2 ||
 7710             (Matcher::vector_element_basic_type(n) == T_INT ||
 7711              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7712              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7713   match(Set dst (VectorCastL2X src));
 7714   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7715   ins_encode %{
 7716     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7717     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7718     int vlen_enc = vector_length_encoding(this, $src);
 7719     switch (to_elem_bt) {
 7720       case T_BYTE:
 7721         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7722           vlen_enc = Assembler::AVX_512bit;
 7723         }
 7724         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7725         break;
 7726       case T_SHORT:
 7727         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7728           vlen_enc = Assembler::AVX_512bit;
 7729         }
 7730         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7731         break;
 7732       case T_INT:
 7733         if (vlen == 8) {
 7734           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7735             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7736           }
 7737         } else if (vlen == 16) {
 7738           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7739         } else if (vlen == 32) {
 7740           if (UseAVX > 2) {
 7741             if (!VM_Version::supports_avx512vl()) {
 7742               vlen_enc = Assembler::AVX_512bit;
 7743             }
 7744             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7745           } else {
 7746             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7747             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7748           }
 7749         } else { // vlen == 64
 7750           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7751         }
 7752         break;
 7753       case T_FLOAT:
 7754         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7755         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7756         break;
 7757       case T_DOUBLE:
 7758         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7759         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7760         break;
 7761 
 7762       default: assert(false, "%s", type2name(to_elem_bt));
 7763     }
 7764   %}
 7765   ins_pipe( pipe_slow );
 7766 %}
 7767 
 7768 instruct vcastFtoD_reg(vec dst, vec src) %{
 7769   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7770   match(Set dst (VectorCastF2X src));
 7771   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7772   ins_encode %{
 7773     int vlen_enc = vector_length_encoding(this);
 7774     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7775   %}
 7776   ins_pipe( pipe_slow );
 7777 %}
 7778 
 7779 
 7780 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7781   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7782             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7783   match(Set dst (VectorCastF2X src));
 7784   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7785   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7786   ins_encode %{
 7787     int vlen_enc = vector_length_encoding(this, $src);
 7788     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7789     // JDK-8292878 removed the need for an explicit scratch register to load addresses
 7790     // wider than 32 bits for register-indirect addressing, since stub constants live in
 7791     // the code cache and ReservedCodeCacheSize is currently capped at 2G. Targets are
 7792     // free to raise that limit, but a code cache larger than 2G looks unreasonable in
 7793     // practice. On the upside, with the given cap we save a temporary register
 7794     // allocation, which in the limiting case can prevent spilling in blocks with high
 7795     // register pressure.
 7796     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7797                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7798                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7799   %}
 7800   ins_pipe( pipe_slow );
 7801 %}
 7802 
 7803 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7804   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7805             is_integral_type(Matcher::vector_element_basic_type(n)));
 7806   match(Set dst (VectorCastF2X src));
 7807   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7808   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7809   ins_encode %{
 7810     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7811     if (to_elem_bt == T_LONG) {
 7812       int vlen_enc = vector_length_encoding(this);
 7813       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7814                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7815                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7816     } else {
 7817       int vlen_enc = vector_length_encoding(this, $src);
 7818       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7819                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7820                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7821     }
 7822   %}
 7823   ins_pipe( pipe_slow );
 7824 %}
 7825 
 7826 instruct vcastDtoF_reg(vec dst, vec src) %{
 7827   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7828   match(Set dst (VectorCastD2X src));
 7829   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7830   ins_encode %{
 7831     int vlen_enc = vector_length_encoding(this, $src);
 7832     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7833   %}
 7834   ins_pipe( pipe_slow );
 7835 %}
 7836 
 7837 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7838   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7839             is_integral_type(Matcher::vector_element_basic_type(n)));
 7840   match(Set dst (VectorCastD2X src));
 7841   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7842   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7843   ins_encode %{
 7844     int vlen_enc = vector_length_encoding(this, $src);
 7845     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7846     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7847                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7848                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7849   %}
 7850   ins_pipe( pipe_slow );
 7851 %}
 7852 
 7853 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7854   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7855             is_integral_type(Matcher::vector_element_basic_type(n)));
 7856   match(Set dst (VectorCastD2X src));
 7857   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7858   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7859   ins_encode %{
 7860     int vlen_enc = vector_length_encoding(this, $src);
 7861     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7862     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7863                               ExternalAddress(vector_float_signflip());
 7864     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7865                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7866   %}
 7867   ins_pipe( pipe_slow );
 7868 %}
 7869 
 7870 instruct vucast(vec dst, vec src) %{
 7871   match(Set dst (VectorUCastB2X src));
 7872   match(Set dst (VectorUCastS2X src));
 7873   match(Set dst (VectorUCastI2X src));
 7874   format %{ "vector_ucast $dst,$src\t!" %}
 7875   ins_encode %{
 7876     assert(UseAVX > 0, "required");
 7877 
 7878     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7879     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7880     int vlen_enc = vector_length_encoding(this);
 7881     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7882   %}
 7883   ins_pipe( pipe_slow );
 7884 %}
 7885 
 7886 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7887   predicate(!VM_Version::supports_avx512vl() &&
 7888             Matcher::vector_length_in_bytes(n) < 64 &&
 7889             Matcher::vector_element_basic_type(n) == T_INT);
 7890   match(Set dst (RoundVF src));
 7891   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7892   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7893   ins_encode %{
 7894     int vlen_enc = vector_length_encoding(this);
 7895     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7896     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7897                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7898                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7899   %}
 7900   ins_pipe( pipe_slow );
 7901 %}
 7902 
 7903 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7904   predicate((VM_Version::supports_avx512vl() ||
 7905              Matcher::vector_length_in_bytes(n) == 64) &&
 7906              Matcher::vector_element_basic_type(n) == T_INT);
 7907   match(Set dst (RoundVF src));
 7908   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7909   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7910   ins_encode %{
 7911     int vlen_enc = vector_length_encoding(this);
 7912     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7913     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7914                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7915                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7916   %}
 7917   ins_pipe( pipe_slow );
 7918 %}
 7919 
 7920 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7921   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7922   match(Set dst (RoundVD src));
 7923   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7924   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7925   ins_encode %{
 7926     int vlen_enc = vector_length_encoding(this);
 7927     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7928     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7929                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7930                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7931   %}
 7932   ins_pipe( pipe_slow );
 7933 %}
 7934 
 7935 // --------------------------------- VectorMaskCmp --------------------------------------
 7936 
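      // Vector compares produce either a vector with each lane set to all-ones
      // or all-zeroes, or, when the node's bottom type is a vector mask, a k
      // register. On pre-AVX512 targets the integer compares are built from
      // vpcmpeq/vpcmpgt (vpcmpCCW); the complementary predicates (ne/le/ge)
      // need an XMM temporary to invert the result, and unsigned predicates
      // flip the sign bit of both operands so that a signed compare yields the
      // unsigned result.
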
 7937 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7938   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7939             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7940             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7941             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7942   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7943   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7944   ins_encode %{
 7945     int vlen_enc = vector_length_encoding(this, $src1);
 7946     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7947     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7948       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7949     } else {
 7950       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7951     }
 7952   %}
 7953   ins_pipe( pipe_slow );
 7954 %}
 7955 
 7956 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7957   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7958             n->bottom_type()->isa_vectmask() == nullptr &&
 7959             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7960   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7961   effect(TEMP ktmp);
 7962   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7963   ins_encode %{
 7964     int vlen_enc = Assembler::AVX_512bit;
 7965     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7966     KRegister mask = k0; // The comparison itself is not being masked.
 7967     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7968       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7969       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7970     } else {
 7971       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7972       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7973     }
 7974   %}
 7975   ins_pipe( pipe_slow );
 7976 %}
 7977 
 7978 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7979   predicate(n->bottom_type()->isa_vectmask() &&
 7980             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7981   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7982   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7983   ins_encode %{
 7984     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7985     int vlen_enc = vector_length_encoding(this, $src1);
 7986     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7987     KRegister mask = k0; // The comparison itself is not being masked.
 7988     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7989       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7990     } else {
 7991       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7992     }
 7993   %}
 7994   ins_pipe( pipe_slow );
 7995 %}
 7996 
 7997 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7998   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7999             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8000             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8001             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8002             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8003             (n->in(2)->get_int() == BoolTest::eq ||
 8004              n->in(2)->get_int() == BoolTest::lt ||
 8005              n->in(2)->get_int() == BoolTest::gt)); // cond
 8006   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8007   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 8008   ins_encode %{
 8009     int vlen_enc = vector_length_encoding(this, $src1);
 8010     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8011     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8012     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 8013   %}
 8014   ins_pipe( pipe_slow );
 8015 %}
 8016 
 8017 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8018   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8019             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8020             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8021             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8022             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8023             (n->in(2)->get_int() == BoolTest::ne ||
 8024              n->in(2)->get_int() == BoolTest::le ||
 8025              n->in(2)->get_int() == BoolTest::ge)); // cond
 8026   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8027   effect(TEMP dst, TEMP xtmp);
 8028   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8029   ins_encode %{
 8030     int vlen_enc = vector_length_encoding(this, $src1);
 8031     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8032     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8033     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8034   %}
 8035   ins_pipe( pipe_slow );
 8036 %}
 8037 
 8038 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8039   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8040             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8041             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8042             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8043             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8044   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8045   effect(TEMP dst, TEMP xtmp);
 8046   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8047   ins_encode %{
 8048     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 8049     int vlen_enc = vector_length_encoding(this, $src1);
 8050     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8051     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8052 
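          // Broadcast the sign-bit constant and flip it in both operands so the
          // signed compare below yields the unsigned comparison result.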
 8053     if (vlen_enc == Assembler::AVX_128bit) {
 8054       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8055     } else {
 8056       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8057     }
 8058     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8059     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8060     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8061   %}
 8062   ins_pipe( pipe_slow );
 8063 %}
 8064 
 8065 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8066   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8067              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8068              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8069   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8070   effect(TEMP ktmp);
 8071   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8072   ins_encode %{
 8073     assert(UseAVX > 2, "required");
 8074 
 8075     int vlen_enc = vector_length_encoding(this, $src1);
 8076     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8077     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8078     KRegister mask = k0; // The comparison itself is not being masked.
 8079     bool merge = false;
 8080     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8081 
 8082     switch (src1_elem_bt) {
 8083       case T_INT: {
 8084         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8085         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8086         break;
 8087       }
 8088       case T_LONG: {
 8089         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8090         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8091         break;
 8092       }
 8093       default: assert(false, "%s", type2name(src1_elem_bt));
 8094     }
 8095   %}
 8096   ins_pipe( pipe_slow );
 8097 %}
 8098 
 8099 
 8100 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8101   predicate(n->bottom_type()->isa_vectmask() &&
 8102             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8103   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8104   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8105   ins_encode %{
 8106     assert(UseAVX > 2, "required");
 8107     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8108 
 8109     int vlen_enc = vector_length_encoding(this, $src1);
 8110     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8111     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8112     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8113 
 8114     // Compare directly into the destination mask register.
 8115     switch (src1_elem_bt) {
 8116       case T_BYTE: {
 8117         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8118         break;
 8119       }
 8120       case T_SHORT: {
 8121         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8122         break;
 8123       }
 8124       case T_INT: {
 8125         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8126         break;
 8127       }
 8128       case T_LONG: {
 8129         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8130         break;
 8131       }
 8132       default: assert(false, "%s", type2name(src1_elem_bt));
 8133     }
 8134   %}
 8135   ins_pipe( pipe_slow );
 8136 %}
 8137 
 8138 // Extract
 8139 
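      // Sources up to 128 bits wide are extracted directly with get_elem; wider
      // sources first copy the 128-bit lane holding the element into a temporary
      // (get_lane) and extract from that lane.
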
 8140 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8141   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8142   match(Set dst (ExtractI src idx));
 8143   match(Set dst (ExtractS src idx));
 8144   match(Set dst (ExtractB src idx));
 8145   format %{ "extractI $dst,$src,$idx\t!" %}
 8146   ins_encode %{
 8147     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8148 
 8149     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8150     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8151   %}
 8152   ins_pipe( pipe_slow );
 8153 %}
 8154 
 8155 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8156   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8157             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8158   match(Set dst (ExtractI src idx));
 8159   match(Set dst (ExtractS src idx));
 8160   match(Set dst (ExtractB src idx));
 8161   effect(TEMP vtmp);
 8162   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8163   ins_encode %{
 8164     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8165 
 8166     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8167     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8168     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8169   %}
 8170   ins_pipe( pipe_slow );
 8171 %}
 8172 
 8173 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8174   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8175   match(Set dst (ExtractL src idx));
 8176   format %{ "extractL $dst,$src,$idx\t!" %}
 8177   ins_encode %{
 8178     assert(UseSSE >= 4, "required");
 8179     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8180 
 8181     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8182   %}
 8183   ins_pipe( pipe_slow );
 8184 %}
 8185 
 8186 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8187   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8188             Matcher::vector_length(n->in(1)) == 8);  // src
 8189   match(Set dst (ExtractL src idx));
 8190   effect(TEMP vtmp);
 8191   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8192   ins_encode %{
 8193     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8194 
 8195     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8196     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8197   %}
 8198   ins_pipe( pipe_slow );
 8199 %}
 8200 
 8201 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8202   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8203   match(Set dst (ExtractF src idx));
 8204   effect(TEMP dst, TEMP vtmp);
 8205   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8206   ins_encode %{
 8207     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8208 
 8209     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8210   %}
 8211   ins_pipe( pipe_slow );
 8212 %}
 8213 
 8214 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8215   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8216             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8217   match(Set dst (ExtractF src idx));
 8218   effect(TEMP vtmp);
 8219   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8220   ins_encode %{
 8221     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8222 
 8223     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8224     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8225   %}
 8226   ins_pipe( pipe_slow );
 8227 %}
 8228 
 8229 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8230   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8231   match(Set dst (ExtractD src idx));
 8232   format %{ "extractD $dst,$src,$idx\t!" %}
 8233   ins_encode %{
 8234     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8235 
 8236     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8237   %}
 8238   ins_pipe( pipe_slow );
 8239 %}
 8240 
 8241 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8242   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8243             Matcher::vector_length(n->in(1)) == 8);  // src
 8244   match(Set dst (ExtractD src idx));
 8245   effect(TEMP vtmp);
 8246   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8247   ins_encode %{
 8248     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8249 
 8250     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8251     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8252   %}
 8253   ins_pipe( pipe_slow );
 8254 %}
 8255 
 8256 // --------------------------------- Vector Blend --------------------------------------
 8257 
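      // Blends select lanes from src2 where the mask lane is set and from src1
      // otherwise. SSE uses pblendvb with the mask forced into xmm0, AVX uses
      // vpblendvb/vblendvps, the E-core tuned path synthesizes the blend with
      // vpandn/vpand/vpor, and 512-bit or predicated blends go through a k
      // register and evpblend.
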
 8258 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8259   predicate(UseAVX == 0);
 8260   match(Set dst (VectorBlend (Binary dst src) mask));
 8261   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8262   effect(TEMP tmp);
 8263   ins_encode %{
 8264     assert(UseSSE >= 4, "required");
 8265 
 8266     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8267       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8268     }
 8269     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8270   %}
 8271   ins_pipe( pipe_slow );
 8272 %}
 8273 
 8274 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8275   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8276             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8277             Matcher::vector_length_in_bytes(n) <= 32 &&
 8278             is_integral_type(Matcher::vector_element_basic_type(n)));
 8279   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8280   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8281   ins_encode %{
 8282     int vlen_enc = vector_length_encoding(this);
 8283     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8284   %}
 8285   ins_pipe( pipe_slow );
 8286 %}
 8287 
 8288 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8289   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8290             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8291             Matcher::vector_length_in_bytes(n) <= 32 &&
 8292             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8293   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8294   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8295   ins_encode %{
 8296     int vlen_enc = vector_length_encoding(this);
 8297     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8298   %}
 8299   ins_pipe( pipe_slow );
 8300 %}
 8301 
 8302 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8303   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8304             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8305             Matcher::vector_length_in_bytes(n) <= 32);
 8306   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8307   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8308   effect(TEMP vtmp, TEMP dst);
 8309   ins_encode %{
 8310     int vlen_enc = vector_length_encoding(this);
 8311     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8312     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8313     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8314   %}
 8315   ins_pipe( pipe_slow );
 8316 %}
 8317 
 8318 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8319   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8320             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8321   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8322   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8323   effect(TEMP ktmp);
 8324   ins_encode %{
 8325     int vlen_enc = Assembler::AVX_512bit;
 8326     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8327     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8328     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8329   %}
 8330   ins_pipe( pipe_slow );
 8331 %}
 8332 
 8333 
 8334 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8335   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8336             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8337              VM_Version::supports_avx512bw()));
 8338   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8339   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8340   ins_encode %{
 8341     int vlen_enc = vector_length_encoding(this);
 8342     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8343     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8344   %}
 8345   ins_pipe( pipe_slow );
 8346 %}
 8347 
 8348 // --------------------------------- ABS --------------------------------------
 8349 // a = |a|
 8350 instruct vabsB_reg(vec dst, vec src) %{
 8351   match(Set dst (AbsVB  src));
 8352   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8353   ins_encode %{
 8354     uint vlen = Matcher::vector_length(this);
 8355     if (vlen <= 16) {
 8356       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8357     } else {
 8358       int vlen_enc = vector_length_encoding(this);
 8359       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8360     }
 8361   %}
 8362   ins_pipe( pipe_slow );
 8363 %}
 8364 
 8365 instruct vabsS_reg(vec dst, vec src) %{
 8366   match(Set dst (AbsVS  src));
 8367   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8368   ins_encode %{
 8369     uint vlen = Matcher::vector_length(this);
 8370     if (vlen <= 8) {
 8371       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8372     } else {
 8373       int vlen_enc = vector_length_encoding(this);
 8374       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8375     }
 8376   %}
 8377   ins_pipe( pipe_slow );
 8378 %}
 8379 
 8380 instruct vabsI_reg(vec dst, vec src) %{
 8381   match(Set dst (AbsVI  src));
 8382   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8383   ins_encode %{
 8384     uint vlen = Matcher::vector_length(this);
 8385     if (vlen <= 4) {
 8386       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8387     } else {
 8388       int vlen_enc = vector_length_encoding(this);
 8389       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8390     }
 8391   %}
 8392   ins_pipe( pipe_slow );
 8393 %}
 8394 
 8395 instruct vabsL_reg(vec dst, vec src) %{
 8396   match(Set dst (AbsVL  src));
 8397   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8398   ins_encode %{
 8399     assert(UseAVX > 2, "required");
 8400     int vlen_enc = vector_length_encoding(this);
 8401     if (!VM_Version::supports_avx512vl()) {
 8402       vlen_enc = Assembler::AVX_512bit;
 8403     }
 8404     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8405   %}
 8406   ins_pipe( pipe_slow );
 8407 %}
 8408 
 8409 // --------------------------------- ABSNEG --------------------------------------
 8410 
 8411 instruct vabsnegF(vec dst, vec src) %{
 8412   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8413   match(Set dst (AbsVF src));
 8414   match(Set dst (NegVF src));
 8415   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8416   ins_cost(150);
 8417   ins_encode %{
 8418     int opcode = this->ideal_Opcode();
 8419     int vlen = Matcher::vector_length(this);
 8420     if (vlen == 2) {
 8421       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8422     } else {
 8423       assert(vlen == 8 || vlen == 16, "required");
 8424       int vlen_enc = vector_length_encoding(this);
 8425       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8426     }
 8427   %}
 8428   ins_pipe( pipe_slow );
 8429 %}
 8430 
 8431 instruct vabsneg4F(vec dst) %{
 8432   predicate(Matcher::vector_length(n) == 4);
 8433   match(Set dst (AbsVF dst));
 8434   match(Set dst (NegVF dst));
 8435   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8436   ins_cost(150);
 8437   ins_encode %{
 8438     int opcode = this->ideal_Opcode();
 8439     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8440   %}
 8441   ins_pipe( pipe_slow );
 8442 %}
 8443 
 8444 instruct vabsnegD(vec dst, vec src) %{
 8445   match(Set dst (AbsVD  src));
 8446   match(Set dst (NegVD  src));
 8447   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8448   ins_encode %{
 8449     int opcode = this->ideal_Opcode();
 8450     uint vlen = Matcher::vector_length(this);
 8451     if (vlen == 2) {
 8452       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8453     } else {
 8454       int vlen_enc = vector_length_encoding(this);
 8455       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8456     }
 8457   %}
 8458   ins_pipe( pipe_slow );
 8459 %}
 8460 
 8461 //------------------------------------- VectorTest --------------------------------------------
 8462 
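      // Vector tests set the condition flags from a whole-vector comparison.
      // Vectors shorter than 16 bytes go through vectortest with an XMM
      // temporary; k-register masks shorter than 8 lanes (or 8 lanes without
      // AVX512DQ) are moved to a GPR and masked before comparing, while wider
      // masks use kortest directly.
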
 8463 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8464   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8465   match(Set cr (VectorTest src1 src2));
 8466   effect(TEMP vtmp);
 8467   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8468   ins_encode %{
 8469     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8470     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8471     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8472   %}
 8473   ins_pipe( pipe_slow );
 8474 %}
 8475 
 8476 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8477   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8478   match(Set cr (VectorTest src1 src2));
 8479   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8480   ins_encode %{
 8481     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8482     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8483     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8484   %}
 8485   ins_pipe( pipe_slow );
 8486 %}
 8487 
 8488 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8489   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8490              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8491             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8492   match(Set cr (VectorTest src1 src2));
 8493   effect(TEMP tmp);
 8494   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8495   ins_encode %{
 8496     uint masklen = Matcher::vector_length(this, $src1);
 8497     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8498     __ andl($tmp$$Register, (1 << masklen) - 1);
 8499     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8500   %}
 8501   ins_pipe( pipe_slow );
 8502 %}
 8503 
 8504 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8505   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8506              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8507             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8508   match(Set cr (VectorTest src1 src2));
 8509   effect(TEMP tmp);
 8510   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8511   ins_encode %{
 8512     uint masklen = Matcher::vector_length(this, $src1);
 8513     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8514     __ andl($tmp$$Register, (1 << masklen) - 1);
 8515   %}
 8516   ins_pipe( pipe_slow );
 8517 %}
 8518 
 8519 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8520   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8521             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8522   match(Set cr (VectorTest src1 src2));
 8523   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8524   ins_encode %{
 8525     uint masklen = Matcher::vector_length(this, $src1);
 8526     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8527   %}
 8528   ins_pipe( pipe_slow );
 8529 %}
 8530 
 8531 //------------------------------------- LoadMask --------------------------------------------
 8532 
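      // VectorLoadMask expands a mask stored as one byte per element into either
      // a vector with each lane set to all-ones/all-zeroes or, for vector-mask
      // types, a k register.
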
 8533 instruct loadMask(legVec dst, legVec src) %{
 8534   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8535   match(Set dst (VectorLoadMask src));
 8536   effect(TEMP dst);
 8537   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8538   ins_encode %{
 8539     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8540     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8541     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8542   %}
 8543   ins_pipe( pipe_slow );
 8544 %}
 8545 
 8546 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8547   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8548   match(Set dst (VectorLoadMask src));
 8549   effect(TEMP xtmp);
 8550   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8551   ins_encode %{
 8552     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8553                         true, Assembler::AVX_512bit);
 8554   %}
 8555   ins_pipe( pipe_slow );
 8556 %}
 8557 
 8558 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8559   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8560   match(Set dst (VectorLoadMask src));
 8561   effect(TEMP xtmp);
 8562   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8563   ins_encode %{
 8564     int vlen_enc = vector_length_encoding(in(1));
 8565     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8566                         false, vlen_enc);
 8567   %}
 8568   ins_pipe( pipe_slow );
 8569 %}
 8570 
 8571 //------------------------------------- StoreMask --------------------------------------------
 8572 
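      // VectorStoreMask is the inverse of VectorLoadMask: it narrows a mask whose
      // $size-byte elements are 0 / -1 (or an opmask register in the vectmask case)
      // down to one byte per element holding 0 or 1, which is why the patterns below
      // finish with a byte-wise absolute value (pabsb/vpabsb).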
 8573 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8574   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8575   match(Set dst (VectorStoreMask src size));
 8576   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8577   ins_encode %{
 8578     int vlen = Matcher::vector_length(this);
 8579     if (vlen <= 16 && UseAVX <= 2) {
 8580       assert(UseSSE >= 3, "required");
 8581       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8582     } else {
 8583       assert(UseAVX > 0, "required");
 8584       int src_vlen_enc = vector_length_encoding(this, $src);
 8585       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8586     }
 8587   %}
 8588   ins_pipe( pipe_slow );
 8589 %}
 8590 
 8591 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8592   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8593   match(Set dst (VectorStoreMask src size));
 8594   effect(TEMP_DEF dst, TEMP xtmp);
 8595   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8596   ins_encode %{
 8597     int vlen_enc = Assembler::AVX_128bit;
 8598     int vlen = Matcher::vector_length(this);
 8599     if (vlen <= 8) {
 8600       assert(UseSSE >= 3, "required");
 8601       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8602       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8603       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8604     } else {
 8605       assert(UseAVX > 0, "required");
 8606       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8607       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8608       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8609     }
 8610   %}
 8611   ins_pipe( pipe_slow );
 8612 %}
 8613 
 8614 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8615   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8616   match(Set dst (VectorStoreMask src size));
 8617   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8618   effect(TEMP_DEF dst, TEMP xtmp);
 8619   ins_encode %{
 8620     int vlen_enc = Assembler::AVX_128bit;
 8621     int vlen = Matcher::vector_length(this);
 8622     if (vlen <= 4) {
 8623       assert(UseSSE >= 3, "required");
 8624       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8625       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8626       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8627       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8628     } else {
 8629       assert(UseAVX > 0, "required");
 8630       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8631       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8632       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8633       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8634       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8635     }
 8636   %}
 8637   ins_pipe( pipe_slow );
 8638 %}
 8639 
 8640 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8641   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8642   match(Set dst (VectorStoreMask src size));
 8643   effect(TEMP_DEF dst, TEMP xtmp);
 8644   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8645   ins_encode %{
 8646     assert(UseSSE >= 3, "required");
 8647     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8648     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8649     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8650     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8651     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8652   %}
 8653   ins_pipe( pipe_slow );
 8654 %}
 8655 
 8656 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8657   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8658   match(Set dst (VectorStoreMask src size));
 8659   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8660   effect(TEMP_DEF dst, TEMP vtmp);
 8661   ins_encode %{
 8662     int vlen_enc = Assembler::AVX_128bit;
 8663     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8664     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8665     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8666     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8667     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8668     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8669     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8670   %}
 8671   ins_pipe( pipe_slow );
 8672 %}
 8673 
 8674 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8675   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8676   match(Set dst (VectorStoreMask src size));
 8677   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8678   ins_encode %{
 8679     int src_vlen_enc = vector_length_encoding(this, $src);
 8680     int dst_vlen_enc = vector_length_encoding(this);
 8681     if (!VM_Version::supports_avx512vl()) {
 8682       src_vlen_enc = Assembler::AVX_512bit;
 8683     }
 8684     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8685     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8686   %}
 8687   ins_pipe( pipe_slow );
 8688 %}
 8689 
 8690 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8691   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8692   match(Set dst (VectorStoreMask src size));
 8693   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8694   ins_encode %{
 8695     int src_vlen_enc = vector_length_encoding(this, $src);
 8696     int dst_vlen_enc = vector_length_encoding(this);
 8697     if (!VM_Version::supports_avx512vl()) {
 8698       src_vlen_enc = Assembler::AVX_512bit;
 8699     }
 8700     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8701     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8702   %}
 8703   ins_pipe( pipe_slow );
 8704 %}
 8705 
 8706 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8707   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8708   match(Set dst (VectorStoreMask mask size));
 8709   effect(TEMP_DEF dst);
 8710   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8711   ins_encode %{
 8712     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8713     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8714                  false, Assembler::AVX_512bit, noreg);
 8715     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8716   %}
 8717   ins_pipe( pipe_slow );
 8718 %}
 8719 
 8720 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8721   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8722   match(Set dst (VectorStoreMask mask size));
 8723   effect(TEMP_DEF dst);
 8724   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8725   ins_encode %{
 8726     int dst_vlen_enc = vector_length_encoding(this);
 8727     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8728     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8729   %}
 8730   ins_pipe( pipe_slow );
 8731 %}
 8732 
 8733 instruct vmaskcast_evex(kReg dst) %{
 8734   match(Set dst (VectorMaskCast dst));
 8735   ins_cost(0);
 8736   format %{ "vector_mask_cast $dst" %}
 8737   ins_encode %{
 8738     // empty
 8739   %}
 8740   ins_pipe(empty);
 8741 %}
 8742 
 8743 instruct vmaskcast(vec dst) %{
 8744   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8745   match(Set dst (VectorMaskCast dst));
 8746   ins_cost(0);
 8747   format %{ "vector_mask_cast $dst" %}
 8748   ins_encode %{
 8749     // empty
 8750   %}
 8751   ins_pipe(empty);
 8752 %}
 8753 
 8754 instruct vmaskcast_avx(vec dst, vec src) %{
 8755   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8756   match(Set dst (VectorMaskCast src));
 8757   format %{ "vector_mask_cast $dst, $src" %}
 8758   ins_encode %{
 8759     int vlen = Matcher::vector_length(this);
 8760     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8761     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8762     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8763   %}
 8764   ins_pipe(pipe_slow);
 8765 %}
 8766 
 8767 //-------------------------------- Load Iota Indices ----------------------------------
 8768 
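      // VectorLoadConst materializes the "iota" constant {0, 1, 2, ...}, one entry per
      // vector element, loaded from a per-type constant table by load_iota_indices().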
 8769 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8770   match(Set dst (VectorLoadConst src));
 8771   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8772   ins_encode %{
 8773      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8774      BasicType bt = Matcher::vector_element_basic_type(this);
 8775      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8776   %}
 8777   ins_pipe( pipe_slow );
 8778 %}
 8779 
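      // PopulateIndex produces {start, start + 1, start + 2, ...}: the scalar start
      // value in $src1 is broadcast and then added to the iota constant above.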
 8780 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8781   match(Set dst (PopulateIndex src1 src2));
 8782   effect(TEMP dst, TEMP vtmp);
 8783   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8784   ins_encode %{
 8785      assert($src2$$constant == 1, "required");
 8786      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8787      int vlen_enc = vector_length_encoding(this);
 8788      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8789      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8790      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8791      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794 %}
 8795 
 8796 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8797   match(Set dst (PopulateIndex src1 src2));
 8798   effect(TEMP dst, TEMP vtmp);
 8799   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8800   ins_encode %{
 8801      assert($src2$$constant == 1, "required");
 8802      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8803      int vlen_enc = vector_length_encoding(this);
 8804      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8805      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8806      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8807      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8808   %}
 8809   ins_pipe( pipe_slow );
 8810 %}
 8811 
 8812 //-------------------------------- Rearrange ----------------------------------
 8813 
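      // VectorRearrange computes dst[i] = src[shuffle[i]]; the shuffle vector holds
      // per-element indices, prepared where necessary by the VectorLoadShuffle
      // patterns in this section.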
 8814 // LoadShuffle/Rearrange for Byte
 8815 instruct rearrangeB(vec dst, vec shuffle) %{
 8816   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8817             Matcher::vector_length(n) < 32);
 8818   match(Set dst (VectorRearrange dst shuffle));
 8819   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8820   ins_encode %{
 8821     assert(UseSSE >= 4, "required");
 8822     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8823   %}
 8824   ins_pipe( pipe_slow );
 8825 %}
 8826 
 8827 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8828   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8829             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8830   match(Set dst (VectorRearrange src shuffle));
 8831   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8832   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8833   ins_encode %{
 8834     assert(UseAVX >= 2, "required");
 8835     // Swap src into vtmp1
 8836     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8837     // Shuffle swapped src to get entries from other 128 bit lane
 8838     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8839     // Shuffle original src to get entries from its own 128 bit lane
 8840     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8841     // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
 8842     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8843     // Perform the blend
 8844     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8845   %}
 8846   ins_pipe( pipe_slow );
 8847 %}
 8848 
 8849 
 8850 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8851   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8852             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8853   match(Set dst (VectorRearrange src shuffle));
 8854   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8855   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8856   ins_encode %{
 8857     int vlen_enc = vector_length_encoding(this);
 8858     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8859                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8860                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8861   %}
 8862   ins_pipe( pipe_slow );
 8863 %}
 8864 
 8865 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8866   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8867             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8868   match(Set dst (VectorRearrange src shuffle));
 8869   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8870   ins_encode %{
 8871     int vlen_enc = vector_length_encoding(this);
 8872     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8873   %}
 8874   ins_pipe( pipe_slow );
 8875 %}
 8876 
 8877 // LoadShuffle/Rearrange for Short
 8878 
 8879 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8880   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8881             !VM_Version::supports_avx512bw());
 8882   match(Set dst (VectorLoadShuffle src));
 8883   effect(TEMP dst, TEMP vtmp);
 8884   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8885   ins_encode %{
 8886     // Create a byte shuffle mask from the short shuffle mask, since only a
 8887     // byte shuffle instruction is available on these platforms.
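          // For example, a short shuffle index of 3 becomes the byte index pair
          // {6, 7}, selecting both bytes of source element 3.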
 8888     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8889     if (UseAVX == 0) {
 8890       assert(vlen_in_bytes <= 16, "required");
 8891       // Multiply each shuffle by two to get byte index
 8892       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8893       __ psllw($vtmp$$XMMRegister, 1);
 8894 
 8895       // Duplicate to create 2 copies of byte index
 8896       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8897       __ psllw($dst$$XMMRegister, 8);
 8898       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8899 
 8900       // Add one to get alternate byte index
 8901       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8902       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8903     } else {
 8904       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8905       int vlen_enc = vector_length_encoding(this);
 8906       // Multiply each shuffle by two to get byte index
 8907       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8908 
 8909       // Duplicate to create 2 copies of byte index
 8910       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8911       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8912 
 8913       // Add one to get alternate byte index
 8914       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8915     }
 8916   %}
 8917   ins_pipe( pipe_slow );
 8918 %}
 8919 
 8920 instruct rearrangeS(vec dst, vec shuffle) %{
 8921   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8922             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8923   match(Set dst (VectorRearrange dst shuffle));
 8924   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8925   ins_encode %{
 8926     assert(UseSSE >= 4, "required");
 8927     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8928   %}
 8929   ins_pipe( pipe_slow );
 8930 %}
 8931 
 8932 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8933   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8934             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8935   match(Set dst (VectorRearrange src shuffle));
 8936   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8937   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8938   ins_encode %{
 8939     assert(UseAVX >= 2, "required");
 8940     // Swap src into vtmp1
 8941     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8942     // Shuffle swapped src to get entries from other 128 bit lane
 8943     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8944     // Shuffle original src to get entries from its own 128 bit lane
 8945     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8946     // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
 8947     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8948     // Perform the blend
 8949     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8950   %}
 8951   ins_pipe( pipe_slow );
 8952 %}
 8953 
 8954 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8955   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8956             VM_Version::supports_avx512bw());
 8957   match(Set dst (VectorRearrange src shuffle));
 8958   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8959   ins_encode %{
 8960     int vlen_enc = vector_length_encoding(this);
 8961     if (!VM_Version::supports_avx512vl()) {
 8962       vlen_enc = Assembler::AVX_512bit;
 8963     }
 8964     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8965   %}
 8966   ins_pipe( pipe_slow );
 8967 %}
 8968 
 8969 // LoadShuffle/Rearrange for Integer and Float
 8970 
 8971 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8972   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8973             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8974   match(Set dst (VectorLoadShuffle src));
 8975   effect(TEMP dst, TEMP vtmp);
 8976   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8977   ins_encode %{
 8978     assert(UseSSE >= 4, "required");
 8979 
 8980     // Create a byte shuffle mask from the int shuffle mask, since only a
 8981     // byte shuffle instruction is available on these platforms.
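          // For example, an int shuffle index of 2 becomes the byte indices
          // {8, 9, 10, 11}, selecting all four bytes of source element 2.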
 8982 
 8983     // Duplicate and multiply each shuffle by 4
 8984     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8985     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8986     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8987     __ psllw($vtmp$$XMMRegister, 2);
 8988 
 8989     // Duplicate again to create 4 copies of byte index
 8990     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8991     __ psllw($dst$$XMMRegister, 8);
 8992     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8993 
 8994     // Add 3,2,1,0 to get alternate byte index
 8995     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8996     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8997   %}
 8998   ins_pipe( pipe_slow );
 8999 %}
 9000 
 9001 instruct rearrangeI(vec dst, vec shuffle) %{
 9002   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9003             UseAVX == 0);
 9004   match(Set dst (VectorRearrange dst shuffle));
 9005   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 9006   ins_encode %{
 9007     assert(UseSSE >= 4, "required");
 9008     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 9009   %}
 9010   ins_pipe( pipe_slow );
 9011 %}
 9012 
 9013 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 9014   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9015             UseAVX > 0);
 9016   match(Set dst (VectorRearrange src shuffle));
 9017   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9018   ins_encode %{
 9019     int vlen_enc = vector_length_encoding(this);
 9020     BasicType bt = Matcher::vector_element_basic_type(this);
 9021     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9022   %}
 9023   ins_pipe( pipe_slow );
 9024 %}
 9025 
 9026 // LoadShuffle/Rearrange for Long and Double
 9027 
 9028 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 9029   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9030             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9031   match(Set dst (VectorLoadShuffle src));
 9032   effect(TEMP dst, TEMP vtmp);
 9033   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9034   ins_encode %{
 9035     assert(UseAVX >= 2, "required");
 9036 
 9037     int vlen_enc = vector_length_encoding(this);
 9038     // Create a double word shuffle mask from the long shuffle mask, since
 9039     // only a double word shuffle instruction is available on these platforms.
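          // For example, a long shuffle index of 1 becomes the double word index
          // pair {2, 3}, selecting both 32-bit halves of source element 1.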
 9040 
 9041     // Multiply each shuffle by two to get double word index
 9042     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 9043 
 9044     // Duplicate each double word shuffle
 9045     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9046     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9047 
 9048     // Add one to get alternate double word index
 9049     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9050   %}
 9051   ins_pipe( pipe_slow );
 9052 %}
 9053 
 9054 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9055   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9056             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9057   match(Set dst (VectorRearrange src shuffle));
 9058   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9059   ins_encode %{
 9060     assert(UseAVX >= 2, "required");
 9061 
 9062     int vlen_enc = vector_length_encoding(this);
 9063     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9064   %}
 9065   ins_pipe( pipe_slow );
 9066 %}
 9067 
 9068 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9069   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9070             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9071   match(Set dst (VectorRearrange src shuffle));
 9072   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9073   ins_encode %{
 9074     assert(UseAVX > 2, "required");
 9075 
 9076     int vlen_enc = vector_length_encoding(this);
 9077     if (vlen_enc == Assembler::AVX_128bit) {
 9078       vlen_enc = Assembler::AVX_256bit;
 9079     }
 9080     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9081   %}
 9082   ins_pipe( pipe_slow );
 9083 %}
 9084 
 9085 // --------------------------------- FMA --------------------------------------
 9086 // a * b + c
 9087 
 9088 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9089   match(Set c (FmaVF  c (Binary a b)));
 9090   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9091   ins_cost(150);
 9092   ins_encode %{
 9093     assert(UseFMA, "not enabled");
 9094     int vlen_enc = vector_length_encoding(this);
 9095     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9096   %}
 9097   ins_pipe( pipe_slow );
 9098 %}
 9099 
 9100 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9101   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9102   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9103   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9104   ins_cost(150);
 9105   ins_encode %{
 9106     assert(UseFMA, "not enabled");
 9107     int vlen_enc = vector_length_encoding(this);
 9108     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9109   %}
 9110   ins_pipe( pipe_slow );
 9111 %}
 9112 
 9113 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9114   match(Set c (FmaVD  c (Binary a b)));
 9115   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9116   ins_cost(150);
 9117   ins_encode %{
 9118     assert(UseFMA, "not enabled");
 9119     int vlen_enc = vector_length_encoding(this);
 9120     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9121   %}
 9122   ins_pipe( pipe_slow );
 9123 %}
 9124 
 9125 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9126   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9127   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9128   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9129   ins_cost(150);
 9130   ins_encode %{
 9131     assert(UseFMA, "not enabled");
 9132     int vlen_enc = vector_length_encoding(this);
 9133     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9134   %}
 9135   ins_pipe( pipe_slow );
 9136 %}
 9137 
 9138 // --------------------------------- Vector Multiply Add --------------------------------------
 9139 
 9140 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9141   predicate(UseAVX == 0);
 9142   match(Set dst (MulAddVS2VI dst src1));
 9143   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9144   ins_encode %{
 9145     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9146   %}
 9147   ins_pipe( pipe_slow );
 9148 %}
 9149 
 9150 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9151   predicate(UseAVX > 0);
 9152   match(Set dst (MulAddVS2VI src1 src2));
 9153   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9154   ins_encode %{
 9155     int vlen_enc = vector_length_encoding(this);
 9156     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9157   %}
 9158   ins_pipe( pipe_slow );
 9159 %}
 9160 
 9161 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9162 
 9163 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9164   predicate(VM_Version::supports_avx512_vnni());
 9165   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9166   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9167   ins_encode %{
 9168     assert(UseAVX > 2, "required");
 9169     int vlen_enc = vector_length_encoding(this);
 9170     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9171   %}
 9172   ins_pipe( pipe_slow );
 9173   ins_cost(10);
 9174 %}
 9175 
 9176 // --------------------------------- PopCount --------------------------------------
 9177 
 9178 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9179   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9180   match(Set dst (PopCountVI src));
 9181   match(Set dst (PopCountVL src));
 9182   format %{ "vector_popcount_integral $dst, $src" %}
 9183   ins_encode %{
 9184     int opcode = this->ideal_Opcode();
 9185     int vlen_enc = vector_length_encoding(this, $src);
 9186     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9187     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9188   %}
 9189   ins_pipe( pipe_slow );
 9190 %}
 9191 
 9192 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9193   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9194   match(Set dst (PopCountVI src mask));
 9195   match(Set dst (PopCountVL src mask));
 9196   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9197   ins_encode %{
 9198     int vlen_enc = vector_length_encoding(this, $src);
 9199     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9200     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9201     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9202   %}
 9203   ins_pipe( pipe_slow );
 9204 %}
 9205 
 9206 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9207   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9208   match(Set dst (PopCountVI src));
 9209   match(Set dst (PopCountVL src));
 9210   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9211   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9212   ins_encode %{
 9213     int opcode = this->ideal_Opcode();
 9214     int vlen_enc = vector_length_encoding(this, $src);
 9215     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9216     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9217                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9218   %}
 9219   ins_pipe( pipe_slow );
 9220 %}
 9221 
 9222 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9223 
 9224 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9225   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9226                                               Matcher::vector_length_in_bytes(n->in(1))));
 9227   match(Set dst (CountTrailingZerosV src));
 9228   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9229   ins_cost(400);
 9230   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9231   ins_encode %{
 9232     int vlen_enc = vector_length_encoding(this, $src);
 9233     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9234     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9235                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9236   %}
 9237   ins_pipe( pipe_slow );
 9238 %}
 9239 
 9240 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9241   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9242             VM_Version::supports_avx512cd() &&
 9243             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9244   match(Set dst (CountTrailingZerosV src));
 9245   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9246   ins_cost(400);
 9247   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9248   ins_encode %{
 9249     int vlen_enc = vector_length_encoding(this, $src);
 9250     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9251     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9252                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9253   %}
 9254   ins_pipe( pipe_slow );
 9255 %}
 9256 
 9257 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9258   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9259   match(Set dst (CountTrailingZerosV src));
 9260   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9261   ins_cost(400);
 9262   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9263   ins_encode %{
 9264     int vlen_enc = vector_length_encoding(this, $src);
 9265     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9266     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9267                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9268                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9269   %}
 9270   ins_pipe( pipe_slow );
 9271 %}
 9272 
 9273 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9274   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9275   match(Set dst (CountTrailingZerosV src));
 9276   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9277   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9278   ins_encode %{
 9279     int vlen_enc = vector_length_encoding(this, $src);
 9280     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9281     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9282                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9283   %}
 9284   ins_pipe( pipe_slow );
 9285 %}
 9286 
 9287 
 9288 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9289 
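      // The 8-bit $func immediate is the truth table of an arbitrary three-input
      // boolean function: bit (a << 2 | b << 1 | c) of $func is the result for input
      // bits a (dst), b (src2) and c (src3). For example, 0x96 encodes a three-way
      // XOR and 0xE8 a majority (carry) function.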
 9290 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9291   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9292   effect(TEMP dst);
 9293   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9294   ins_encode %{
 9295     int vector_len = vector_length_encoding(this);
 9296     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9297   %}
 9298   ins_pipe( pipe_slow );
 9299 %}
 9300 
 9301 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9302   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9303   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9304   effect(TEMP dst);
 9305   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9306   ins_encode %{
 9307     int vector_len = vector_length_encoding(this);
 9308     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9309   %}
 9310   ins_pipe( pipe_slow );
 9311 %}
 9312 
 9313 // --------------------------------- Rotation Operations ----------------------------------
 9314 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9315   match(Set dst (RotateLeftV src shift));
 9316   match(Set dst (RotateRightV src shift));
 9317   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9318   ins_encode %{
 9319     int opcode      = this->ideal_Opcode();
 9320     int vector_len  = vector_length_encoding(this);
 9321     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9322     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9323   %}
 9324   ins_pipe( pipe_slow );
 9325 %}
 9326 
 9327 instruct vprorate(vec dst, vec src, vec shift) %{
 9328   match(Set dst (RotateLeftV src shift));
 9329   match(Set dst (RotateRightV src shift));
 9330   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9331   ins_encode %{
 9332     int opcode      = this->ideal_Opcode();
 9333     int vector_len  = vector_length_encoding(this);
 9334     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9335     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9336   %}
 9337   ins_pipe( pipe_slow );
 9338 %}
 9339 
 9340 // ---------------------------------- Masked Operations ------------------------------------
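      // LoadVectorMasked / StoreVectorMasked only touch the lanes whose mask element
      // is set. The AVX forms below consume a vector mask (vmaskmov-style moves),
      // while the EVEX forms consume an opmask (k) register.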
 9341 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9342   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9343   match(Set dst (LoadVectorMasked mem mask));
 9344   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9345   ins_encode %{
 9346     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9347     int vlen_enc = vector_length_encoding(this);
 9348     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9349   %}
 9350   ins_pipe( pipe_slow );
 9351 %}
 9352 
 9353 
 9354 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9355   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9356   match(Set dst (LoadVectorMasked mem mask));
 9357   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9358   ins_encode %{
 9359     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9360     int vector_len = vector_length_encoding(this);
 9361     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9362   %}
 9363   ins_pipe( pipe_slow );
 9364 %}
 9365 
 9366 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9367   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9368   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9369   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9370   ins_encode %{
 9371     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9372     int vlen_enc = vector_length_encoding(src_node);
 9373     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9374     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9375   %}
 9376   ins_pipe( pipe_slow );
 9377 %}
 9378 
 9379 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9380   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9381   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9382   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9383   ins_encode %{
 9384     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9385     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9386     int vlen_enc = vector_length_encoding(src_node);
 9387     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9388   %}
 9389   ins_pipe( pipe_slow );
 9390 %}
 9391 
 9392 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9393   match(Set addr (VerifyVectorAlignment addr mask));
 9394   effect(KILL cr);
 9395   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9396   ins_encode %{
 9397     Label Lskip;
 9398     // check if masked bits of addr are zero
 9399     __ testq($addr$$Register, $mask$$constant);
 9400     __ jccb(Assembler::equal, Lskip);
 9401     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9402     __ bind(Lskip);
 9403   %}
 9404   ins_pipe(pipe_slow);
 9405 %}
 9406 
 9407 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9408   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9409   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9410   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9411   ins_encode %{
 9412     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9413     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9414 
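          // Result convention (assuming $mask is a prefix mask, as produced by
          // VectorMaskGen for the mismatch intrinsic): $dst is -1 when every element
          // selected by the mask compares equal, otherwise the index of the first
          // mismatching element.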
 9415     Label DONE;
 9416     int vlen_enc = vector_length_encoding(this, $src1);
 9417     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9418 
 9419     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9420     __ mov64($dst$$Register, -1L);
 9421     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9422     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9423     __ jccb(Assembler::carrySet, DONE);
 9424     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9425     __ notq($dst$$Register);
 9426     __ tzcntq($dst$$Register, $dst$$Register);
 9427     __ bind(DONE);
 9428   %}
 9429   ins_pipe( pipe_slow );
 9430 %}
 9431 
 9432 
 9433 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9434   match(Set dst (VectorMaskGen len));
 9435   effect(TEMP temp, KILL cr);
 9436   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9437   ins_encode %{
 9438     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9439   %}
 9440   ins_pipe( pipe_slow );
 9441 %}
 9442 
 9443 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9444   match(Set dst (VectorMaskGen len));
 9445   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9446   effect(TEMP temp);
 9447   ins_encode %{
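          // Build an immediate with the low $len bits set (e.g. a constant length of 5
          // yields 0x1f) and move it into the opmask register.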
 9448     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9449     __ kmovql($dst$$KRegister, $temp$$Register);
 9450   %}
 9451   ins_pipe( pipe_slow );
 9452 %}
 9453 
 9454 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9455   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9456   match(Set dst (VectorMaskToLong mask));
 9457   effect(TEMP dst, KILL cr);
 9458   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9459   ins_encode %{
 9460     int opcode = this->ideal_Opcode();
 9461     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9462     int mask_len = Matcher::vector_length(this, $mask);
 9463     int mask_size = mask_len * type2aelembytes(mbt);
 9464     int vlen_enc = vector_length_encoding(this, $mask);
 9465     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9466                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9467   %}
 9468   ins_pipe( pipe_slow );
 9469 %}
 9470 
 9471 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9472   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9473   match(Set dst (VectorMaskToLong mask));
 9474   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9475   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9476   ins_encode %{
 9477     int opcode = this->ideal_Opcode();
 9478     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9479     int mask_len = Matcher::vector_length(this, $mask);
 9480     int vlen_enc = vector_length_encoding(this, $mask);
 9481     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9482                              $dst$$Register, mask_len, mbt, vlen_enc);
 9483   %}
 9484   ins_pipe( pipe_slow );
 9485 %}
 9486 
 9487 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9488   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9489   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9490   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9491   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9492   ins_encode %{
 9493     int opcode = this->ideal_Opcode();
 9494     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9495     int mask_len = Matcher::vector_length(this, $mask);
 9496     int vlen_enc = vector_length_encoding(this, $mask);
 9497     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9498                              $dst$$Register, mask_len, mbt, vlen_enc);
 9499   %}
 9500   ins_pipe( pipe_slow );
 9501 %}
 9502 
 9503 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9504   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9505   match(Set dst (VectorMaskTrueCount mask));
 9506   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9507   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9508   ins_encode %{
 9509     int opcode = this->ideal_Opcode();
 9510     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9511     int mask_len = Matcher::vector_length(this, $mask);
 9512     int mask_size = mask_len * type2aelembytes(mbt);
 9513     int vlen_enc = vector_length_encoding(this, $mask);
 9514     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9515                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9516   %}
 9517   ins_pipe( pipe_slow );
 9518 %}
 9519 
 9520 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9521   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9522   match(Set dst (VectorMaskTrueCount mask));
 9523   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9524   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9525   ins_encode %{
 9526     int opcode = this->ideal_Opcode();
 9527     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9528     int mask_len = Matcher::vector_length(this, $mask);
 9529     int vlen_enc = vector_length_encoding(this, $mask);
 9530     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9531                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9532   %}
 9533   ins_pipe( pipe_slow );
 9534 %}
 9535 
 9536 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9537   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9538   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9539   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9540   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9541   ins_encode %{
 9542     int opcode = this->ideal_Opcode();
 9543     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9544     int mask_len = Matcher::vector_length(this, $mask);
 9545     int vlen_enc = vector_length_encoding(this, $mask);
 9546     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9547                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9548   %}
 9549   ins_pipe( pipe_slow );
 9550 %}
 9551 
 9552 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9553   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9554   match(Set dst (VectorMaskFirstTrue mask));
 9555   match(Set dst (VectorMaskLastTrue mask));
 9556   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9557   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9558   ins_encode %{
 9559     int opcode = this->ideal_Opcode();
 9560     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9561     int mask_len = Matcher::vector_length(this, $mask);
 9562     int mask_size = mask_len * type2aelembytes(mbt);
 9563     int vlen_enc = vector_length_encoding(this, $mask);
 9564     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9565                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9566   %}
 9567   ins_pipe( pipe_slow );
 9568 %}
 9569 
 9570 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9571   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9572   match(Set dst (VectorMaskFirstTrue mask));
 9573   match(Set dst (VectorMaskLastTrue mask));
 9574   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9575   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9576   ins_encode %{
 9577     int opcode = this->ideal_Opcode();
 9578     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9579     int mask_len = Matcher::vector_length(this, $mask);
 9580     int vlen_enc = vector_length_encoding(this, $mask);
 9581     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9582                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9583   %}
 9584   ins_pipe( pipe_slow );
 9585 %}
 9586 
 9587 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9588   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9589   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9590   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9591   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9592   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9593   ins_encode %{
 9594     int opcode = this->ideal_Opcode();
 9595     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9596     int mask_len = Matcher::vector_length(this, $mask);
 9597     int vlen_enc = vector_length_encoding(this, $mask);
 9598     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9599                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9600   %}
 9601   ins_pipe( pipe_slow );
 9602 %}
 9603 
 9604 // --------------------------------- Compress/Expand Operations ---------------------------
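      // CompressV packs the source elements selected by the mask into consecutive
      // low-order lanes of dst; ExpandV is the inverse, scattering the low-order
      // source lanes out to the positions selected by the mask. CompressM compresses
      // the mask itself, yielding a mask whose low popcount(mask) lanes are set.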
 9605 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9606   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9607   match(Set dst (CompressV src mask));
 9608   match(Set dst (ExpandV src mask));
 9609   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9610   format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9611   ins_encode %{
 9612     int opcode = this->ideal_Opcode();
 9613     int vlen_enc = vector_length_encoding(this);
 9614     BasicType bt  = Matcher::vector_element_basic_type(this);
 9615     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9616                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9617   %}
 9618   ins_pipe( pipe_slow );
 9619 %}
 9620 
 9621 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9622   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9623   match(Set dst (CompressV src mask));
 9624   match(Set dst (ExpandV src mask));
 9625   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9626   ins_encode %{
 9627     int opcode = this->ideal_Opcode();
 9628     int vector_len = vector_length_encoding(this);
 9629     BasicType bt  = Matcher::vector_element_basic_type(this);
 9630     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9631   %}
 9632   ins_pipe( pipe_slow );
 9633 %}
 9634 
 9635 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9636   match(Set dst (CompressM mask));
 9637   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9638   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9639   ins_encode %{
 9640     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9641     int mask_len = Matcher::vector_length(this);
 9642     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9643   %}
 9644   ins_pipe( pipe_slow );
 9645 %}
 9646 
 9647 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9648 
 9649 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9650   predicate(!VM_Version::supports_gfni());
 9651   match(Set dst (ReverseV src));
 9652   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9653   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9654   ins_encode %{
 9655     int vec_enc = vector_length_encoding(this);
 9656     BasicType bt = Matcher::vector_element_basic_type(this);
 9657     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9658                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9659   %}
 9660   ins_pipe( pipe_slow );
 9661 %}
 9662 
 9663 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9664   predicate(VM_Version::supports_gfni());
 9665   match(Set dst (ReverseV src));
 9666   effect(TEMP dst, TEMP xtmp);
 9667   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9668   ins_encode %{
 9669     int vec_enc = vector_length_encoding(this);
 9670     BasicType bt  = Matcher::vector_element_basic_type(this);
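          // 0x8040201008040201 is the GF(2) affine matrix that reverses the bit order
          // within each byte when used with vgf2p8affineqb.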
 9671     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9672     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9673                                $xtmp$$XMMRegister);
 9674   %}
 9675   ins_pipe( pipe_slow );
 9676 %}
 9677 
 9678 instruct vreverse_byte_reg(vec dst, vec src) %{
 9679   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9680   match(Set dst (ReverseBytesV src));
 9681   effect(TEMP dst);
 9682   format %{ "vector_reverse_byte $dst, $src" %}
 9683   ins_encode %{
 9684     int vec_enc = vector_length_encoding(this);
 9685     BasicType bt = Matcher::vector_element_basic_type(this);
 9686     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9687   %}
 9688   ins_pipe( pipe_slow );
 9689 %}
 9690 
 9691 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9692   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9693   match(Set dst (ReverseBytesV src));
 9694   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9695   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9696   ins_encode %{
 9697     int vec_enc = vector_length_encoding(this);
 9698     BasicType bt = Matcher::vector_element_basic_type(this);
 9699     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9700                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9701   %}
 9702   ins_pipe( pipe_slow );
 9703 %}
 9704 
 9705 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9706 
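      // The EVEX forms build on VPLZCNTD/Q (AVX512CD): int and long elements map onto
      // the instruction directly, while the short and byte forms are synthesized with
      // additional temporaries.  Sub-64-byte vectors without AVX512VL fall back to an
      // AVX emulation sequence.
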
 9707 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9708   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9709                                               Matcher::vector_length_in_bytes(n->in(1))));
 9710   match(Set dst (CountLeadingZerosV src));
 9711   format %{ "vector_count_leading_zeros $dst, $src" %}
 9712   ins_encode %{
 9713      int vlen_enc = vector_length_encoding(this, $src);
 9714      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9715      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9716                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9717   %}
 9718   ins_pipe( pipe_slow );
 9719 %}
 9720 
 9721 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9722   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9723                                               Matcher::vector_length_in_bytes(n->in(1))));
 9724   match(Set dst (CountLeadingZerosV src mask));
 9725   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9726   ins_encode %{
 9727     int vlen_enc = vector_length_encoding(this, $src);
 9728     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9729     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9730     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9731                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9732   %}
 9733   ins_pipe( pipe_slow );
 9734 %}
 9735 
 9736 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9737   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9738             VM_Version::supports_avx512cd() &&
 9739             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9740   match(Set dst (CountLeadingZerosV src));
 9741   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9742   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9743   ins_encode %{
 9744     int vlen_enc = vector_length_encoding(this, $src);
 9745     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9746     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9747                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9748   %}
 9749   ins_pipe( pipe_slow );
 9750 %}
 9751 
 9752 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9753   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9754   match(Set dst (CountLeadingZerosV src));
 9755   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9756   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9757   ins_encode %{
 9758     int vlen_enc = vector_length_encoding(this, $src);
 9759     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9760     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9761                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9762                                        $rtmp$$Register, true, vlen_enc);
 9763   %}
 9764   ins_pipe( pipe_slow );
 9765 %}
 9766 
 9767 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9768   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9769             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9770   match(Set dst (CountLeadingZerosV src));
 9771   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9772   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9773   ins_encode %{
 9774     int vlen_enc = vector_length_encoding(this, $src);
 9775     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9776     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9777                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9778   %}
 9779   ins_pipe( pipe_slow );
 9780 %}
 9781 
 9782 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9783   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9784             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9785   match(Set dst (CountLeadingZerosV src));
 9786   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9787   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9788   ins_encode %{
 9789     int vlen_enc = vector_length_encoding(this, $src);
 9790     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9791     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9792                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9793   %}
 9794   ins_pipe( pipe_slow );
 9795 %}
 9796 
 9797 // ---------------------------------- Vector Masked Operations ------------------------------------
 9798 
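      // Each masked instruct below matches an ideal vector op whose extra input is a
      // kReg mask and funnels into MacroAssembler::evmasked_op, which selects the
      // EVEX encoding for the element type.  The trailing boolean requests
      // merge-masking, i.e. lanes whose mask bit is clear keep the previous contents
      // of $dst (the rearrange form below passes false, i.e. zero-masking).
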
 9799 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9800   match(Set dst (AddVB (Binary dst src2) mask));
 9801   match(Set dst (AddVS (Binary dst src2) mask));
 9802   match(Set dst (AddVI (Binary dst src2) mask));
 9803   match(Set dst (AddVL (Binary dst src2) mask));
 9804   match(Set dst (AddVF (Binary dst src2) mask));
 9805   match(Set dst (AddVD (Binary dst src2) mask));
 9806   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9807   ins_encode %{
 9808     int vlen_enc = vector_length_encoding(this);
 9809     BasicType bt = Matcher::vector_element_basic_type(this);
 9810     int opc = this->ideal_Opcode();
 9811     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9812                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9813   %}
 9814   ins_pipe( pipe_slow );
 9815 %}
 9816 
 9817 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9818   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9819   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9820   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9821   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9822   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9823   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9824   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9825   ins_encode %{
 9826     int vlen_enc = vector_length_encoding(this);
 9827     BasicType bt = Matcher::vector_element_basic_type(this);
 9828     int opc = this->ideal_Opcode();
 9829     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9830                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9831   %}
 9832   ins_pipe( pipe_slow );
 9833 %}
 9834 
 9835 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9836   match(Set dst (XorV (Binary dst src2) mask));
 9837   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9838   ins_encode %{
 9839     int vlen_enc = vector_length_encoding(this);
 9840     BasicType bt = Matcher::vector_element_basic_type(this);
 9841     int opc = this->ideal_Opcode();
 9842     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9843                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9844   %}
 9845   ins_pipe( pipe_slow );
 9846 %}
 9847 
 9848 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9849   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9850   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9851   ins_encode %{
 9852     int vlen_enc = vector_length_encoding(this);
 9853     BasicType bt = Matcher::vector_element_basic_type(this);
 9854     int opc = this->ideal_Opcode();
 9855     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9856                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9857   %}
 9858   ins_pipe( pipe_slow );
 9859 %}
 9860 
 9861 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9862   match(Set dst (OrV (Binary dst src2) mask));
 9863   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9864   ins_encode %{
 9865     int vlen_enc = vector_length_encoding(this);
 9866     BasicType bt = Matcher::vector_element_basic_type(this);
 9867     int opc = this->ideal_Opcode();
 9868     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9869                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9870   %}
 9871   ins_pipe( pipe_slow );
 9872 %}
 9873 
 9874 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9875   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9876   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9877   ins_encode %{
 9878     int vlen_enc = vector_length_encoding(this);
 9879     BasicType bt = Matcher::vector_element_basic_type(this);
 9880     int opc = this->ideal_Opcode();
 9881     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9882                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9883   %}
 9884   ins_pipe( pipe_slow );
 9885 %}
 9886 
 9887 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9888   match(Set dst (AndV (Binary dst src2) mask));
 9889   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9890   ins_encode %{
 9891     int vlen_enc = vector_length_encoding(this);
 9892     BasicType bt = Matcher::vector_element_basic_type(this);
 9893     int opc = this->ideal_Opcode();
 9894     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9895                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9896   %}
 9897   ins_pipe( pipe_slow );
 9898 %}
 9899 
 9900 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9901   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9902   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9903   ins_encode %{
 9904     int vlen_enc = vector_length_encoding(this);
 9905     BasicType bt = Matcher::vector_element_basic_type(this);
 9906     int opc = this->ideal_Opcode();
 9907     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9908                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9909   %}
 9910   ins_pipe( pipe_slow );
 9911 %}
 9912 
 9913 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9914   match(Set dst (SubVB (Binary dst src2) mask));
 9915   match(Set dst (SubVS (Binary dst src2) mask));
 9916   match(Set dst (SubVI (Binary dst src2) mask));
 9917   match(Set dst (SubVL (Binary dst src2) mask));
 9918   match(Set dst (SubVF (Binary dst src2) mask));
 9919   match(Set dst (SubVD (Binary dst src2) mask));
 9920   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9921   ins_encode %{
 9922     int vlen_enc = vector_length_encoding(this);
 9923     BasicType bt = Matcher::vector_element_basic_type(this);
 9924     int opc = this->ideal_Opcode();
 9925     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9926                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9927   %}
 9928   ins_pipe( pipe_slow );
 9929 %}
 9930 
 9931 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9932   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9933   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9934   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9935   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9936   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9937   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9938   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9939   ins_encode %{
 9940     int vlen_enc = vector_length_encoding(this);
 9941     BasicType bt = Matcher::vector_element_basic_type(this);
 9942     int opc = this->ideal_Opcode();
 9943     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9944                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9945   %}
 9946   ins_pipe( pipe_slow );
 9947 %}
 9948 
 9949 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9950   match(Set dst (MulVS (Binary dst src2) mask));
 9951   match(Set dst (MulVI (Binary dst src2) mask));
 9952   match(Set dst (MulVL (Binary dst src2) mask));
 9953   match(Set dst (MulVF (Binary dst src2) mask));
 9954   match(Set dst (MulVD (Binary dst src2) mask));
 9955   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9956   ins_encode %{
 9957     int vlen_enc = vector_length_encoding(this);
 9958     BasicType bt = Matcher::vector_element_basic_type(this);
 9959     int opc = this->ideal_Opcode();
 9960     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9961                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9962   %}
 9963   ins_pipe( pipe_slow );
 9964 %}
 9965 
 9966 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9967   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9968   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9969   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9970   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9971   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9972   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9973   ins_encode %{
 9974     int vlen_enc = vector_length_encoding(this);
 9975     BasicType bt = Matcher::vector_element_basic_type(this);
 9976     int opc = this->ideal_Opcode();
 9977     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9978                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9979   %}
 9980   ins_pipe( pipe_slow );
 9981 %}
 9982 
 9983 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9984   match(Set dst (SqrtVF dst mask));
 9985   match(Set dst (SqrtVD dst mask));
 9986   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9987   ins_encode %{
 9988     int vlen_enc = vector_length_encoding(this);
 9989     BasicType bt = Matcher::vector_element_basic_type(this);
 9990     int opc = this->ideal_Opcode();
 9991     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9992                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9993   %}
 9994   ins_pipe( pipe_slow );
 9995 %}
 9996 
 9997 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9998   match(Set dst (DivVF (Binary dst src2) mask));
 9999   match(Set dst (DivVD (Binary dst src2) mask));
10000   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10001   ins_encode %{
10002     int vlen_enc = vector_length_encoding(this);
10003     BasicType bt = Matcher::vector_element_basic_type(this);
10004     int opc = this->ideal_Opcode();
10005     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10006                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10007   %}
10008   ins_pipe( pipe_slow );
10009 %}
10010 
10011 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
10012   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
10013   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
10014   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10015   ins_encode %{
10016     int vlen_enc = vector_length_encoding(this);
10017     BasicType bt = Matcher::vector_element_basic_type(this);
10018     int opc = this->ideal_Opcode();
10019     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10020                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10021   %}
10022   ins_pipe( pipe_slow );
10023 %}
10024 
10025 
10026 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
10027   match(Set dst (RotateLeftV (Binary dst shift) mask));
10028   match(Set dst (RotateRightV (Binary dst shift) mask));
10029   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
10030   ins_encode %{
10031     int vlen_enc = vector_length_encoding(this);
10032     BasicType bt = Matcher::vector_element_basic_type(this);
10033     int opc = this->ideal_Opcode();
10034     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10035                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10036   %}
10037   ins_pipe( pipe_slow );
10038 %}
10039 
10040 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
10041   match(Set dst (RotateLeftV (Binary dst src2) mask));
10042   match(Set dst (RotateRightV (Binary dst src2) mask));
10043   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
10044   ins_encode %{
10045     int vlen_enc = vector_length_encoding(this);
10046     BasicType bt = Matcher::vector_element_basic_type(this);
10047     int opc = this->ideal_Opcode();
10048     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10049                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10050   %}
10051   ins_pipe( pipe_slow );
10052 %}
10053 
10054 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10055   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10056   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10057   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10058   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10059   ins_encode %{
10060     int vlen_enc = vector_length_encoding(this);
10061     BasicType bt = Matcher::vector_element_basic_type(this);
10062     int opc = this->ideal_Opcode();
10063     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10064                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10065   %}
10066   ins_pipe( pipe_slow );
10067 %}
10068 
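      // Masked shifts come in two flavours: is_var_shift() selects per-lane shift
      // counts, otherwise the count is a broadcast scalar.  The extra boolean passed
      // to evmasked_op forwards that choice to pick the variable-shift encodings.
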
10069 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10070   predicate(!n->as_ShiftV()->is_var_shift());
10071   match(Set dst (LShiftVS (Binary dst src2) mask));
10072   match(Set dst (LShiftVI (Binary dst src2) mask));
10073   match(Set dst (LShiftVL (Binary dst src2) mask));
10074   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10075   ins_encode %{
10076     int vlen_enc = vector_length_encoding(this);
10077     BasicType bt = Matcher::vector_element_basic_type(this);
10078     int opc = this->ideal_Opcode();
10079     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10080                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10081   %}
10082   ins_pipe( pipe_slow );
10083 %}
10084 
10085 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10086   predicate(n->as_ShiftV()->is_var_shift());
10087   match(Set dst (LShiftVS (Binary dst src2) mask));
10088   match(Set dst (LShiftVI (Binary dst src2) mask));
10089   match(Set dst (LShiftVL (Binary dst src2) mask));
10090   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10091   ins_encode %{
10092     int vlen_enc = vector_length_encoding(this);
10093     BasicType bt = Matcher::vector_element_basic_type(this);
10094     int opc = this->ideal_Opcode();
10095     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10096                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10097   %}
10098   ins_pipe( pipe_slow );
10099 %}
10100 
10101 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10102   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10103   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10104   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10105   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10106   ins_encode %{
10107     int vlen_enc = vector_length_encoding(this);
10108     BasicType bt = Matcher::vector_element_basic_type(this);
10109     int opc = this->ideal_Opcode();
10110     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10111                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10112   %}
10113   ins_pipe( pipe_slow );
10114 %}
10115 
10116 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10117   predicate(!n->as_ShiftV()->is_var_shift());
10118   match(Set dst (RShiftVS (Binary dst src2) mask));
10119   match(Set dst (RShiftVI (Binary dst src2) mask));
10120   match(Set dst (RShiftVL (Binary dst src2) mask));
10121   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10122   ins_encode %{
10123     int vlen_enc = vector_length_encoding(this);
10124     BasicType bt = Matcher::vector_element_basic_type(this);
10125     int opc = this->ideal_Opcode();
10126     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10127                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10128   %}
10129   ins_pipe( pipe_slow );
10130 %}
10131 
10132 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10133   predicate(n->as_ShiftV()->is_var_shift());
10134   match(Set dst (RShiftVS (Binary dst src2) mask));
10135   match(Set dst (RShiftVI (Binary dst src2) mask));
10136   match(Set dst (RShiftVL (Binary dst src2) mask));
10137   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10138   ins_encode %{
10139     int vlen_enc = vector_length_encoding(this);
10140     BasicType bt = Matcher::vector_element_basic_type(this);
10141     int opc = this->ideal_Opcode();
10142     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10143                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10144   %}
10145   ins_pipe( pipe_slow );
10146 %}
10147 
10148 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10149   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10150   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10151   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10152   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10153   ins_encode %{
10154     int vlen_enc = vector_length_encoding(this);
10155     BasicType bt = Matcher::vector_element_basic_type(this);
10156     int opc = this->ideal_Opcode();
10157     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10158                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10159   %}
10160   ins_pipe( pipe_slow );
10161 %}
10162 
10163 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10164   predicate(!n->as_ShiftV()->is_var_shift());
10165   match(Set dst (URShiftVS (Binary dst src2) mask));
10166   match(Set dst (URShiftVI (Binary dst src2) mask));
10167   match(Set dst (URShiftVL (Binary dst src2) mask));
10168   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10169   ins_encode %{
10170     int vlen_enc = vector_length_encoding(this);
10171     BasicType bt = Matcher::vector_element_basic_type(this);
10172     int opc = this->ideal_Opcode();
10173     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10174                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10175   %}
10176   ins_pipe( pipe_slow );
10177 %}
10178 
10179 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10180   predicate(n->as_ShiftV()->is_var_shift());
10181   match(Set dst (URShiftVS (Binary dst src2) mask));
10182   match(Set dst (URShiftVI (Binary dst src2) mask));
10183   match(Set dst (URShiftVL (Binary dst src2) mask));
10184   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10185   ins_encode %{
10186     int vlen_enc = vector_length_encoding(this);
10187     BasicType bt = Matcher::vector_element_basic_type(this);
10188     int opc = this->ideal_Opcode();
10189     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10190                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10191   %}
10192   ins_pipe( pipe_slow );
10193 %}
10194 
10195 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10196   match(Set dst (MaxV (Binary dst src2) mask));
10197   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10198   ins_encode %{
10199     int vlen_enc = vector_length_encoding(this);
10200     BasicType bt = Matcher::vector_element_basic_type(this);
10201     int opc = this->ideal_Opcode();
10202     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10203                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10204   %}
10205   ins_pipe( pipe_slow );
10206 %}
10207 
10208 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10209   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10210   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10211   ins_encode %{
10212     int vlen_enc = vector_length_encoding(this);
10213     BasicType bt = Matcher::vector_element_basic_type(this);
10214     int opc = this->ideal_Opcode();
10215     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10216                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10217   %}
10218   ins_pipe( pipe_slow );
10219 %}
10220 
10221 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10222   match(Set dst (MinV (Binary dst src2) mask));
10223   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10224   ins_encode %{
10225     int vlen_enc = vector_length_encoding(this);
10226     BasicType bt = Matcher::vector_element_basic_type(this);
10227     int opc = this->ideal_Opcode();
10228     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10229                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10230   %}
10231   ins_pipe( pipe_slow );
10232 %}
10233 
10234 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10235   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10236   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10237   ins_encode %{
10238     int vlen_enc = vector_length_encoding(this);
10239     BasicType bt = Matcher::vector_element_basic_type(this);
10240     int opc = this->ideal_Opcode();
10241     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10242                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10243   %}
10244   ins_pipe( pipe_slow );
10245 %}
10246 
10247 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10248   match(Set dst (VectorRearrange (Binary dst src2) mask));
10249   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10250   ins_encode %{
10251     int vlen_enc = vector_length_encoding(this);
10252     BasicType bt = Matcher::vector_element_basic_type(this);
10253     int opc = this->ideal_Opcode();
10254     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10255                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10256   %}
10257   ins_pipe( pipe_slow );
10258 %}
10259 
10260 instruct vabs_masked(vec dst, kReg mask) %{
10261   match(Set dst (AbsVB dst mask));
10262   match(Set dst (AbsVS dst mask));
10263   match(Set dst (AbsVI dst mask));
10264   match(Set dst (AbsVL dst mask));
10265   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10266   ins_encode %{
10267     int vlen_enc = vector_length_encoding(this);
10268     BasicType bt = Matcher::vector_element_basic_type(this);
10269     int opc = this->ideal_Opcode();
10270     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10271                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10272   %}
10273   ins_pipe( pipe_slow );
10274 %}
10275 
10276 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10277   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10278   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10279   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10280   ins_encode %{
10281     assert(UseFMA, "Needs FMA instructions support.");
10282     int vlen_enc = vector_length_encoding(this);
10283     BasicType bt = Matcher::vector_element_basic_type(this);
10284     int opc = this->ideal_Opcode();
10285     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10286                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10287   %}
10288   ins_pipe( pipe_slow );
10289 %}
10290 
10291 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10292   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10293   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10294   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10295   ins_encode %{
10296     assert(UseFMA, "Needs FMA instructions support.");
10297     int vlen_enc = vector_length_encoding(this);
10298     BasicType bt = Matcher::vector_element_basic_type(this);
10299     int opc = this->ideal_Opcode();
10300     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10301                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10302   %}
10303   ins_pipe( pipe_slow );
10304 %}
10305 
10306 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10307   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10308   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10309   ins_encode %{
10310     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10311     int vlen_enc = vector_length_encoding(this, $src1);
10312     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10313 
10314     // Dispatch on the element type of src1 (integral compares pass a signedness flag).
10315     switch (src1_elem_bt) {
10316       case T_BYTE: {
10317         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10318         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10319         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10320         break;
10321       }
10322       case T_SHORT: {
10323         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10324         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10325         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10326         break;
10327       }
10328       case T_INT: {
10329         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10330         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10331         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10332         break;
10333       }
10334       case T_LONG: {
10335         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10336         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10337         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10338         break;
10339       }
10340       case T_FLOAT: {
10341         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10342         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10343         break;
10344       }
10345       case T_DOUBLE: {
10346         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10347         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10348         break;
10349       }
10350       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10351     }
10352   %}
10353   ins_pipe( pipe_slow );
10354 %}
10355 
10356 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10357   predicate(Matcher::vector_length(n) <= 32);
10358   match(Set dst (MaskAll src));
10359   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10360   ins_encode %{
10361     int mask_len = Matcher::vector_length(this);
10362     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10363   %}
10364   ins_pipe( pipe_slow );
10365 %}
10366 
10367 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10368   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10369   match(Set dst (XorVMask src (MaskAll cnt)));
10370   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10371   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10372   ins_encode %{
10373     uint masklen = Matcher::vector_length(this);
10374     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10375   %}
10376   ins_pipe( pipe_slow );
10377 %}
10378 
10379 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10380   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10381             (Matcher::vector_length(n) == 16) ||
10382             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10383   match(Set dst (XorVMask src (MaskAll cnt)));
10384   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10385   ins_encode %{
10386     uint masklen = Matcher::vector_length(this);
10387     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10388   %}
10389   ins_pipe( pipe_slow );
10390 %}
10391 
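      // VectorLongToMask converts a long bit pattern into a mask.  Without a native
      // vector-mask type the bits are expanded into a boolean vector via
      // vector_long_to_maskvec; with EVEX mask registers a single kmov suffices.
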
10392 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10393   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10394   match(Set dst (VectorLongToMask src));
10395   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10396   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10397   ins_encode %{
10398     int mask_len = Matcher::vector_length(this);
10399     int vec_enc  = vector_length_encoding(mask_len);
10400     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10401                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10402   %}
10403   ins_pipe( pipe_slow );
10404 %}
10405 
10406 
10407 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10408   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10409   match(Set dst (VectorLongToMask src));
10410   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10411   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10412   ins_encode %{
10413     int mask_len = Matcher::vector_length(this);
10414     assert(mask_len <= 32, "invalid mask length");
10415     int vec_enc  = vector_length_encoding(mask_len);
10416     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10417                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10418   %}
10419   ins_pipe( pipe_slow );
10420 %}
10421 
10422 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10423   predicate(n->bottom_type()->isa_vectmask());
10424   match(Set dst (VectorLongToMask src));
10425   format %{ "long_to_mask_evex $dst, $src\t!" %}
10426   ins_encode %{
10427     __ kmov($dst$$KRegister, $src$$Register);
10428   %}
10429   ins_pipe( pipe_slow );
10430 %}
10431 
10432 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10433   match(Set dst (AndVMask src1 src2));
10434   match(Set dst (OrVMask src1 src2));
10435   match(Set dst (XorVMask src1 src2));
10436   effect(TEMP kscratch);
10437   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10438   ins_encode %{
10439     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10440     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10441     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10442     uint masklen = Matcher::vector_length(this);
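          // Byte-granular mask instructions (kandb, korb, kxorb) require AVX512DQ, so
          // widen short masks to 16 bits and use the word forms on older CPUs.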
10443     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10444     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10445   %}
10446   ins_pipe( pipe_slow );
10447 %}
10448 
10449 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10450   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10451   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10452   ins_encode %{
10453     int vlen_enc = vector_length_encoding(this);
10454     BasicType bt = Matcher::vector_element_basic_type(this);
10455     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10456                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10457   %}
10458   ins_pipe( pipe_slow );
10459 %}
10460 
10461 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10462   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10463   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10464   ins_encode %{
10465     int vlen_enc = vector_length_encoding(this);
10466     BasicType bt = Matcher::vector_element_basic_type(this);
10467     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10468                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10469   %}
10470   ins_pipe( pipe_slow );
10471 %}
10472 
10473 instruct castMM(kReg dst)
10474 %{
10475   match(Set dst (CastVV dst));
10476 
10477   size(0);
10478   format %{ "# castVV of $dst" %}
10479   ins_encode(/* empty encoding */);
10480   ins_cost(0);
10481   ins_pipe(empty);
10482 %}
10483 
10484 instruct castVV(vec dst)
10485 %{
10486   match(Set dst (CastVV dst));
10487 
10488   size(0);
10489   format %{ "# castVV of $dst" %}
10490   ins_encode(/* empty encoding */);
10491   ins_cost(0);
10492   ins_pipe(empty);
10493 %}
10494 
10495 instruct castVVLeg(legVec dst)
10496 %{
10497   match(Set dst (CastVV dst));
10498 
10499   size(0);
10500   format %{ "# castVV of $dst" %}
10501   ins_encode(/* empty encoding */);
10502   ins_cost(0);
10503   ins_pipe(empty);
10504 %}
10505 
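      // IsInfinite is answered with VFPCLASS: imm8 0x18 selects the +Infinity (0x08)
      // and -Infinity (0x10) classes, and the resulting mask bit is moved into a GPR.
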
10506 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10507 %{
10508   match(Set dst (IsInfiniteF src));
10509   effect(TEMP ktmp, KILL cr);
10510   format %{ "float_class_check $dst, $src" %}
10511   ins_encode %{
10512     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10513     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10514   %}
10515   ins_pipe(pipe_slow);
10516 %}
10517 
10518 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10519 %{
10520   match(Set dst (IsInfiniteD src));
10521   effect(TEMP ktmp, KILL cr);
10522   format %{ "double_class_check $dst, $src" %}
10523   ins_encode %{
10524     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10525     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10526   %}
10527   ins_pipe(pipe_slow);
10528 %}
10529 
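      // Saturating add/sub: byte and short elements have native saturating SSE/AVX
      // instructions and lower directly (signed or unsigned).  Int and long elements
      // are emulated; the EVEX variants lean on mask registers while the AVX variants
      // use extra vector temporaries.
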
10530 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10531 %{
10532   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10533             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10534   match(Set dst (SaturatingAddV src1 src2));
10535   match(Set dst (SaturatingSubV src1 src2));
10536   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10537   ins_encode %{
10538     int vlen_enc = vector_length_encoding(this);
10539     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10540     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10541                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10542   %}
10543   ins_pipe(pipe_slow);
10544 %}
10545 
10546 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10547 %{
10548   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10549             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10550   match(Set dst (SaturatingAddV src1 src2));
10551   match(Set dst (SaturatingSubV src1 src2));
10552   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10553   ins_encode %{
10554     int vlen_enc = vector_length_encoding(this);
10555     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10556     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10557                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10558   %}
10559   ins_pipe(pipe_slow);
10560 %}
10561 
10562 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10563 %{
10564   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10565             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10566             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10567   match(Set dst (SaturatingAddV src1 src2));
10568   match(Set dst (SaturatingSubV src1 src2));
10569   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10570   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10571   ins_encode %{
10572     int vlen_enc = vector_length_encoding(this);
10573     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10574     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10575                                         $src1$$XMMRegister, $src2$$XMMRegister,
10576                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10577                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10578   %}
10579   ins_pipe(pipe_slow);
10580 %}
10581 
10582 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10583 %{
10584   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10585             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10586             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10587   match(Set dst (SaturatingAddV src1 src2));
10588   match(Set dst (SaturatingSubV src1 src2));
10589   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10590   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10591   ins_encode %{
10592     int vlen_enc = vector_length_encoding(this);
10593     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10594     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10595                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10596                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10597   %}
10598   ins_pipe(pipe_slow);
10599 %}
10600 
10601 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10602 %{
10603   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10604             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10605             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10606   match(Set dst (SaturatingAddV src1 src2));
10607   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10608   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10609   ins_encode %{
10610     int vlen_enc = vector_length_encoding(this);
10611     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10612     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10613                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10614   %}
10615   ins_pipe(pipe_slow);
10616 %}
10617 
10618 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10619 %{
10620   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10621             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10622             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10623   match(Set dst (SaturatingAddV src1 src2));
10624   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10625   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10626   ins_encode %{
10627     int vlen_enc = vector_length_encoding(this);
10628     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10629     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10630                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10631   %}
10632   ins_pipe(pipe_slow);
10633 %}
10634 
10635 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10636 %{
10637   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10638             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10639             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10640   match(Set dst (SaturatingSubV src1 src2));
10641   effect(TEMP ktmp);
10642   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10643   ins_encode %{
10644     int vlen_enc = vector_length_encoding(this);
10645     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10646     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10647                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10648   %}
10649   ins_pipe(pipe_slow);
10650 %}
10651 
10652 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10653 %{
10654   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10655             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10656             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10657   match(Set dst (SaturatingSubV src1 src2));
10658   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10659   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10660   ins_encode %{
10661     int vlen_enc = vector_length_encoding(this);
10662     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10663     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10664                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10665   %}
10666   ins_pipe(pipe_slow);
10667 %}
10668 
10669 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10670 %{
10671   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10672             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10673   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10674   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10675   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10676   ins_encode %{
10677     int vlen_enc = vector_length_encoding(this);
10678     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10679     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10680                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10681   %}
10682   ins_pipe(pipe_slow);
10683 %}
10684 
10685 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10686 %{
10687   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10688             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10689   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10690   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10691   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10692   ins_encode %{
10693     int vlen_enc = vector_length_encoding(this);
10694     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10695     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10696                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10697   %}
10698   ins_pipe(pipe_slow);
10699 %}
10700 
10701 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10702   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10703             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10704   match(Set dst (SaturatingAddV (Binary dst src) mask));
10705   match(Set dst (SaturatingSubV (Binary dst src) mask));
10706   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10707   ins_encode %{
10708     int vlen_enc = vector_length_encoding(this);
10709     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10710     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10711                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10712   %}
10713   ins_pipe( pipe_slow );
10714 %}
10715 
10716 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10717   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10718             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10719   match(Set dst (SaturatingAddV (Binary dst src) mask));
10720   match(Set dst (SaturatingSubV (Binary dst src) mask));
10721   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10722   ins_encode %{
10723     int vlen_enc = vector_length_encoding(this);
10724     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10725     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10726                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10727   %}
10728   ins_pipe( pipe_slow );
10729 %}
10730 
10731 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10732   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10733             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10734   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10735   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10736   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10737   ins_encode %{
10738     int vlen_enc = vector_length_encoding(this);
10739     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10740     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10741                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10742   %}
10743   ins_pipe( pipe_slow );
10744 %}
10745 
10746 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10747   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10748             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10749   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10750   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10751   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10752   ins_encode %{
10753     int vlen_enc = vector_length_encoding(this);
10754     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10755     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10756                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10757   %}
10758   ins_pipe( pipe_slow );
10759 %}
10760 
10761 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10762 %{
10763   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10764   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10765   ins_encode %{
10766     int vlen_enc = vector_length_encoding(this);
10767     BasicType bt = Matcher::vector_element_basic_type(this);
10768     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10769   %}
10770   ins_pipe(pipe_slow);
10771 %}
10772 
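      // ---------------------------------- Float16 Scalar and Vector Operations -------------------------
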
10773 instruct reinterpretS2HF(regF dst, rRegI src)
10774 %{
10775   match(Set dst (ReinterpretS2HF src));
10776   format %{ "vmovw $dst, $src" %}
10777   ins_encode %{
10778     __ vmovw($dst$$XMMRegister, $src$$Register);
10779   %}
10780   ins_pipe(pipe_slow);
10781 %}
10782 
10783 instruct reinterpretHF2S(rRegI dst, regF src)
10784 %{
10785   match(Set dst (ReinterpretHF2S src));
10786   format %{ "vmovw $dst, $src" %}
10787   ins_encode %{
10788     __ vmovw($dst$$Register, $src$$XMMRegister);
10789   %}
10790   ins_pipe(pipe_slow);
10791 %}
10792 
10793 instruct convF2HFAndS2HF(regF dst, regF src)
10794 %{
10795   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10796   format %{ "convF2HFAndS2HF $dst, $src" %}
10797   ins_encode %{
10798     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10799   %}
10800   ins_pipe(pipe_slow);
10801 %}
10802 
10803 instruct convHF2SAndHF2F(regF dst, regF src)
10804 %{
10805   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10806   format %{ "convHF2SAndHF2F $dst, $src" %}
10807   ins_encode %{
10808     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10809   %}
10810   ins_pipe(pipe_slow);
10811 %}
10812 
10813 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10814 %{
10815   match(Set dst (SqrtHF src));
10816   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10817   ins_encode %{
10818     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10819   %}
10820   ins_pipe(pipe_slow);
10821 %}
10822 
10823 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10824 %{
10825   match(Set dst (AddHF src1 src2));
10826   match(Set dst (DivHF src1 src2));
10827   match(Set dst (MulHF src1 src2));
10828   match(Set dst (SubHF src1 src2));
10829   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10830   ins_encode %{
10831     int opcode = this->ideal_Opcode();
10832     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10833   %}
10834   ins_pipe(pipe_slow);
10835 %}
10836 
10837 instruct scalar_minmax_HF_avx10_reg(regF dst, regF src1, regF src2)
10838 %{
10839   predicate(VM_Version::supports_avx10_2());
10840   match(Set dst (MaxHF src1 src2));
10841   match(Set dst (MinHF src1 src2));
10842   format %{ "scalar_min_max_fp16 $dst, $src1, $src2" %}
10843   ins_encode %{
10844     int function = this->ideal_Opcode() == Op_MinHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10845     __ eminmaxsh($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, function);
10846   %}
10847   ins_pipe( pipe_slow );
10848 %}
10849 
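// Fallback for targets without AVX10.2: min/max with Java semantics for NaN
// and -0.0 is emulated in the macro assembler using a mask register and two
// XMM temporaries.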
10850 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10851 %{
10852   predicate(!VM_Version::supports_avx10_2());
10853   match(Set dst (MaxHF src1 src2));
10854   match(Set dst (MinHF src1 src2));
10855   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10856   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10857   ins_encode %{
10858     int opcode = this->ideal_Opcode();
10859     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10860                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10861   %}
10862   ins_pipe( pipe_slow );
10863 %}
10864 
10865 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10866 %{
  match(Set dst (FmaHF src2 (Binary dst src1)));
  effect(DEF dst);
  format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma scalarH" %}
10870   ins_encode %{
10871     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10872   %}
10873   ins_pipe( pipe_slow );
10874 %}
10875 
10876 
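// Vector half-precision (Float16) operations. The *_mem variants fold the
// (VectorReinterpret (LoadVector ...)) subtree into the instruction's memory
// operand, so the reinterpreted short vector is consumed directly from memory.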
10877 instruct vector_sqrt_HF_reg(vec dst, vec src)
10878 %{
10879   match(Set dst (SqrtVHF src));
10880   format %{ "vector_sqrt_fp16 $dst, $src" %}
10881   ins_encode %{
10882     int vlen_enc = vector_length_encoding(this);
10883     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10884   %}
10885   ins_pipe(pipe_slow);
10886 %}
10887 
10888 instruct vector_sqrt_HF_mem(vec dst, memory src)
10889 %{
10890   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10891   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10892   ins_encode %{
10893     int vlen_enc = vector_length_encoding(this);
10894     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10895   %}
10896   ins_pipe(pipe_slow);
10897 %}
10898 
10899 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10900 %{
10901   match(Set dst (AddVHF src1 src2));
10902   match(Set dst (DivVHF src1 src2));
10903   match(Set dst (MulVHF src1 src2));
10904   match(Set dst (SubVHF src1 src2));
10905   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10906   ins_encode %{
10907     int vlen_enc = vector_length_encoding(this);
10908     int opcode = this->ideal_Opcode();
10909     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10910   %}
10911   ins_pipe(pipe_slow);
10912 %}
10913 
10914 
10915 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10916 %{
10917   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10918   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10919   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10920   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10921   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10922   ins_encode %{
10923     int vlen_enc = vector_length_encoding(this);
10924     int opcode = this->ideal_Opcode();
10925     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10926   %}
10927   ins_pipe(pipe_slow);
10928 %}
10929 
10930 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10931 %{
10932   match(Set dst (FmaVHF src2 (Binary dst src1)));
10933   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10934   ins_encode %{
10935     int vlen_enc = vector_length_encoding(this);
10936     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10937   %}
10938   ins_pipe( pipe_slow );
10939 %}
10940 
10941 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10942 %{
10943   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10944   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10945   ins_encode %{
10946     int vlen_enc = vector_length_encoding(this);
10947     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10948   %}
10949   ins_pipe( pipe_slow );
10950 %}
10951 
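// FP16 vector min/max. On AVX10.2 targets the dedicated min/max instruction is
// emitted, with the immediate 'function' argument selecting the min or max
// variant; older targets fall back to the multi-instruction sequence in
// vector_minmax_HF_reg below.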
10952 instruct vector_minmax_HF_avx10_mem(vec dst, vec src1, memory src2)
10953 %{
10954   predicate(VM_Version::supports_avx10_2());
10955   match(Set dst (MinVHF src1 (VectorReinterpret (LoadVector src2))));
10956   match(Set dst (MaxVHF src1 (VectorReinterpret (LoadVector src2))));
10957   format %{ "vector_min_max_fp16_mem $dst, $src1, $src2" %}
10958   ins_encode %{
10959     int vlen_enc = vector_length_encoding(this);
    int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10961     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$Address, true, function, vlen_enc);
10962   %}
10963   ins_pipe( pipe_slow );
10964 %}
10965 
10966 instruct vector_minmax_HF_avx10_reg(vec dst, vec src1, vec src2)
10967 %{
10968   predicate(VM_Version::supports_avx10_2());
10969   match(Set dst (MinVHF src1 src2));
10970   match(Set dst (MaxVHF src1 src2));
10971   format %{ "vector_min_max_fp16 $dst, $src1, $src2" %}
10972   ins_encode %{
10973     int vlen_enc = vector_length_encoding(this);
    int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10975     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, true, function, vlen_enc);
10976   %}
10977   ins_pipe( pipe_slow );
10978 %}
10979 
10980 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10981 %{
10982   predicate(!VM_Version::supports_avx10_2());
10983   match(Set dst (MinVHF src1 src2));
10984   match(Set dst (MaxVHF src1 src2));
10985   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10986   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10987   ins_encode %{
10988     int vlen_enc = vector_length_encoding(this);
10989     int opcode = this->ideal_Opcode();
10990     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10991                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10992   %}
10993   ins_pipe( pipe_slow );
10994 %}