//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
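//
// For example, the first definition below,
//      reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// reads as: the low 32-bit slot of xmm0 is a Save-On-Call register under
// both conventions, is spilled as a float (Op_RegF), and has encoding 0.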

// XMM registers.  512-bit registers, 16 words each, labeled (a)-(p).
// Word a in each register holds a Float; words a-b hold a Double.
// The whole registers are used by SSE4.2 intrinsics, the array copy stubs
// and superword operations (see the UseSSE42Intrinsics, UseXMMForArrayCopy
// and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls;
//              XMM0-XMM7 might hold parameters.
// Windows ABI: XMM6-XMM15 are preserved across function calls;
//              XMM0-XMM3 might hold parameters.
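//
// Concretely, XMM0 names bits 0-31 of the first vector register, XMM0b
// bits 32-63, and so on up to XMM0p (bits 480-511); each lettered slice
// below corresponds to one 32-bit VMReg slot.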

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
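// Each opmask register is 64 bits wide and is therefore described as two
// 32-bit VMReg slots (Kn and Kn_H).  k0 is left out: in EVEX encodings the
// k0 selector means "no masking", so it cannot be allocated as a write mask.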
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

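// The reg_class_dynamic entries below choose between the two static classes
// above at runtime: the first (EVEX) class is used when the %{ ... %}
// predicate holds, otherwise the legacy class is used.  The *_vl variants
// additionally require AVX512VL so that the upper bank (XMM16-XMM31) can be
// used at sub-512-bit vector lengths.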
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  930 // Class for legacy 128bit vector registers
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
  948 // Class for evex 128bit vector registers
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for legacy 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for evex 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
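// Singleton class for instructions whose encoding requires their vector operand in XMM0
// (for example, the SSE4.1 blendv forms take their mask implicitly in XMM0).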
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
 1142     // The exception handler starts out as a jump and can be patched to
 1143     // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5-byte instructions plus one move for an unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
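// Map a vector size in bytes to the AVX/EVEX vector length encoding expected by the
// assembler. Sub-16-byte vectors (4 and 8 bytes) are emitted with the 128-bit encoding.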
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
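// Platform-dependent node flags. Flag_intel_jcc_erratum marks machine nodes whose
// encoding may need extra padding to avoid the Intel JCC erratum (see
// MachNode::compute_padding() below); the sets_*/clears_* flags record which
// condition-code bits a node is known to set or clear.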
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
 1275   // push "the_pc" on the stack without destroying any registers,
 1276   // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
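  // These helpers return addresses of constants materialized by the stub routines;
  // the instruction patterns below reference them as memory operands.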
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (!VM_Version::supports_avx512dq()) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (!VM_Version::supports_on_spin_wait()) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (!VM_Version::supports_avx()) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
 1586 // Identify extra cases where we might want to provide match rules for vector nodes and
 1587 // other intrinsics, guarded by vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
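  // For example, a 512bit BYTE vector (vlen == 64) is only supported with AVX512BW,
  // while a 256bit FLOAT vector (vlen == 8) is already available with AVX1.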
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
 1613       } // fallthrough
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && !VM_Version::supports_avx512dq()) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && !VM_Version::supports_avx512dq()) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1678           // Float/Double intrinsics are enabled for AVX family currently.
 1679           if (UseAVX == 0) {
 1680             return false;
 1681           }
 1682           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683             return false;
 1684           }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
 1728       if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
 1765         // As per JLS section 5.1.3, narrowing conversions to sub-word types
 1766         // happen after an intermediate conversion to int, and the special handling
 1767         // code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1835          return false;
 1836        }
 1837        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1838          return false;
 1839        }
 1840        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1841          return false;
 1842        }
 1843        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1844          return false;
 1845        }
 1846        break;
 1847     case Op_MaskAll:
 1848       if (!VM_Version::supports_evex()) {
 1849         return false;
 1850       }
 1851       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1852         return false;
 1853       }
 1854       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1855         return false;
 1856       }
 1857       break;
 1858     case Op_VectorMaskCmp:
 1859       if (vlen < 2 || size_in_bits < 32) {
 1860         return false;
 1861       }
 1862       break;
 1863     case Op_CompressM:
 1864       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1865         return false;
 1866       }
 1867       break;
 1868     case Op_CompressV:
 1869     case Op_ExpandV:
 1870       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1871         return false;
 1872       }
 1873       if (size_in_bits < 128) {
 1874         return false;
 1875       }
 1876     case Op_VectorLongToMask:
 1877       if (UseAVX < 1) {
 1878         return false;
 1879       }
 1880       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1881         return false;
 1882       }
 1883       break;
 1884     case Op_SignumVD:
 1885     case Op_SignumVF:
 1886       if (UseAVX < 1) {
 1887         return false;
 1888       }
 1889       break;
 1890     case Op_PopCountVI:
 1891     case Op_PopCountVL: {
 1892         if (!is_pop_count_instr_target(bt) &&
 1893             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1894           return false;
 1895         }
 1896       }
 1897       break;
 1898     case Op_ReverseV:
 1899     case Op_ReverseBytesV:
 1900       if (UseAVX < 2) {
 1901         return false;
 1902       }
 1903       break;
 1904     case Op_CountTrailingZerosV:
 1905     case Op_CountLeadingZerosV:
 1906       if (UseAVX < 2) {
 1907         return false;
 1908       }
 1909       break;
 1910   }
 1911   return true;  // Match rules are supported by default.
 1912 }
 1913 
 1914 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 1915   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
 1916   // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
 1917   // nodes of their non-masked counterparts, with the mask edge being the differentiator.
 1918   // This routine does a strict check for the existence of masked operation patterns:
 1919   // it returns false by default for all opcodes except the ones whose masked
 1920   // instruction patterns are defined in this file.
 1921   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1922     return false;
 1923   }
 1924 
 1925   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1926   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1927     return false;
 1928   }
 1929   switch(opcode) {
 1930     // Unary masked operations
 1931     case Op_AbsVB:
 1932     case Op_AbsVS:
 1933       if (!VM_Version::supports_avx512bw()) {
 1934         return false;  // Implementation limitation
 1935       } // fallthrough
 1936     case Op_AbsVI:
 1937     case Op_AbsVL:
 1938       return true;
 1939 
 1940     // Ternary masked operations
 1941     case Op_FmaVF:
 1942     case Op_FmaVD:
 1943       return true;
 1944 
 1945     case Op_MacroLogicV:
 1946       if (bt != T_INT && bt != T_LONG) {
 1947         return false;
 1948       }
 1949       return true;
 1950 
 1951     // Binary masked operations
 1952     case Op_AddVB:
 1953     case Op_AddVS:
 1954     case Op_SubVB:
 1955     case Op_SubVS:
 1956     case Op_MulVS:
 1957     case Op_LShiftVS:
 1958     case Op_RShiftVS:
 1959     case Op_URShiftVS:
 1960       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1961       if (!VM_Version::supports_avx512bw()) {
 1962         return false;  // Implementation limitation
 1963       }
 1964       return true;
 1965 
 1966     case Op_MulVL:
 1967       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1968       if (!VM_Version::supports_avx512dq()) {
 1969         return false;  // Implementation limitation
 1970       }
 1971       return true;
 1972 
 1973     case Op_AndV:
 1974     case Op_OrV:
 1975     case Op_XorV:
 1976     case Op_RotateRightV:
 1977     case Op_RotateLeftV:
 1978       if (bt != T_INT && bt != T_LONG) {
 1979         return false; // Implementation limitation
 1980       }
 1981       return true;
 1982 
 1983     case Op_VectorLoadMask:
 1984       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1985       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1986         return false;
 1987       }
 1988       return true;
 1989 
 1990     case Op_AddVI:
 1991     case Op_AddVL:
 1992     case Op_AddVF:
 1993     case Op_AddVD:
 1994     case Op_SubVI:
 1995     case Op_SubVL:
 1996     case Op_SubVF:
 1997     case Op_SubVD:
 1998     case Op_MulVI:
 1999     case Op_MulVF:
 2000     case Op_MulVD:
 2001     case Op_DivVF:
 2002     case Op_DivVD:
 2003     case Op_SqrtVF:
 2004     case Op_SqrtVD:
 2005     case Op_LShiftVI:
 2006     case Op_LShiftVL:
 2007     case Op_RShiftVI:
 2008     case Op_RShiftVL:
 2009     case Op_URShiftVI:
 2010     case Op_URShiftVL:
 2011     case Op_LoadVectorMasked:
 2012     case Op_StoreVectorMasked:
 2013     case Op_LoadVectorGatherMasked:
 2014     case Op_StoreVectorScatterMasked:
 2015       return true;
 2016 
 2017     case Op_UMinV:
 2018     case Op_UMaxV:
 2019       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2020         return false;
 2021       } // fallthrough
 2022     case Op_MaxV:
 2023     case Op_MinV:
 2024       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2025         return false; // Implementation limitation
 2026       }
 2027       if (is_floating_point_type(bt) && !VM_Version::supports_avx10_2()) {
 2028         return false; // Implementation limitation
 2029       }
 2030       return true;
 2031     case Op_SaturatingAddV:
 2032     case Op_SaturatingSubV:
 2033       if (!is_subword_type(bt)) {
 2034         return false;
 2035       }
 2036       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2037         return false; // Implementation limitation
 2038       }
 2039       return true;
 2040 
 2041     case Op_VectorMaskCmp:
 2042       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2043         return false; // Implementation limitation
 2044       }
 2045       return true;
 2046 
 2047     case Op_VectorRearrange:
 2048       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2049         return false; // Implementation limitation
 2050       }
 2051       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2052         return false; // Implementation limitation
 2053       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2054         return false; // Implementation limitation
 2055       }
 2056       return true;
 2057 
 2058     // Binary Logical operations
 2059     case Op_AndVMask:
 2060     case Op_OrVMask:
 2061     case Op_XorVMask:
 2062       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2063         return false; // Implementation limitation
 2064       }
 2065       return true;
 2066 
 2067     case Op_PopCountVI:
 2068     case Op_PopCountVL:
 2069       if (!is_pop_count_instr_target(bt)) {
 2070         return false;
 2071       }
 2072       return true;
 2073 
 2074     case Op_MaskAll:
 2075       return true;
 2076 
 2077     case Op_CountLeadingZerosV:
 2078       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2079         return true;
 2080       }
 2081     default:
 2082       return false;
 2083   }
 2084 }
 2085 
 2086 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2087   return false;
 2088 }
 2089 
 2090 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2091 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2092   switch (elem_bt) {
 2093     case T_BYTE:  return false;
 2094     case T_SHORT: return !VM_Version::supports_avx512bw();
 2095     case T_INT:   return !VM_Version::supports_avx();
 2096     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2097     default:
 2098       ShouldNotReachHere();
 2099       return false;
 2100   }
 2101 }
 2102 
 2103 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2104   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2105   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2106   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2107       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2108     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2109     return new legVecZOper();
 2110   }
 2111   if (legacy) {
 2112     switch (ideal_reg) {
 2113       case Op_VecS: return new legVecSOper();
 2114       case Op_VecD: return new legVecDOper();
 2115       case Op_VecX: return new legVecXOper();
 2116       case Op_VecY: return new legVecYOper();
 2117       case Op_VecZ: return new legVecZOper();
 2118     }
 2119   } else {
 2120     switch (ideal_reg) {
 2121       case Op_VecS: return new vecSOper();
 2122       case Op_VecD: return new vecDOper();
 2123       case Op_VecX: return new vecXOper();
 2124       case Op_VecY: return new vecYOper();
 2125       case Op_VecZ: return new vecZOper();
 2126     }
 2127   }
 2128   ShouldNotReachHere();
 2129   return nullptr;
 2130 }
 2131 
 2132 bool Matcher::is_reg2reg_move(MachNode* m) {
 2133   switch (m->rule()) {
 2134     case MoveVec2Leg_rule:
 2135     case MoveLeg2Vec_rule:
 2136     case MoveF2VL_rule:
 2137     case MoveF2LEG_rule:
 2138     case MoveVL2F_rule:
 2139     case MoveLEG2F_rule:
 2140     case MoveD2VL_rule:
 2141     case MoveD2LEG_rule:
 2142     case MoveVL2D_rule:
 2143     case MoveLEG2D_rule:
 2144       return true;
 2145     default:
 2146       return false;
 2147   }
 2148 }
 2149 
 2150 bool Matcher::is_generic_vector(MachOper* opnd) {
 2151   switch (opnd->opcode()) {
 2152     case VEC:
 2153     case LEGVEC:
 2154       return true;
 2155     default:
 2156       return false;
 2157   }
 2158 }
 2159 
 2160 //------------------------------------------------------------------------
 2161 
 2162 const RegMask* Matcher::predicate_reg_mask(void) {
 2163   return &_VECTMASK_REG_mask;
 2164 }
 2165 
 2166 // Max vector size in bytes. 0 if not supported.
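// For example, with UseAVX == 2 and MaxVectorSize >= 32 this returns 32 for T_INT;
// with UseAVX == 3, AVX512BW and MaxVectorSize >= 64 it returns 64 for T_BYTE.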
 2167 int Matcher::vector_width_in_bytes(BasicType bt) {
 2168   assert(is_java_primitive(bt), "only primitive type vectors");
 2169   // SSE2 supports 128bit vectors for all types.
 2170   // AVX2 supports 256bit vectors for all types.
 2171   // AVX512/EVEX supports 512bit vectors for all types (subword types additionally need AVX512BW, see below).
 2172   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2173   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2174   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2175     size = (UseAVX > 2) ? 64 : 32;
 2176   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2177     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2178   // Use flag to limit vector size.
 2179   size = MIN2(size,(int)MaxVectorSize);
 2180   // Minimum 2 values in vector (or 4 for bytes).
 2181   switch (bt) {
 2182   case T_DOUBLE:
 2183   case T_LONG:
 2184     if (size < 16) return 0;
 2185     break;
 2186   case T_FLOAT:
 2187   case T_INT:
 2188     if (size < 8) return 0;
 2189     break;
 2190   case T_BOOLEAN:
 2191     if (size < 4) return 0;
 2192     break;
 2193   case T_CHAR:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_BYTE:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_SHORT:
 2200     if (size < 4) return 0;
 2201     break;
 2202   default:
 2203     ShouldNotReachHere();
 2204   }
 2205   return size;
 2206 }
 2207 
 2208 // Limits on vector size (number of elements) loaded into vector.
 2209 int Matcher::max_vector_size(const BasicType bt) {
 2210   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2211 }
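// Minimum vector size in elements: 2 for most types, 4 for byte-sized elements,
// and 1 for T_DOUBLE (to allow single-element SVML calls), capped by max_vector_size(bt).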
 2212 int Matcher::min_vector_size(const BasicType bt) {
 2213   int max_size = max_vector_size(bt);
 2214   // Min size which can be loaded into vector is 4 bytes.
 2215   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2216   // Support for calling SVML stubs with Double64Vector (a single double element)
 2217   if (bt == T_DOUBLE) {
 2218     size = 1;
 2219   }
 2220   return MIN2(size,max_size);
 2221 }
 2222 
 2223 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2224   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2225   // by default on Cascade Lake
 2226   if (VM_Version::is_default_intel_cascade_lake()) {
 2227     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2228   }
 2229   return Matcher::max_vector_size(bt);
 2230 }
 2231 
 2232 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2233   return -1;
 2234 }
 2235 
 2236 // Vector ideal reg corresponding to specified size in bytes
 2237 uint Matcher::vector_ideal_reg(int size) {
 2238   assert(MaxVectorSize >= size, "");
 2239   switch(size) {
 2240     case  4: return Op_VecS;
 2241     case  8: return Op_VecD;
 2242     case 16: return Op_VecX;
 2243     case 32: return Op_VecY;
 2244     case 64: return Op_VecZ;
 2245   }
 2246   ShouldNotReachHere();
 2247   return 0;
 2248 }
 2249 
 2250 // Check for shift by small constant as well
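// A shift amount of at most 3 corresponds to the x86 scale factors 1, 2, 4 and 8
// in a [base + index*scale + disp] addressing expression.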
 2251 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2252   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2253       shift->in(2)->get_int() <= 3 &&
 2254       // Are there other uses besides address expressions?
 2255       !matcher->is_visited(shift)) {
 2256     address_visited.set(shift->_idx); // Flag as address_visited
 2257     mstack.push(shift->in(2), Matcher::Visit);
 2258     Node *conv = shift->in(1);
    // Allow the Matcher to match the rule that bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
 2262     if (conv->Opcode() == Op_ConvI2L &&
 2263         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2264         // Are there other uses besides address expressions?
 2265         !matcher->is_visited(conv)) {
 2266       address_visited.set(conv->_idx); // Flag as address_visited
 2267       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2268     } else {
 2269       mstack.push(conv, Matcher::Pre_Visit);
 2270     }
 2271     return true;
 2272   }
 2273   return false;
 2274 }
 2275 
// The code below identifies sub-graphs in which a 'load' node is
// an input to two different nodes and which can be matched with
// BMI instructions such as blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to 'blsi r32, m32'.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same node.
 2282 //
 2283 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2284 // This is a temporary solution until we make DAGs expressible in ADL.
 2285 template<typename ConType>
 2286 class FusedPatternMatcher {
 2287   Node* _op1_node;
 2288   Node* _mop_node;
 2289   int _con_op;
 2290 
 2291   static int match_next(Node* n, int next_op, int next_op_idx) {
 2292     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2293       return -1;
 2294     }
 2295 
 2296     if (next_op_idx == -1) { // n is commutative, try rotations
 2297       if (n->in(1)->Opcode() == next_op) {
 2298         return 1;
 2299       } else if (n->in(2)->Opcode() == next_op) {
 2300         return 2;
 2301       }
 2302     } else {
 2303       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2304       if (n->in(next_op_idx)->Opcode() == next_op) {
 2305         return next_op_idx;
 2306       }
 2307     }
 2308     return -1;
 2309   }
 2310 
 2311  public:
 2312   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2313     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2314 
 2315   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2316              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2317              typename ConType::NativeType con_value) {
 2318     if (_op1_node->Opcode() != op1) {
 2319       return false;
 2320     }
 2321     if (_mop_node->outcnt() > 2) {
 2322       return false;
 2323     }
 2324     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2325     if (op1_op2_idx == -1) {
 2326       return false;
 2327     }
 2328     // Memory operation must be the other edge
 2329     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2330 
 2331     // Check that the mop node is really what we want
 2332     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2333       Node* op2_node = _op1_node->in(op1_op2_idx);
 2334       if (op2_node->outcnt() > 1) {
 2335         return false;
 2336       }
 2337       assert(op2_node->Opcode() == op2, "Should be");
 2338       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2339       if (op2_con_idx == -1) {
 2340         return false;
 2341       }
 2342       // Memory operation must be the other edge
 2343       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2344       // Check that the memory operation is the same node
 2345       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2346         // Now check the constant
 2347         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2348         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2349           return true;
 2350         }
 2351       }
 2352     }
 2353     return false;
 2354   }
 2355 };
 2356 
 2357 static bool is_bmi_pattern(Node* n, Node* m) {
 2358   assert(UseBMI1Instructions, "sanity");
 2359   if (n != nullptr && m != nullptr) {
 2360     if (m->Opcode() == Op_LoadI) {
 2361       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2362       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2363              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2364              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2365     } else if (m->Opcode() == Op_LoadL) {
 2366       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2367       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2368              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2369              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2370     }
 2371   }
 2372   return false;
 2373 }
 2374 
 2375 // Should the matcher clone input 'm' of node 'n'?
 2376 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2377   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2378   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2379     mstack.push(m, Visit);
 2380     return true;
 2381   }
 2382   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2383     mstack.push(m, Visit);           // m = ShiftCntV
 2384     return true;
 2385   }
 2386   if (is_encode_and_store_pattern(n, m)) {
 2387     mstack.push(m, Visit);
 2388     return true;
 2389   }
 2390   return false;
 2391 }
 2392 
 2393 // Should the Matcher clone shifts on addressing modes, expecting them
 2394 // to be subsumed into complex addressing expressions or compute them
 2395 // into registers?
 2396 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2397   Node *off = m->in(AddPNode::Offset);
 2398   if (off->is_Con()) {
 2399     address_visited.test_set(m->_idx); // Flag as address_visited
 2400     Node *adr = m->in(AddPNode::Address);
 2401 
 2402     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2403     // AtomicAdd is not an addressing expression.
 2404     // Cheap to find it by looking for screwy base.
 2405     if (adr->is_AddP() &&
 2406         !adr->in(AddPNode::Base)->is_top() &&
 2407         !adr->in(AddPNode::Offset)->is_Con() &&
 2408         off->get_long() == (int) (off->get_long()) && // immL32
 2409         // Are there other uses besides address expressions?
 2410         !is_visited(adr)) {
 2411       address_visited.set(adr->_idx); // Flag as address_visited
 2412       Node *shift = adr->in(AddPNode::Offset);
 2413       if (!clone_shift(shift, this, mstack, address_visited)) {
 2414         mstack.push(shift, Pre_Visit);
 2415       }
 2416       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2417       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2418     } else {
 2419       mstack.push(adr, Pre_Visit);
 2420     }
 2421 
 2422     // Clone X+offset as it also folds into most addressing expressions
 2423     mstack.push(off, Visit);
 2424     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2425     return true;
 2426   } else if (clone_shift(off, this, mstack, address_visited)) {
 2427     address_visited.test_set(m->_idx); // Flag as address_visited
 2428     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2429     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2430     return true;
 2431   }
 2432   return false;
 2433 }
 2434 
 2435 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2436   switch (bt) {
 2437     case BoolTest::eq:
 2438       return Assembler::eq;
 2439     case BoolTest::ne:
 2440       return Assembler::neq;
 2441     case BoolTest::le:
 2442     case BoolTest::ule:
 2443       return Assembler::le;
 2444     case BoolTest::ge:
 2445     case BoolTest::uge:
 2446       return Assembler::nlt;
 2447     case BoolTest::lt:
 2448     case BoolTest::ult:
 2449       return Assembler::lt;
 2450     case BoolTest::gt:
 2451     case BoolTest::ugt:
 2452       return Assembler::nle;
 2453     default : ShouldNotReachHere(); return Assembler::_false;
 2454   }
 2455 }
 2456 
 2457 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2458   switch (bt) {
 2459   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2460   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2461   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2462   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2463   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2464   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2465   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2466   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2467   }
 2468 }
 2469 
 2470 // Helper methods for MachSpillCopyNode::implementation().
 2471 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2472                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2473   assert(ireg == Op_VecS || // 32bit vector
 2474          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2475           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2476          "no non-adjacent vector moves" );
 2477   if (masm) {
 2478     switch (ireg) {
 2479     case Op_VecS: // copy whole register
 2480     case Op_VecD:
 2481     case Op_VecX:
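      // Without AVX512VL a plain (v)movdqu cannot encode the upper bank of XMM registers
      // (xmm16-31), so fall back to an EVEX-encoded extract of lane 0 instead.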
 2482       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2483         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2484       } else {
 2485         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2487       break;
 2488     case Op_VecY:
 2489       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2490         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2491       } else {
 2492         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2494       break;
 2495     case Op_VecZ:
 2496       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2497       break;
 2498     default:
 2499       ShouldNotReachHere();
 2500     }
 2501 #ifndef PRODUCT
 2502   } else {
 2503     switch (ireg) {
 2504     case Op_VecS:
 2505     case Op_VecD:
 2506     case Op_VecX:
 2507       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2508       break;
 2509     case Op_VecY:
 2510     case Op_VecZ:
 2511       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2512       break;
 2513     default:
 2514       ShouldNotReachHere();
 2515     }
 2516 #endif
 2517   }
 2518 }
 2519 
 2520 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2521                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2522   if (masm) {
 2523     if (is_load) {
 2524       switch (ireg) {
 2525       case Op_VecS:
 2526         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2527         break;
 2528       case Op_VecD:
 2529         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecX:
 2532         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2533           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2534         } else {
 2535           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2536           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2537         }
 2538         break;
 2539       case Op_VecY:
 2540         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2541           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2542         } else {
 2543           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2544           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2545         }
 2546         break;
 2547       case Op_VecZ:
 2548         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2549         break;
 2550       default:
 2551         ShouldNotReachHere();
 2552       }
 2553     } else { // store
 2554       switch (ireg) {
 2555       case Op_VecS:
 2556         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2557         break;
 2558       case Op_VecD:
 2559         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecX:
 2562         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2563           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2564         }
 2565         else {
 2566           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2567         }
 2568         break;
 2569       case Op_VecY:
 2570         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2571           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2572         }
 2573         else {
 2574           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2575         }
 2576         break;
 2577       case Op_VecZ:
 2578         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2579         break;
 2580       default:
 2581         ShouldNotReachHere();
 2582       }
 2583     }
 2584 #ifndef PRODUCT
 2585   } else {
 2586     if (is_load) {
 2587       switch (ireg) {
 2588       case Op_VecS:
 2589         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2590         break;
 2591       case Op_VecD:
 2592         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
      case Op_VecX:
 2595         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597       case Op_VecY:
 2598       case Op_VecZ:
 2599         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2600         break;
 2601       default:
 2602         ShouldNotReachHere();
 2603       }
 2604     } else { // store
 2605       switch (ireg) {
 2606       case Op_VecS:
 2607         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2608         break;
 2609       case Op_VecD:
 2610         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
      case Op_VecX:
 2613         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615       case Op_VecY:
 2616       case Op_VecZ:
 2617         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2618         break;
 2619       default:
 2620         ShouldNotReachHere();
 2621       }
 2622     }
 2623 #endif
 2624   }
 2625 }
 2626 
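// Pack the scalar constant 'con', repeated 'len' times, into a newly allocated byte
// array of 'len' elements of type 'bt', written element by element in memory order.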
 2627 template <class T>
 2628 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2629   int size = type2aelembytes(bt) * len;
 2630   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2631   for (int i = 0; i < len; i++) {
 2632     int offset = i * type2aelembytes(bt);
 2633     switch (bt) {
 2634       case T_BYTE: val->at(i) = con; break;
 2635       case T_SHORT: {
 2636         jshort c = con;
 2637         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2638         break;
 2639       }
 2640       case T_INT: {
 2641         jint c = con;
 2642         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2643         break;
 2644       }
 2645       case T_LONG: {
 2646         jlong c = con;
 2647         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2648         break;
 2649       }
 2650       case T_FLOAT: {
 2651         jfloat c = con;
 2652         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2653         break;
 2654       }
 2655       case T_DOUBLE: {
 2656         jdouble c = con;
 2657         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2658         break;
 2659       }
 2660       default: assert(false, "%s", type2name(bt));
 2661     }
 2662   }
 2663   return val;
 2664 }
 2665 
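// Returns a 64-bit pattern with only the sign (most significant) bit of each
// 'bt'-sized lane set, e.g. 0x80 repeated eight times for T_BYTE.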
 2666 static inline jlong high_bit_set(BasicType bt) {
 2667   switch (bt) {
 2668     case T_BYTE:  return 0x8080808080808080;
 2669     case T_SHORT: return 0x8000800080008000;
 2670     case T_INT:   return 0x8000000080000000;
 2671     case T_LONG:  return 0x8000000000000000;
 2672     default:
 2673       ShouldNotReachHere();
 2674       return 0;
 2675   }
 2676 }
 2677 
 2678 #ifndef PRODUCT
 2679   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2680     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2681   }
 2682 #endif
 2683 
 2684   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2685     __ nop(_count);
 2686   }
 2687 
 2688   uint MachNopNode::size(PhaseRegAlloc*) const {
 2689     return _count;
 2690   }
 2691 
 2692 #ifndef PRODUCT
 2693   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2694     st->print("# breakpoint");
 2695   }
 2696 #endif
 2697 
 2698   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2699     __ int3();
 2700   }
 2701 
 2702   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2703     return MachNode::size(ra_);
 2704   }
 2705 
 2706 %}
 2707 
 2708 encode %{
 2709 
 2710   enc_class call_epilog %{
 2711     if (VerifyStackAtCalls) {
 2712       // Check that stack depth is unchanged: find majik cookie on stack
 2713       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2714       Label L;
 2715       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2716       __ jccb(Assembler::equal, L);
 2717       // Die if stack mismatch
 2718       __ int3();
 2719       __ bind(L);
 2720     }
 2721     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic() && _method->return_type()->is_loaded()) {
 2722       // The last return value is not set by the callee but used to pass the null marker to compiled code.
      // Search for the corresponding projection, get the register and emit code that initializes it.
 2724       uint con = (tf()->range_cc()->cnt() - 1);
 2725       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2726         ProjNode* proj = fast_out(i)->as_Proj();
 2727         if (proj->_con == con) {
 2728           // Set null marker if rax is non-null (a non-null value is returned buffered or scalarized)
 2729           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2730           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2731           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2732           __ testq(rax, rax);
 2733           __ setb(Assembler::notZero, toReg);
 2734           __ movzbl(toReg, toReg);
 2735           if (reg->is_stack()) {
 2736             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2737             __ movq(Address(rsp, st_off), toReg);
 2738           }
 2739           break;
 2740         }
 2741       }
 2742       if (return_value_is_used()) {
 2743         // An inline type is returned as fields in multiple registers.
 2744         // Rax either contains an oop if the inline type is buffered or a pointer
 2745         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2746         // if the lowest bit is set to allow C2 to use the oop after null checking.
 2747         // rax &= (rax & 1) - 1
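        // If the low bit is 1 (InlineKlass pointer), (rax & 1) - 1 == 0 and rax is cleared;
        // if the low bit is 0 (oop or null), the mask is all ones and rax is left unchanged.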
 2748         __ movptr(rscratch1, rax);
 2749         __ andptr(rscratch1, 0x1);
 2750         __ subptr(rscratch1, 0x1);
 2751         __ andptr(rax, rscratch1);
 2752       }
 2753     }
 2754   %}
 2755 
 2756 %}
 2757 
// Operands for bound floating-point register arguments
 2759 operand rxmm0() %{
 2760   constraint(ALLOC_IN_RC(xmm0_reg));
 2761   match(VecX);
  format %{ %}
 2763   interface(REG_INTER);
 2764 %}
 2765 
 2766 //----------OPERANDS-----------------------------------------------------------
 2767 // Operand definitions must precede instruction definitions for correct parsing
 2768 // in the ADLC because operands constitute user defined types which are used in
 2769 // instruction definitions.
 2770 
 2771 // Vectors
 2772 
 2773 // Dummy generic vector class. Should be used for all vector operands.
 2774 // Replaced with vec[SDXYZ] during post-selection pass.
 2775 operand vec() %{
 2776   constraint(ALLOC_IN_RC(dynamic));
 2777   match(VecX);
 2778   match(VecY);
 2779   match(VecZ);
 2780   match(VecS);
 2781   match(VecD);
 2782 
 2783   format %{ %}
 2784   interface(REG_INTER);
 2785 %}
 2786 
 2787 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2788 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2789 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2790 // runtime code generation via reg_class_dynamic.
 2791 operand legVec() %{
 2792   constraint(ALLOC_IN_RC(dynamic));
 2793   match(VecX);
 2794   match(VecY);
 2795   match(VecZ);
 2796   match(VecS);
 2797   match(VecD);
 2798 
 2799   format %{ %}
 2800   interface(REG_INTER);
 2801 %}
 2802 
 2803 // Replaces vec during post-selection cleanup. See above.
 2804 operand vecS() %{
 2805   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2806   match(VecS);
 2807 
 2808   format %{ %}
 2809   interface(REG_INTER);
 2810 %}
 2811 
 2812 // Replaces legVec during post-selection cleanup. See above.
 2813 operand legVecS() %{
 2814   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2815   match(VecS);
 2816 
 2817   format %{ %}
 2818   interface(REG_INTER);
 2819 %}
 2820 
 2821 // Replaces vec during post-selection cleanup. See above.
 2822 operand vecD() %{
 2823   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2824   match(VecD);
 2825 
 2826   format %{ %}
 2827   interface(REG_INTER);
 2828 %}
 2829 
 2830 // Replaces legVec during post-selection cleanup. See above.
 2831 operand legVecD() %{
 2832   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2833   match(VecD);
 2834 
 2835   format %{ %}
 2836   interface(REG_INTER);
 2837 %}
 2838 
 2839 // Replaces vec during post-selection cleanup. See above.
 2840 operand vecX() %{
 2841   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2842   match(VecX);
 2843 
 2844   format %{ %}
 2845   interface(REG_INTER);
 2846 %}
 2847 
 2848 // Replaces legVec during post-selection cleanup. See above.
 2849 operand legVecX() %{
 2850   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2851   match(VecX);
 2852 
 2853   format %{ %}
 2854   interface(REG_INTER);
 2855 %}
 2856 
 2857 // Replaces vec during post-selection cleanup. See above.
 2858 operand vecY() %{
 2859   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2860   match(VecY);
 2861 
 2862   format %{ %}
 2863   interface(REG_INTER);
 2864 %}
 2865 
 2866 // Replaces legVec during post-selection cleanup. See above.
 2867 operand legVecY() %{
 2868   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2869   match(VecY);
 2870 
 2871   format %{ %}
 2872   interface(REG_INTER);
 2873 %}
 2874 
 2875 // Replaces vec during post-selection cleanup. See above.
 2876 operand vecZ() %{
 2877   constraint(ALLOC_IN_RC(vectorz_reg));
 2878   match(VecZ);
 2879 
 2880   format %{ %}
 2881   interface(REG_INTER);
 2882 %}
 2883 
 2884 // Replaces legVec during post-selection cleanup. See above.
 2885 operand legVecZ() %{
 2886   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2887   match(VecZ);
 2888 
 2889   format %{ %}
 2890   interface(REG_INTER);
 2891 %}
 2892 
 2893 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2894 
 2895 // ============================================================================
 2896 
 2897 instruct ShouldNotReachHere() %{
 2898   match(Halt);
 2899   format %{ "stop\t# ShouldNotReachHere" %}
 2900   ins_encode %{
 2901     if (is_reachable()) {
 2902       const char* str = __ code_string(_halt_reason);
 2903       __ stop(str);
 2904     }
 2905   %}
 2906   ins_pipe(pipe_slow);
 2907 %}
 2908 
 2909 // ============================================================================
 2910 
 2911 instruct addF_reg(regF dst, regF src) %{
 2912   predicate(UseAVX == 0);
 2913   match(Set dst (AddF dst src));
 2914 
 2915   format %{ "addss   $dst, $src" %}
 2916   ins_cost(150);
 2917   ins_encode %{
 2918     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2919   %}
 2920   ins_pipe(pipe_slow);
 2921 %}
 2922 
 2923 instruct addF_mem(regF dst, memory src) %{
 2924   predicate(UseAVX == 0);
 2925   match(Set dst (AddF dst (LoadF src)));
 2926 
 2927   format %{ "addss   $dst, $src" %}
 2928   ins_cost(150);
 2929   ins_encode %{
 2930     __ addss($dst$$XMMRegister, $src$$Address);
 2931   %}
 2932   ins_pipe(pipe_slow);
 2933 %}
 2934 
 2935 instruct addF_imm(regF dst, immF con) %{
 2936   predicate(UseAVX == 0);
 2937   match(Set dst (AddF dst con));
 2938   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2939   ins_cost(150);
 2940   ins_encode %{
 2941     __ addss($dst$$XMMRegister, $constantaddress($con));
 2942   %}
 2943   ins_pipe(pipe_slow);
 2944 %}
 2945 
 2946 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2947   predicate(UseAVX > 0);
 2948   match(Set dst (AddF src1 src2));
 2949 
 2950   format %{ "vaddss  $dst, $src1, $src2" %}
 2951   ins_cost(150);
 2952   ins_encode %{
 2953     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2954   %}
 2955   ins_pipe(pipe_slow);
 2956 %}
 2957 
 2958 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2959   predicate(UseAVX > 0);
 2960   match(Set dst (AddF src1 (LoadF src2)));
 2961 
 2962   format %{ "vaddss  $dst, $src1, $src2" %}
 2963   ins_cost(150);
 2964   ins_encode %{
 2965     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2966   %}
 2967   ins_pipe(pipe_slow);
 2968 %}
 2969 
 2970 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2971   predicate(UseAVX > 0);
 2972   match(Set dst (AddF src con));
 2973 
 2974   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2975   ins_cost(150);
 2976   ins_encode %{
 2977     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2978   %}
 2979   ins_pipe(pipe_slow);
 2980 %}
 2981 
 2982 instruct addD_reg(regD dst, regD src) %{
 2983   predicate(UseAVX == 0);
 2984   match(Set dst (AddD dst src));
 2985 
 2986   format %{ "addsd   $dst, $src" %}
 2987   ins_cost(150);
 2988   ins_encode %{
 2989     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2990   %}
 2991   ins_pipe(pipe_slow);
 2992 %}
 2993 
 2994 instruct addD_mem(regD dst, memory src) %{
 2995   predicate(UseAVX == 0);
 2996   match(Set dst (AddD dst (LoadD src)));
 2997 
 2998   format %{ "addsd   $dst, $src" %}
 2999   ins_cost(150);
 3000   ins_encode %{
 3001     __ addsd($dst$$XMMRegister, $src$$Address);
 3002   %}
 3003   ins_pipe(pipe_slow);
 3004 %}
 3005 
 3006 instruct addD_imm(regD dst, immD con) %{
 3007   predicate(UseAVX == 0);
 3008   match(Set dst (AddD dst con));
 3009   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3010   ins_cost(150);
 3011   ins_encode %{
 3012     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3013   %}
 3014   ins_pipe(pipe_slow);
 3015 %}
 3016 
 3017 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3018   predicate(UseAVX > 0);
 3019   match(Set dst (AddD src1 src2));
 3020 
 3021   format %{ "vaddsd  $dst, $src1, $src2" %}
 3022   ins_cost(150);
 3023   ins_encode %{
 3024     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3025   %}
 3026   ins_pipe(pipe_slow);
 3027 %}
 3028 
 3029 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3030   predicate(UseAVX > 0);
 3031   match(Set dst (AddD src1 (LoadD src2)));
 3032 
 3033   format %{ "vaddsd  $dst, $src1, $src2" %}
 3034   ins_cost(150);
 3035   ins_encode %{
 3036     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3037   %}
 3038   ins_pipe(pipe_slow);
 3039 %}
 3040 
 3041 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3042   predicate(UseAVX > 0);
 3043   match(Set dst (AddD src con));
 3044 
 3045   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3046   ins_cost(150);
 3047   ins_encode %{
 3048     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3049   %}
 3050   ins_pipe(pipe_slow);
 3051 %}
 3052 
 3053 instruct subF_reg(regF dst, regF src) %{
 3054   predicate(UseAVX == 0);
 3055   match(Set dst (SubF dst src));
 3056 
 3057   format %{ "subss   $dst, $src" %}
 3058   ins_cost(150);
 3059   ins_encode %{
 3060     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3061   %}
 3062   ins_pipe(pipe_slow);
 3063 %}
 3064 
 3065 instruct subF_mem(regF dst, memory src) %{
 3066   predicate(UseAVX == 0);
 3067   match(Set dst (SubF dst (LoadF src)));
 3068 
 3069   format %{ "subss   $dst, $src" %}
 3070   ins_cost(150);
 3071   ins_encode %{
 3072     __ subss($dst$$XMMRegister, $src$$Address);
 3073   %}
 3074   ins_pipe(pipe_slow);
 3075 %}
 3076 
 3077 instruct subF_imm(regF dst, immF con) %{
 3078   predicate(UseAVX == 0);
 3079   match(Set dst (SubF dst con));
 3080   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3081   ins_cost(150);
 3082   ins_encode %{
 3083     __ subss($dst$$XMMRegister, $constantaddress($con));
 3084   %}
 3085   ins_pipe(pipe_slow);
 3086 %}
 3087 
 3088 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3089   predicate(UseAVX > 0);
 3090   match(Set dst (SubF src1 src2));
 3091 
 3092   format %{ "vsubss  $dst, $src1, $src2" %}
 3093   ins_cost(150);
 3094   ins_encode %{
 3095     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3096   %}
 3097   ins_pipe(pipe_slow);
 3098 %}
 3099 
 3100 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3101   predicate(UseAVX > 0);
 3102   match(Set dst (SubF src1 (LoadF src2)));
 3103 
 3104   format %{ "vsubss  $dst, $src1, $src2" %}
 3105   ins_cost(150);
 3106   ins_encode %{
 3107     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3108   %}
 3109   ins_pipe(pipe_slow);
 3110 %}
 3111 
 3112 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3113   predicate(UseAVX > 0);
 3114   match(Set dst (SubF src con));
 3115 
 3116   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3117   ins_cost(150);
 3118   ins_encode %{
 3119     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3120   %}
 3121   ins_pipe(pipe_slow);
 3122 %}
 3123 
 3124 instruct subD_reg(regD dst, regD src) %{
 3125   predicate(UseAVX == 0);
 3126   match(Set dst (SubD dst src));
 3127 
 3128   format %{ "subsd   $dst, $src" %}
 3129   ins_cost(150);
 3130   ins_encode %{
 3131     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3132   %}
 3133   ins_pipe(pipe_slow);
 3134 %}
 3135 
 3136 instruct subD_mem(regD dst, memory src) %{
 3137   predicate(UseAVX == 0);
 3138   match(Set dst (SubD dst (LoadD src)));
 3139 
 3140   format %{ "subsd   $dst, $src" %}
 3141   ins_cost(150);
 3142   ins_encode %{
 3143     __ subsd($dst$$XMMRegister, $src$$Address);
 3144   %}
 3145   ins_pipe(pipe_slow);
 3146 %}
 3147 
 3148 instruct subD_imm(regD dst, immD con) %{
 3149   predicate(UseAVX == 0);
 3150   match(Set dst (SubD dst con));
 3151   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3152   ins_cost(150);
 3153   ins_encode %{
 3154     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3155   %}
 3156   ins_pipe(pipe_slow);
 3157 %}
 3158 
 3159 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3160   predicate(UseAVX > 0);
 3161   match(Set dst (SubD src1 src2));
 3162 
 3163   format %{ "vsubsd  $dst, $src1, $src2" %}
 3164   ins_cost(150);
 3165   ins_encode %{
 3166     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3167   %}
 3168   ins_pipe(pipe_slow);
 3169 %}
 3170 
 3171 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3172   predicate(UseAVX > 0);
 3173   match(Set dst (SubD src1 (LoadD src2)));
 3174 
 3175   format %{ "vsubsd  $dst, $src1, $src2" %}
 3176   ins_cost(150);
 3177   ins_encode %{
 3178     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3179   %}
 3180   ins_pipe(pipe_slow);
 3181 %}
 3182 
 3183 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3184   predicate(UseAVX > 0);
 3185   match(Set dst (SubD src con));
 3186 
 3187   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3188   ins_cost(150);
 3189   ins_encode %{
 3190     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3191   %}
 3192   ins_pipe(pipe_slow);
 3193 %}
 3194 
 3195 instruct mulF_reg(regF dst, regF src) %{
 3196   predicate(UseAVX == 0);
 3197   match(Set dst (MulF dst src));
 3198 
 3199   format %{ "mulss   $dst, $src" %}
 3200   ins_cost(150);
 3201   ins_encode %{
 3202     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3203   %}
 3204   ins_pipe(pipe_slow);
 3205 %}
 3206 
 3207 instruct mulF_mem(regF dst, memory src) %{
 3208   predicate(UseAVX == 0);
 3209   match(Set dst (MulF dst (LoadF src)));
 3210 
 3211   format %{ "mulss   $dst, $src" %}
 3212   ins_cost(150);
 3213   ins_encode %{
 3214     __ mulss($dst$$XMMRegister, $src$$Address);
 3215   %}
 3216   ins_pipe(pipe_slow);
 3217 %}
 3218 
 3219 instruct mulF_imm(regF dst, immF con) %{
 3220   predicate(UseAVX == 0);
 3221   match(Set dst (MulF dst con));
 3222   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3223   ins_cost(150);
 3224   ins_encode %{
 3225     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3226   %}
 3227   ins_pipe(pipe_slow);
 3228 %}
 3229 
 3230 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3231   predicate(UseAVX > 0);
 3232   match(Set dst (MulF src1 src2));
 3233 
 3234   format %{ "vmulss  $dst, $src1, $src2" %}
 3235   ins_cost(150);
 3236   ins_encode %{
 3237     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3238   %}
 3239   ins_pipe(pipe_slow);
 3240 %}
 3241 
 3242 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3243   predicate(UseAVX > 0);
 3244   match(Set dst (MulF src1 (LoadF src2)));
 3245 
 3246   format %{ "vmulss  $dst, $src1, $src2" %}
 3247   ins_cost(150);
 3248   ins_encode %{
 3249     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3250   %}
 3251   ins_pipe(pipe_slow);
 3252 %}
 3253 
 3254 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3255   predicate(UseAVX > 0);
 3256   match(Set dst (MulF src con));
 3257 
 3258   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3259   ins_cost(150);
 3260   ins_encode %{
 3261     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3262   %}
 3263   ins_pipe(pipe_slow);
 3264 %}
 3265 
 3266 instruct mulD_reg(regD dst, regD src) %{
 3267   predicate(UseAVX == 0);
 3268   match(Set dst (MulD dst src));
 3269 
 3270   format %{ "mulsd   $dst, $src" %}
 3271   ins_cost(150);
 3272   ins_encode %{
 3273     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3274   %}
 3275   ins_pipe(pipe_slow);
 3276 %}
 3277 
 3278 instruct mulD_mem(regD dst, memory src) %{
 3279   predicate(UseAVX == 0);
 3280   match(Set dst (MulD dst (LoadD src)));
 3281 
 3282   format %{ "mulsd   $dst, $src" %}
 3283   ins_cost(150);
 3284   ins_encode %{
 3285     __ mulsd($dst$$XMMRegister, $src$$Address);
 3286   %}
 3287   ins_pipe(pipe_slow);
 3288 %}
 3289 
 3290 instruct mulD_imm(regD dst, immD con) %{
 3291   predicate(UseAVX == 0);
 3292   match(Set dst (MulD dst con));
 3293   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3294   ins_cost(150);
 3295   ins_encode %{
 3296     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3297   %}
 3298   ins_pipe(pipe_slow);
 3299 %}
 3300 
 3301 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3302   predicate(UseAVX > 0);
 3303   match(Set dst (MulD src1 src2));
 3304 
 3305   format %{ "vmulsd  $dst, $src1, $src2" %}
 3306   ins_cost(150);
 3307   ins_encode %{
 3308     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3309   %}
 3310   ins_pipe(pipe_slow);
 3311 %}
 3312 
 3313 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3314   predicate(UseAVX > 0);
 3315   match(Set dst (MulD src1 (LoadD src2)));
 3316 
 3317   format %{ "vmulsd  $dst, $src1, $src2" %}
 3318   ins_cost(150);
 3319   ins_encode %{
 3320     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3321   %}
 3322   ins_pipe(pipe_slow);
 3323 %}
 3324 
 3325 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3326   predicate(UseAVX > 0);
 3327   match(Set dst (MulD src con));
 3328 
 3329   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3330   ins_cost(150);
 3331   ins_encode %{
 3332     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3333   %}
 3334   ins_pipe(pipe_slow);
 3335 %}
 3336 
 3337 instruct divF_reg(regF dst, regF src) %{
 3338   predicate(UseAVX == 0);
 3339   match(Set dst (DivF dst src));
 3340 
 3341   format %{ "divss   $dst, $src" %}
 3342   ins_cost(150);
 3343   ins_encode %{
 3344     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3345   %}
 3346   ins_pipe(pipe_slow);
 3347 %}
 3348 
 3349 instruct divF_mem(regF dst, memory src) %{
 3350   predicate(UseAVX == 0);
 3351   match(Set dst (DivF dst (LoadF src)));
 3352 
 3353   format %{ "divss   $dst, $src" %}
 3354   ins_cost(150);
 3355   ins_encode %{
 3356     __ divss($dst$$XMMRegister, $src$$Address);
 3357   %}
 3358   ins_pipe(pipe_slow);
 3359 %}
 3360 
 3361 instruct divF_imm(regF dst, immF con) %{
 3362   predicate(UseAVX == 0);
 3363   match(Set dst (DivF dst con));
 3364   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3365   ins_cost(150);
 3366   ins_encode %{
 3367     __ divss($dst$$XMMRegister, $constantaddress($con));
 3368   %}
 3369   ins_pipe(pipe_slow);
 3370 %}
 3371 
 3372 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3373   predicate(UseAVX > 0);
 3374   match(Set dst (DivF src1 src2));
 3375 
 3376   format %{ "vdivss  $dst, $src1, $src2" %}
 3377   ins_cost(150);
 3378   ins_encode %{
 3379     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3380   %}
 3381   ins_pipe(pipe_slow);
 3382 %}
 3383 
 3384 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3385   predicate(UseAVX > 0);
 3386   match(Set dst (DivF src1 (LoadF src2)));
 3387 
 3388   format %{ "vdivss  $dst, $src1, $src2" %}
 3389   ins_cost(150);
 3390   ins_encode %{
 3391     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3392   %}
 3393   ins_pipe(pipe_slow);
 3394 %}
 3395 
 3396 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3397   predicate(UseAVX > 0);
 3398   match(Set dst (DivF src con));
 3399 
 3400   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3401   ins_cost(150);
 3402   ins_encode %{
 3403     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3404   %}
 3405   ins_pipe(pipe_slow);
 3406 %}
 3407 
 3408 instruct divD_reg(regD dst, regD src) %{
 3409   predicate(UseAVX == 0);
 3410   match(Set dst (DivD dst src));
 3411 
 3412   format %{ "divsd   $dst, $src" %}
 3413   ins_cost(150);
 3414   ins_encode %{
 3415     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3416   %}
 3417   ins_pipe(pipe_slow);
 3418 %}
 3419 
 3420 instruct divD_mem(regD dst, memory src) %{
 3421   predicate(UseAVX == 0);
 3422   match(Set dst (DivD dst (LoadD src)));
 3423 
 3424   format %{ "divsd   $dst, $src" %}
 3425   ins_cost(150);
 3426   ins_encode %{
 3427     __ divsd($dst$$XMMRegister, $src$$Address);
 3428   %}
 3429   ins_pipe(pipe_slow);
 3430 %}
 3431 
 3432 instruct divD_imm(regD dst, immD con) %{
 3433   predicate(UseAVX == 0);
 3434   match(Set dst (DivD dst con));
 3435   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3436   ins_cost(150);
 3437   ins_encode %{
 3438     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3439   %}
 3440   ins_pipe(pipe_slow);
 3441 %}
 3442 
 3443 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3444   predicate(UseAVX > 0);
 3445   match(Set dst (DivD src1 src2));
 3446 
 3447   format %{ "vdivsd  $dst, $src1, $src2" %}
 3448   ins_cost(150);
 3449   ins_encode %{
 3450     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3451   %}
 3452   ins_pipe(pipe_slow);
 3453 %}
 3454 
 3455 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3456   predicate(UseAVX > 0);
 3457   match(Set dst (DivD src1 (LoadD src2)));
 3458 
 3459   format %{ "vdivsd  $dst, $src1, $src2" %}
 3460   ins_cost(150);
 3461   ins_encode %{
 3462     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3463   %}
 3464   ins_pipe(pipe_slow);
 3465 %}
 3466 
 3467 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3468   predicate(UseAVX > 0);
 3469   match(Set dst (DivD src con));
 3470 
 3471   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3472   ins_cost(150);
 3473   ins_encode %{
 3474     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3475   %}
 3476   ins_pipe(pipe_slow);
 3477 %}
 3478 
 3479 instruct absF_reg(regF dst) %{
 3480   predicate(UseAVX == 0);
 3481   match(Set dst (AbsF dst));
 3482   ins_cost(150);
 3483   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3484   ins_encode %{
 3485     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3486   %}
 3487   ins_pipe(pipe_slow);
 3488 %}
 3489 
 3490 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3491   predicate(UseAVX > 0);
 3492   match(Set dst (AbsF src));
 3493   ins_cost(150);
 3494   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3495   ins_encode %{
 3496     int vlen_enc = Assembler::AVX_128bit;
 3497     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3498               ExternalAddress(float_signmask()), vlen_enc);
 3499   %}
 3500   ins_pipe(pipe_slow);
 3501 %}
 3502 
 3503 instruct absD_reg(regD dst) %{
 3504   predicate(UseAVX == 0);
 3505   match(Set dst (AbsD dst));
 3506   ins_cost(150);
 3507   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3508             "# abs double by sign masking" %}
 3509   ins_encode %{
 3510     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3511   %}
 3512   ins_pipe(pipe_slow);
 3513 %}
 3514 
 3515 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3516   predicate(UseAVX > 0);
 3517   match(Set dst (AbsD src));
 3518   ins_cost(150);
 3519   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3520             "# abs double by sign masking" %}
 3521   ins_encode %{
 3522     int vlen_enc = Assembler::AVX_128bit;
 3523     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3524               ExternalAddress(double_signmask()), vlen_enc);
 3525   %}
 3526   ins_pipe(pipe_slow);
 3527 %}
 3528 
 3529 instruct negF_reg(regF dst) %{
 3530   predicate(UseAVX == 0);
 3531   match(Set dst (NegF dst));
 3532   ins_cost(150);
 3533   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3534   ins_encode %{
 3535     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3536   %}
 3537   ins_pipe(pipe_slow);
 3538 %}
 3539 
 3540 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3541   predicate(UseAVX > 0);
 3542   match(Set dst (NegF src));
 3543   ins_cost(150);
 3544   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3545   ins_encode %{
 3546     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3547                  ExternalAddress(float_signflip()));
 3548   %}
 3549   ins_pipe(pipe_slow);
 3550 %}
 3551 
 3552 instruct negD_reg(regD dst) %{
 3553   predicate(UseAVX == 0);
 3554   match(Set dst (NegD dst));
 3555   ins_cost(150);
 3556   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3557             "# neg double by sign flipping" %}
 3558   ins_encode %{
 3559     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3560   %}
 3561   ins_pipe(pipe_slow);
 3562 %}
 3563 
 3564 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3565   predicate(UseAVX > 0);
 3566   match(Set dst (NegD src));
 3567   ins_cost(150);
 3568   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3569             "# neg double by sign flipping" %}
 3570   ins_encode %{
 3571     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3572                  ExternalAddress(double_signflip()));
 3573   %}
 3574   ins_pipe(pipe_slow);
 3575 %}
 3576 
// The sqrtss instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3579 instruct sqrtF_reg(regF dst) %{
 3580   match(Set dst (SqrtF dst));
 3581   format %{ "sqrtss  $dst, $dst" %}
 3582   ins_encode %{
 3583     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3584   %}
 3585   ins_pipe(pipe_slow);
 3586 %}
 3587 
// The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3590 instruct sqrtD_reg(regD dst) %{
 3591   match(Set dst (SqrtD dst));
 3592   format %{ "sqrtsd  $dst, $dst" %}
 3593   ins_encode %{
 3594     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3595   %}
 3596   ins_pipe(pipe_slow);
 3597 %}
 3598 
 3599 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3600   effect(TEMP tmp);
 3601   match(Set dst (ConvF2HF src));
 3602   ins_cost(125);
  format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3604   ins_encode %{
 3605     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3606   %}
 3607   ins_pipe( pipe_slow );
 3608 %}
 3609 
 3610 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3611   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3612   effect(TEMP ktmp, TEMP rtmp);
 3613   match(Set mem (StoreC mem (ConvF2HF src)));
 3614   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3615   ins_encode %{
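    // Build a one-bit k-mask so the masked evcvtps2ph below stores only the lowest
    // 16-bit half-float lane to memory.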
 3616     __ movl($rtmp$$Register, 0x1);
 3617     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3618     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3619   %}
 3620   ins_pipe( pipe_slow );
 3621 %}
 3622 
 3623 instruct vconvF2HF(vec dst, vec src) %{
 3624   match(Set dst (VectorCastF2HF src));
 3625   format %{ "vector_conv_F2HF $dst $src" %}
 3626   ins_encode %{
 3627     int vlen_enc = vector_length_encoding(this, $src);
 3628     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3629   %}
 3630   ins_pipe( pipe_slow );
 3631 %}
 3632 
 3633 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3634   predicate(n->as_StoreVector()->memory_size() >= 16);
 3635   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3636   format %{ "vcvtps2ph $mem,$src" %}
 3637   ins_encode %{
 3638     int vlen_enc = vector_length_encoding(this, $src);
 3639     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3640   %}
 3641   ins_pipe( pipe_slow );
 3642 %}
 3643 
 3644 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3645   match(Set dst (ConvHF2F src));
 3646   format %{ "vcvtph2ps $dst,$src" %}
 3647   ins_encode %{
 3648     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3649   %}
 3650   ins_pipe( pipe_slow );
 3651 %}
 3652 
 3653 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3654   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3655   format %{ "vcvtph2ps $dst,$mem" %}
 3656   ins_encode %{
 3657     int vlen_enc = vector_length_encoding(this);
 3658     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3659   %}
 3660   ins_pipe( pipe_slow );
 3661 %}
 3662 
 3663 instruct vconvHF2F(vec dst, vec src) %{
 3664   match(Set dst (VectorCastHF2F src));
 3665   ins_cost(125);
 3666   format %{ "vector_conv_HF2F $dst,$src" %}
 3667   ins_encode %{
 3668     int vlen_enc = vector_length_encoding(this);
 3669     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3670   %}
 3671   ins_pipe( pipe_slow );
 3672 %}
 3673 
 3674 // ---------------------------------------- VectorReinterpret ------------------------------------
 3675 instruct reinterpret_mask(kReg dst) %{
 3676   predicate(n->bottom_type()->isa_vectmask() &&
 3677             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3678   match(Set dst (VectorReinterpret dst));
 3679   ins_cost(125);
 3680   format %{ "vector_reinterpret $dst\t!" %}
 3681   ins_encode %{
 3682     // empty
 3683   %}
 3684   ins_pipe( pipe_slow );
 3685 %}
 3686 
 3687 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3688   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3689             n->bottom_type()->isa_vectmask() &&
 3690             n->in(1)->bottom_type()->isa_vectmask() &&
 3691             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bytes
 3693   match(Set dst (VectorReinterpret src));
 3694   effect(TEMP xtmp);
 3695   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3696   ins_encode %{
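     // Expand each source mask bit into an all-ones 16-bit lane, then re-extract a byte
     // mask: every short-mask bit becomes two adjacent bits of the byte mask.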
 3697      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3698      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3699      assert(src_sz == dst_sz , "src and dst size mismatch");
 3700      int vlen_enc = vector_length_encoding(src_sz);
 3701      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3702      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3703   %}
 3704   ins_pipe( pipe_slow );
 3705 %}
 3706 
 3707 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3708   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3709             n->bottom_type()->isa_vectmask() &&
 3710             n->in(1)->bottom_type()->isa_vectmask() &&
 3711             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3712              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bytes
 3714   match(Set dst (VectorReinterpret src));
 3715   effect(TEMP xtmp);
 3716   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3717   ins_encode %{
 3718      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3719      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3720      assert(src_sz == dst_sz , "src and dst size mismatch");
 3721      int vlen_enc = vector_length_encoding(src_sz);
 3722      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3723      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3724   %}
 3725   ins_pipe( pipe_slow );
 3726 %}
 3727 
 3728 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3729   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3730             n->bottom_type()->isa_vectmask() &&
 3731             n->in(1)->bottom_type()->isa_vectmask() &&
 3732             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3733              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bytes
 3735   match(Set dst (VectorReinterpret src));
 3736   effect(TEMP xtmp);
 3737   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3738   ins_encode %{
 3739      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3740      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3741      assert(src_sz == dst_sz , "src and dst size mismatch");
 3742      int vlen_enc = vector_length_encoding(src_sz);
 3743      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3744      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3745   %}
 3746   ins_pipe( pipe_slow );
 3747 %}
 3748 
 3749 instruct reinterpret(vec dst) %{
 3750   predicate(!n->bottom_type()->isa_vectmask() &&
 3751             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3752   match(Set dst (VectorReinterpret dst));
 3753   ins_cost(125);
 3754   format %{ "vector_reinterpret $dst\t!" %}
 3755   ins_encode %{
 3756     // empty
 3757   %}
 3758   ins_pipe( pipe_slow );
 3759 %}
 3760 
 3761 instruct reinterpret_expand(vec dst, vec src) %{
 3762   predicate(UseAVX == 0 &&
 3763             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3764   match(Set dst (VectorReinterpret src));
 3765   ins_cost(125);
 3766   effect(TEMP dst);
 3767   format %{ "vector_reinterpret_expand $dst,$src" %}
 3768   ins_encode %{
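    // Expand by masking: load a constant with only the low 4 or 8 bytes set and AND it
    // with the source so every byte above the source width becomes zero.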
 3769     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3770     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3771 
 3772     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3773     if (src_vlen_in_bytes == 4) {
 3774       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3775     } else {
 3776       assert(src_vlen_in_bytes == 8, "");
 3777       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3778     }
 3779     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3780   %}
 3781   ins_pipe( pipe_slow );
 3782 %}
 3783 
 3784 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3785   predicate(UseAVX > 0 &&
 3786             !n->bottom_type()->isa_vectmask() &&
 3787             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3788             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3789   match(Set dst (VectorReinterpret src));
 3790   ins_cost(125);
 3791   format %{ "vector_reinterpret_expand $dst,$src" %}
 3792   ins_encode %{
 3793     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3794   %}
 3795   ins_pipe( pipe_slow );
 3796 %}
 3797 
 3798 
 3799 instruct vreinterpret_expand(legVec dst, vec src) %{
 3800   predicate(UseAVX > 0 &&
 3801             !n->bottom_type()->isa_vectmask() &&
 3802             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3803             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3804   match(Set dst (VectorReinterpret src));
 3805   ins_cost(125);
 3806   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3807   ins_encode %{
 3808     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3809       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3810       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3811       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3812       default: ShouldNotReachHere();
 3813     }
 3814   %}
 3815   ins_pipe( pipe_slow );
 3816 %}
 3817 
 3818 instruct reinterpret_shrink(vec dst, legVec src) %{
 3819   predicate(!n->bottom_type()->isa_vectmask() &&
 3820             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3821   match(Set dst (VectorReinterpret src));
 3822   ins_cost(125);
 3823   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3824   ins_encode %{
 3825     switch (Matcher::vector_length_in_bytes(this)) {
 3826       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3827       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3828       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3829       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3830       default: ShouldNotReachHere();
 3831     }
 3832   %}
 3833   ins_pipe( pipe_slow );
 3834 %}
 3835 
 3836 // ----------------------------------------------------------------------------------------------------
 3837 
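// The RoundDoubleMode* rules below pass the ideal rmode constant straight through as
// the roundsd/vroundpd/vrndscalepd immediate; its low two bits select the rounding
// direction (00 = nearest-even, 01 = toward -inf, 10 = toward +inf, 11 = toward zero).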
 3838 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3839   match(Set dst (RoundDoubleMode src rmode));
 3840   format %{ "roundsd $dst,$src" %}
 3841   ins_cost(150);
 3842   ins_encode %{
 3843     assert(UseSSE >= 4, "required");
 3844     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3845       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3846     }
 3847     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3848   %}
 3849   ins_pipe(pipe_slow);
 3850 %}
 3851 
 3852 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3853   match(Set dst (RoundDoubleMode con rmode));
 3854   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3855   ins_cost(150);
 3856   ins_encode %{
 3857     assert(UseSSE >= 4, "required");
 3858     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3859   %}
 3860   ins_pipe(pipe_slow);
 3861 %}
 3862 
 3863 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3864   predicate(Matcher::vector_length(n) < 8);
 3865   match(Set dst (RoundDoubleModeV src rmode));
 3866   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3867   ins_encode %{
 3868     assert(UseAVX > 0, "required");
 3869     int vlen_enc = vector_length_encoding(this);
 3870     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3871   %}
 3872   ins_pipe( pipe_slow );
 3873 %}
 3874 
 3875 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3876   predicate(Matcher::vector_length(n) == 8);
 3877   match(Set dst (RoundDoubleModeV src rmode));
 3878   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3879   ins_encode %{
 3880     assert(UseAVX > 2, "required");
 3881     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3882   %}
 3883   ins_pipe( pipe_slow );
 3884 %}
 3885 
 3886 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3887   predicate(Matcher::vector_length(n) < 8);
 3888   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3889   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3890   ins_encode %{
 3891     assert(UseAVX > 0, "required");
 3892     int vlen_enc = vector_length_encoding(this);
 3893     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3894   %}
 3895   ins_pipe( pipe_slow );
 3896 %}
 3897 
 3898 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3899   predicate(Matcher::vector_length(n) == 8);
 3900   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3901   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3902   ins_encode %{
 3903     assert(UseAVX > 2, "required");
 3904     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3905   %}
 3906   ins_pipe( pipe_slow );
 3907 %}
 3908 
 3909 instruct onspinwait() %{
 3910   match(OnSpinWait);
 3911   ins_cost(200);
 3912 
 3913   format %{
 3914     $$template
 3915     $$emit$$"pause\t! membar_onspinwait"
 3916   %}
 3917   ins_encode %{
 3918     __ pause();
 3919   %}
 3920   ins_pipe(pipe_slow);
 3921 %}
 3922 
 3923 // a * b + c
 3924 instruct fmaD_reg(regD a, regD b, regD c) %{
 3925   match(Set c (FmaD  c (Binary a b)));
 3926   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3927   ins_cost(150);
 3928   ins_encode %{
 3929     assert(UseFMA, "FMA instruction support is required.");
 3930     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3931   %}
 3932   ins_pipe( pipe_slow );
 3933 %}
 3934 
 3935 // a * b + c
 3936 instruct fmaF_reg(regF a, regF b, regF c) %{
 3937   match(Set c (FmaF  c (Binary a b)));
 3938   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3939   ins_cost(150);
 3940   ins_encode %{
 3941     assert(UseFMA, "FMA instruction support is required.");
 3942     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3943   %}
 3944   ins_pipe( pipe_slow );
 3945 %}
 3946 
 3947 // ====================VECTOR INSTRUCTIONS=====================================
 3948 
 3949 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3950 instruct MoveVec2Leg(legVec dst, vec src) %{
 3951   match(Set dst src);
 3952   format %{ "" %}
 3953   ins_encode %{
 3954     ShouldNotReachHere();
 3955   %}
 3956   ins_pipe( fpu_reg_reg );
 3957 %}
 3958 
 3959 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3960   match(Set dst src);
 3961   format %{ "" %}
 3962   ins_encode %{
 3963     ShouldNotReachHere();
 3964   %}
 3965   ins_pipe( fpu_reg_reg );
 3966 %}
 3967 
 3968 // ============================================================================
 3969 
 3970 // Load vector using a generic operand pattern
 3971 instruct loadV(vec dst, memory mem) %{
 3972   match(Set dst (LoadVector mem));
 3973   ins_cost(125);
 3974   format %{ "load_vector $dst,$mem" %}
 3975   ins_encode %{
 3976     BasicType bt = Matcher::vector_element_basic_type(this);
 3977     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3978   %}
 3979   ins_pipe( pipe_slow );
 3980 %}
 3981 
 3982 // Store vector using a generic operand pattern.
 3983 instruct storeV(memory mem, vec src) %{
 3984   match(Set mem (StoreVector mem src));
 3985   ins_cost(145);
 3986   format %{ "store_vector $mem,$src" %}
 3987   ins_encode %{
 3988     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3989       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3990       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3991       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3992       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3993       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3994       default: ShouldNotReachHere();
 3995     }
 3996   %}
 3997   ins_pipe( pipe_slow );
 3998 %}
 3999 
 4000 // ---------------------------------------- Gather ------------------------------------
 4001 
 4002 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4003 
 4004 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4005   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4006             Matcher::vector_length_in_bytes(n) <= 32);
 4007   match(Set dst (LoadVectorGather mem idx));
 4008   effect(TEMP dst, TEMP tmp, TEMP mask);
 4009   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4010   ins_encode %{
 4011     int vlen_enc = vector_length_encoding(this);
 4012     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4013     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
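    // AVX2 vpgather* takes a vector mask: set all mask bits so every lane is gathered.
    // The gather instruction clears mask elements as they complete, so $mask is a
    // TEMP rather than a live input.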
 4014     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4015     __ lea($tmp$$Register, $mem$$Address);
 4016     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4017   %}
 4018   ins_pipe( pipe_slow );
 4019 %}
 4020 
 4021 
 4022 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4023   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4024             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4025   match(Set dst (LoadVectorGather mem idx));
 4026   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4027   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4028   ins_encode %{
 4029     int vlen_enc = vector_length_encoding(this);
 4030     BasicType elem_bt = Matcher::vector_element_basic_type(this);
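    // kxnor of a register with itself yields an all-ones opmask, i.e. gather every lane.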
 4031     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4032     __ lea($tmp$$Register, $mem$$Address);
 4033     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4034   %}
 4035   ins_pipe( pipe_slow );
 4036 %}
 4037 
 4038 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4039   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4040             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4041   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4042   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4043   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4044   ins_encode %{
 4045     assert(UseAVX > 2, "sanity");
 4046     int vlen_enc = vector_length_encoding(this);
 4047     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4048     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4049     // Note: Since the gather instruction partially updates the opmask register
 4050     // used for predication, the mask operand is first copied to a temporary.
 4051     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4052     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4053     __ lea($tmp$$Register, $mem$$Address);
 4054     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4055   %}
 4056   ins_pipe( pipe_slow );
 4057 %}
 4058 
 4059 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4060   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4061   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4062   effect(TEMP tmp, TEMP rtmp);
 4063   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4064   ins_encode %{
 4065     int vlen_enc = vector_length_encoding(this);
 4066     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4067     __ lea($tmp$$Register, $mem$$Address);
 4068     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4069   %}
 4070   ins_pipe( pipe_slow );
 4071 %}
 4072 
 4073 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4074                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4075   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4076   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4077   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4078   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4079   ins_encode %{
 4080     int vlen_enc = vector_length_encoding(this);
 4081     int vector_len = Matcher::vector_length(this);
 4082     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4083     __ lea($tmp$$Register, $mem$$Address);
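    // The subword-gather helper consumes/advances the index pointer as it loops,
    // so it operates on a scratch copy of idx_base.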
 4084     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4085     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4086                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4087   %}
 4088   ins_pipe( pipe_slow );
 4089 %}
 4090 
 4091 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4092   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4093   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4094   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4095   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4096   ins_encode %{
 4097     int vlen_enc = vector_length_encoding(this);
 4098     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4099     __ lea($tmp$$Register, $mem$$Address);
 4100     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4101   %}
 4102   ins_pipe( pipe_slow );
 4103 %}
 4104 
 4105 
 4106 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4107                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4108   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4109   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4110   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4111   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4112   ins_encode %{
 4113     int vlen_enc = vector_length_encoding(this);
 4114     int vector_len = Matcher::vector_length(this);
 4115     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4116     __ lea($tmp$$Register, $mem$$Address);
 4117     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4118     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4119                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4120   %}
 4121   ins_pipe( pipe_slow );
 4122 %}
 4123 
 4124 
 4125 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4126   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4127   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4128   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4129   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4130   ins_encode %{
 4131     int vlen_enc = vector_length_encoding(this);
 4132     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4133     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4134     __ lea($tmp$$Register, $mem$$Address);
 4135     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4136     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4137   %}
 4138   ins_pipe( pipe_slow );
 4139 %}
 4140 
 4141 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4142                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4143   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4144   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4145   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4146   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4147   ins_encode %{
 4148     int vlen_enc = vector_length_encoding(this);
 4149     int vector_len = Matcher::vector_length(this);
 4150     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4151     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4152     __ lea($tmp$$Register, $mem$$Address);
 4153     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4154     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4155     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4156                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4157   %}
 4158   ins_pipe( pipe_slow );
 4159 %}
 4160 
 4161 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4162   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4163   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4164   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4165   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4166   ins_encode %{
 4167     int vlen_enc = vector_length_encoding(this);
 4168     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4169     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4170     __ lea($tmp$$Register, $mem$$Address);
 4171     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4172     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4173                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4174   %}
 4175   ins_pipe( pipe_slow );
 4176 %}
 4177 
 4178 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4179                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4180   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4181   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4182   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4183   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4184   ins_encode %{
 4185     int vlen_enc = vector_length_encoding(this);
 4186     int vector_len = Matcher::vector_length(this);
 4187     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4188     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4189     __ lea($tmp$$Register, $mem$$Address);
 4190     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4191     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4192     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4193                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4194   %}
 4195   ins_pipe( pipe_slow );
 4196 %}
 4197 
 4198 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4199   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4200   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4201   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4202   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4203   ins_encode %{
 4204     int vlen_enc = vector_length_encoding(this);
 4205     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4206     __ lea($tmp$$Register, $mem$$Address);
 4207     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
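    // vpmovmskb yields one mask bit per byte; for shorts keep every other bit
    // (pextl with 0x55555555) so there is one mask bit per element.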
 4208     if (elem_bt == T_SHORT) {
 4209       __ movl($mask_idx$$Register, 0x55555555);
 4210       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4211     }
 4212     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4213     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4214   %}
 4215   ins_pipe( pipe_slow );
 4216 %}
 4217 
 4218 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4219                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4220   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4221   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4222   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4223   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4224   ins_encode %{
 4225     int vlen_enc = vector_length_encoding(this);
 4226     int vector_len = Matcher::vector_length(this);
 4227     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4228     __ lea($tmp$$Register, $mem$$Address);
 4229     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4230     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4231     if (elem_bt == T_SHORT) {
 4232       __ movl($mask_idx$$Register, 0x55555555);
 4233       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4234     }
 4235     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4236     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4237                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4238   %}
 4239   ins_pipe( pipe_slow );
 4240 %}
 4241 
 4242 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4243   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4244   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4245   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4246   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4247   ins_encode %{
 4248     int vlen_enc = vector_length_encoding(this);
 4249     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4250     __ lea($tmp$$Register, $mem$$Address);
 4251     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4252     if (elem_bt == T_SHORT) {
 4253       __ movl($mask_idx$$Register, 0x55555555);
 4254       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4255     }
 4256     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4257     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4258                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4259   %}
 4260   ins_pipe( pipe_slow );
 4261 %}
 4262 
 4263 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4264                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4265   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4266   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4267   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4268   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4269   ins_encode %{
 4270     int vlen_enc = vector_length_encoding(this);
 4271     int vector_len = Matcher::vector_length(this);
 4272     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4273     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4274     __ lea($tmp$$Register, $mem$$Address);
 4275     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4276     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4277     if (elem_bt == T_SHORT) {
 4278       __ movl($mask_idx$$Register, 0x55555555);
 4279       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4280     }
 4281     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4282     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4283                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4284   %}
 4285   ins_pipe( pipe_slow );
 4286 %}
 4287 
 4288 // ====================Scatter=======================================
 4289 
 4290 // Scatter INT, LONG, FLOAT, DOUBLE
 4291 
 4292 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4293   predicate(UseAVX > 2);
 4294   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4295   effect(TEMP tmp, TEMP ktmp);
 4296   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4297   ins_encode %{
 4298     int vlen_enc = vector_length_encoding(this, $src);
 4299     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4300 
 4301     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4302     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4303 
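    // Load an all-ones opmask so every lane is scattered. The scatter instruction
    // clears opmask bits as elements complete, so a TEMP kreg is used.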
 4304     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4305     __ lea($tmp$$Register, $mem$$Address);
 4306     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4307   %}
 4308   ins_pipe( pipe_slow );
 4309 %}
 4310 
 4311 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4312   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4313   effect(TEMP tmp, TEMP ktmp);
 4314   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4315   ins_encode %{
 4316     int vlen_enc = vector_length_encoding(this, $src);
 4317     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4318     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4319     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4320     // Note: Since the scatter instruction partially updates the opmask register
 4321     // used for predication, the mask operand is first copied to a temporary.
 4322     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4323     __ lea($tmp$$Register, $mem$$Address);
 4324     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4325   %}
 4326   ins_pipe( pipe_slow );
 4327 %}
 4328 
 4329 // ====================REPLICATE=======================================
 4330 
 4331 // Replicate byte scalar to be vector
 4332 instruct vReplB_reg(vec dst, rRegI src) %{
 4333   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4334   match(Set dst (Replicate src));
 4335   format %{ "replicateB $dst,$src" %}
 4336   ins_encode %{
 4337     uint vlen = Matcher::vector_length(this);
 4338     if (UseAVX >= 2) {
 4339       int vlen_enc = vector_length_encoding(this);
 4340       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for < 512-bit operands
 4341         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4342         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4343       } else {
 4344         __ movdl($dst$$XMMRegister, $src$$Register);
 4345         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4346       }
 4347     } else {
 4348       assert(UseAVX < 2, "");
 4349       __ movdl($dst$$XMMRegister, $src$$Register);
 4350       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4351       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4352       if (vlen >= 16) {
 4353         assert(vlen == 16, "");
 4354         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4355       }
 4356     }
 4357   %}
 4358   ins_pipe( pipe_slow );
 4359 %}
 4360 
 4361 instruct ReplB_mem(vec dst, memory mem) %{
 4362   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4363   match(Set dst (Replicate (LoadB mem)));
 4364   format %{ "replicateB $dst,$mem" %}
 4365   ins_encode %{
 4366     int vlen_enc = vector_length_encoding(this);
 4367     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4368   %}
 4369   ins_pipe( pipe_slow );
 4370 %}
 4371 
 4372 // ====================ReplicateS=======================================
 4373 
 4374 instruct vReplS_reg(vec dst, rRegI src) %{
 4375   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4376   match(Set dst (Replicate src));
 4377   format %{ "replicateS $dst,$src" %}
 4378   ins_encode %{
 4379     uint vlen = Matcher::vector_length(this);
 4380     int vlen_enc = vector_length_encoding(this);
 4381     if (UseAVX >= 2) {
 4382       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for < 512-bit operands
 4383         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4384         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4385       } else {
 4386         __ movdl($dst$$XMMRegister, $src$$Register);
 4387         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4388       }
 4389     } else {
 4390       assert(UseAVX < 2, "");
 4391       __ movdl($dst$$XMMRegister, $src$$Register);
 4392       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4393       if (vlen >= 8) {
 4394         assert(vlen == 8, "");
 4395         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4396       }
 4397     }
 4398   %}
 4399   ins_pipe( pipe_slow );
 4400 %}
 4401 
 4402 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4403   match(Set dst (Replicate con));
 4404   effect(TEMP rtmp);
 4405   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4406   ins_encode %{
 4407     int vlen_enc = vector_length_encoding(this);
 4408     BasicType bt = Matcher::vector_element_basic_type(this);
 4409     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4410     __ movl($rtmp$$Register, $con$$constant);
 4411     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4412   %}
 4413   ins_pipe( pipe_slow );
 4414 %}
 4415 
 4416 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4417   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4418   match(Set dst (Replicate src));
 4419   effect(TEMP rtmp);
 4420   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4421   ins_encode %{
 4422     int vlen_enc = vector_length_encoding(this);
 4423     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4424     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 instruct ReplS_mem(vec dst, memory mem) %{
 4430   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4431   match(Set dst (Replicate (LoadS mem)));
 4432   format %{ "replicateS $dst,$mem" %}
 4433   ins_encode %{
 4434     int vlen_enc = vector_length_encoding(this);
 4435     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4436   %}
 4437   ins_pipe( pipe_slow );
 4438 %}
 4439 
 4440 // ====================ReplicateI=======================================
 4441 
 4442 instruct ReplI_reg(vec dst, rRegI src) %{
 4443   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4444   match(Set dst (Replicate src));
 4445   format %{ "replicateI $dst,$src" %}
 4446   ins_encode %{
 4447     uint vlen = Matcher::vector_length(this);
 4448     int vlen_enc = vector_length_encoding(this);
 4449     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4450       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4451     } else if (VM_Version::supports_avx2()) {
 4452       __ movdl($dst$$XMMRegister, $src$$Register);
 4453       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4454     } else {
 4455       __ movdl($dst$$XMMRegister, $src$$Register);
 4456       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4457     }
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 instruct ReplI_mem(vec dst, memory mem) %{
 4463   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4464   match(Set dst (Replicate (LoadI mem)));
 4465   format %{ "replicateI $dst,$mem" %}
 4466   ins_encode %{
 4467     int vlen_enc = vector_length_encoding(this);
 4468     if (VM_Version::supports_avx2()) {
 4469       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4470     } else if (VM_Version::supports_avx()) {
 4471       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4472     } else {
 4473       __ movdl($dst$$XMMRegister, $mem$$Address);
 4474       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4475     }
 4476   %}
 4477   ins_pipe( pipe_slow );
 4478 %}
 4479 
 4480 instruct ReplI_imm(vec dst, immI con) %{
 4481   predicate(Matcher::is_non_long_integral_vector(n));
 4482   match(Set dst (Replicate con));
 4483   format %{ "replicateI $dst,$con" %}
 4484   ins_encode %{
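    // The immediate is replicated into the constant table just enough times to fill
    // the smallest chunk the current ISA can broadcast from (4 bytes with AVX,
    // 8 bytes with SSE3, 16 bytes otherwise); load_constant_vector then expands it
    // to the full vector width.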
 4485     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4486                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4487                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4488     BasicType bt = Matcher::vector_element_basic_type(this);
 4489     int vlen = Matcher::vector_length_in_bytes(this);
 4490     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4491   %}
 4492   ins_pipe( pipe_slow );
 4493 %}
 4494 
 4495 // Replicate scalar zero to be vector
 4496 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4497   predicate(Matcher::is_non_long_integral_vector(n));
 4498   match(Set dst (Replicate zero));
 4499   format %{ "replicateI $dst,$zero" %}
 4500   ins_encode %{
 4501     int vlen_enc = vector_length_encoding(this);
 4502     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4503       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4504     } else {
 4505       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4506     }
 4507   %}
 4508   ins_pipe( fpu_reg_reg );
 4509 %}
 4510 
 4511 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4512   predicate(Matcher::is_non_long_integral_vector(n));
 4513   match(Set dst (Replicate con));
 4514   format %{ "vallones $dst" %}
 4515   ins_encode %{
 4516     int vector_len = vector_length_encoding(this);
 4517     __ vallones($dst$$XMMRegister, vector_len);
 4518   %}
 4519   ins_pipe( pipe_slow );
 4520 %}
 4521 
 4522 // ====================ReplicateL=======================================
 4523 
 4524 // Replicate long (8 byte) scalar to be vector
 4525 instruct ReplL_reg(vec dst, rRegL src) %{
 4526   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4527   match(Set dst (Replicate src));
 4528   format %{ "replicateL $dst,$src" %}
 4529   ins_encode %{
 4530     int vlen = Matcher::vector_length(this);
 4531     int vlen_enc = vector_length_encoding(this);
 4532     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4533       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4534     } else if (VM_Version::supports_avx2()) {
 4535       __ movdq($dst$$XMMRegister, $src$$Register);
 4536       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4537     } else {
 4538       __ movdq($dst$$XMMRegister, $src$$Register);
 4539       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4540     }
 4541   %}
 4542   ins_pipe( pipe_slow );
 4543 %}
 4544 
 4545 instruct ReplL_mem(vec dst, memory mem) %{
 4546   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4547   match(Set dst (Replicate (LoadL mem)));
 4548   format %{ "replicateL $dst,$mem" %}
 4549   ins_encode %{
 4550     int vlen_enc = vector_length_encoding(this);
 4551     if (VM_Version::supports_avx2()) {
 4552       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4553     } else if (VM_Version::supports_sse3()) {
 4554       __ movddup($dst$$XMMRegister, $mem$$Address);
 4555     } else {
 4556       __ movq($dst$$XMMRegister, $mem$$Address);
 4557       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4558     }
 4559   %}
 4560   ins_pipe( pipe_slow );
 4561 %}
 4562 
 4563 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4564 instruct ReplL_imm(vec dst, immL con) %{
 4565   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4566   match(Set dst (Replicate con));
 4567   format %{ "replicateL $dst,$con" %}
 4568   ins_encode %{
 4569     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4570     int vlen = Matcher::vector_length_in_bytes(this);
 4571     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4572   %}
 4573   ins_pipe( pipe_slow );
 4574 %}
 4575 
 4576 instruct ReplL_zero(vec dst, immL0 zero) %{
 4577   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4578   match(Set dst (Replicate zero));
 4579   format %{ "replicateL $dst,$zero" %}
 4580   ins_encode %{
 4581     int vlen_enc = vector_length_encoding(this);
 4582     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4583       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4584     } else {
 4585       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4586     }
 4587   %}
 4588   ins_pipe( fpu_reg_reg );
 4589 %}
 4590 
 4591 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4592   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4593   match(Set dst (Replicate con));
 4594   format %{ "vallones $dst" %}
 4595   ins_encode %{
 4596     int vector_len = vector_length_encoding(this);
 4597     __ vallones($dst$$XMMRegister, vector_len);
 4598   %}
 4599   ins_pipe( pipe_slow );
 4600 %}
 4601 
 4602 // ====================ReplicateF=======================================
 4603 
 4604 instruct vReplF_reg(vec dst, vlRegF src) %{
 4605   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4606   match(Set dst (Replicate src));
 4607   format %{ "replicateF $dst,$src" %}
 4608   ins_encode %{
 4609     uint vlen = Matcher::vector_length(this);
 4610     int vlen_enc = vector_length_encoding(this);
 4611     if (vlen <= 4) {
 4612       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4613     } else if (VM_Version::supports_avx2()) {
 4614       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4615     } else {
 4616       assert(vlen == 8, "sanity");
 4617       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4618       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4619     }
 4620   %}
 4621   ins_pipe( pipe_slow );
 4622 %}
 4623 
 4624 instruct ReplF_reg(vec dst, vlRegF src) %{
 4625   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4626   match(Set dst (Replicate src));
 4627   format %{ "replicateF $dst,$src" %}
 4628   ins_encode %{
 4629     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4630   %}
 4631   ins_pipe( pipe_slow );
 4632 %}
 4633 
 4634 instruct ReplF_mem(vec dst, memory mem) %{
 4635   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4636   match(Set dst (Replicate (LoadF mem)));
 4637   format %{ "replicateF $dst,$mem" %}
 4638   ins_encode %{
 4639     int vlen_enc = vector_length_encoding(this);
 4640     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4641   %}
 4642   ins_pipe( pipe_slow );
 4643 %}
 4644 
 4645 // Replicate float scalar immediate to be vector by loading from const table.
 4646 instruct ReplF_imm(vec dst, immF con) %{
 4647   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4648   match(Set dst (Replicate con));
 4649   format %{ "replicateF $dst,$con" %}
 4650   ins_encode %{
 4651     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4652                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4653     int vlen = Matcher::vector_length_in_bytes(this);
 4654     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4655   %}
 4656   ins_pipe( pipe_slow );
 4657 %}
 4658 
 4659 instruct ReplF_zero(vec dst, immF0 zero) %{
 4660   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4661   match(Set dst (Replicate zero));
 4662   format %{ "replicateF $dst,$zero" %}
 4663   ins_encode %{
 4664     int vlen_enc = vector_length_encoding(this);
 4665     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4666       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4667     } else {
 4668       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4669     }
 4670   %}
 4671   ins_pipe( fpu_reg_reg );
 4672 %}
 4673 
 4674 // ====================ReplicateD=======================================
 4675 
 4676 // Replicate double (8 bytes) scalar to be vector
 4677 instruct vReplD_reg(vec dst, vlRegD src) %{
 4678   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4679   match(Set dst (Replicate src));
 4680   format %{ "replicateD $dst,$src" %}
 4681   ins_encode %{
 4682     uint vlen = Matcher::vector_length(this);
 4683     int vlen_enc = vector_length_encoding(this);
 4684     if (vlen <= 2) {
 4685       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4686     } else if (VM_Version::supports_avx2()) {
 4687       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4688     } else {
 4689       assert(vlen == 4, "sanity");
 4690       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4691       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4692     }
 4693   %}
 4694   ins_pipe( pipe_slow );
 4695 %}
 4696 
 4697 instruct ReplD_reg(vec dst, vlRegD src) %{
 4698   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4699   match(Set dst (Replicate src));
 4700   format %{ "replicateD $dst,$src" %}
 4701   ins_encode %{
 4702     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4703   %}
 4704   ins_pipe( pipe_slow );
 4705 %}
 4706 
 4707 instruct ReplD_mem(vec dst, memory mem) %{
 4708   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4709   match(Set dst (Replicate (LoadD mem)));
 4710   format %{ "replicateD $dst,$mem" %}
 4711   ins_encode %{
 4712     if (Matcher::vector_length(this) >= 4) {
 4713       int vlen_enc = vector_length_encoding(this);
 4714       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4715     } else {
 4716       __ movddup($dst$$XMMRegister, $mem$$Address);
 4717     }
 4718   %}
 4719   ins_pipe( pipe_slow );
 4720 %}
 4721 
 4722 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4723 instruct ReplD_imm(vec dst, immD con) %{
 4724   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4725   match(Set dst (Replicate con));
 4726   format %{ "replicateD $dst,$con" %}
 4727   ins_encode %{
 4728     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4729     int vlen = Matcher::vector_length_in_bytes(this);
 4730     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4731   %}
 4732   ins_pipe( pipe_slow );
 4733 %}
 4734 
 4735 instruct ReplD_zero(vec dst, immD0 zero) %{
 4736   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4737   match(Set dst (Replicate zero));
 4738   format %{ "replicateD $dst,$zero" %}
 4739   ins_encode %{
 4740     int vlen_enc = vector_length_encoding(this);
 4741     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4742       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4743     } else {
 4744       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4745     }
 4746   %}
 4747   ins_pipe( fpu_reg_reg );
 4748 %}
 4749 
 4750 // ====================VECTOR INSERT=======================================
 4751 
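// For vectors wider than 128 bits there is no direct scalar-insert instruction, so the
// rules below extract the 128-bit lane containing the element (y_idx), insert the scalar
// within that lane (x_idx), and write the lane back into the destination.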
 4752 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4753   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4754   match(Set dst (VectorInsert (Binary dst val) idx));
 4755   format %{ "vector_insert $dst,$val,$idx" %}
 4756   ins_encode %{
 4757     assert(UseSSE >= 4, "required");
 4758     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4759 
 4760     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4761 
 4762     assert(is_integral_type(elem_bt), "");
 4763     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4764 
 4765     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4766   %}
 4767   ins_pipe( pipe_slow );
 4768 %}
 4769 
 4770 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4771   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4772   match(Set dst (VectorInsert (Binary src val) idx));
 4773   effect(TEMP vtmp);
 4774   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4775   ins_encode %{
 4776     int vlen_enc = Assembler::AVX_256bit;
 4777     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4778     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4779     int log2epr = log2(elem_per_lane);
 4780 
 4781     assert(is_integral_type(elem_bt), "sanity");
 4782     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4783 
 4784     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4785     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4786     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4787     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4788     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4789   %}
 4790   ins_pipe( pipe_slow );
 4791 %}
 4792 
 4793 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4794   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4795   match(Set dst (VectorInsert (Binary src val) idx));
 4796   effect(TEMP vtmp);
 4797   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4798   ins_encode %{
 4799     assert(UseAVX > 2, "sanity");
 4800 
 4801     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4802     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4803     int log2epr = log2(elem_per_lane);
 4804 
 4805     assert(is_integral_type(elem_bt), "");
 4806     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4807 
 4808     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4809     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4810     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4811     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4812     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4813   %}
 4814   ins_pipe( pipe_slow );
 4815 %}
 4816 
 4817 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4818   predicate(Matcher::vector_length(n) == 2);
 4819   match(Set dst (VectorInsert (Binary dst val) idx));
 4820   format %{ "vector_insert $dst,$val,$idx" %}
 4821   ins_encode %{
 4822     assert(UseSSE >= 4, "required");
 4823     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4824     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4825 
 4826     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4827   %}
 4828   ins_pipe( pipe_slow );
 4829 %}
 4830 
 4831 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4832   predicate(Matcher::vector_length(n) == 4);
 4833   match(Set dst (VectorInsert (Binary src val) idx));
 4834   effect(TEMP vtmp);
 4835   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4836   ins_encode %{
 4837     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4838     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4839 
 4840     uint x_idx = $idx$$constant & right_n_bits(1);
 4841     uint y_idx = ($idx$$constant >> 1) & 1;
 4842     int vlen_enc = Assembler::AVX_256bit;
 4843     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4844     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4845     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4846   %}
 4847   ins_pipe( pipe_slow );
 4848 %}
 4849 
 4850 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4851   predicate(Matcher::vector_length(n) == 8);
 4852   match(Set dst (VectorInsert (Binary src val) idx));
 4853   effect(TEMP vtmp);
 4854   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4855   ins_encode %{
 4856     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4857     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4858 
 4859     uint x_idx = $idx$$constant & right_n_bits(1);
 4860     uint y_idx = ($idx$$constant >> 1) & 3;
 4861     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4862     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4863     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4864   %}
 4865   ins_pipe( pipe_slow );
 4866 %}
 4867 
 4868 instruct insertF(vec dst, regF val, immU8 idx) %{
 4869   predicate(Matcher::vector_length(n) < 8);
 4870   match(Set dst (VectorInsert (Binary dst val) idx));
 4871   format %{ "vector_insert $dst,$val,$idx" %}
 4872   ins_encode %{
 4873     assert(UseSSE >= 4, "sanity");
 4874 
 4875     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4876     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4877 
 4878     uint x_idx = $idx$$constant & right_n_bits(2);
 4879     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4880   %}
 4881   ins_pipe( pipe_slow );
 4882 %}
 4883 
 4884 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4885   predicate(Matcher::vector_length(n) >= 8);
 4886   match(Set dst (VectorInsert (Binary src val) idx));
 4887   effect(TEMP vtmp);
 4888   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4889   ins_encode %{
 4890     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4891     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4892 
 4893     int vlen = Matcher::vector_length(this);
 4894     uint x_idx = $idx$$constant & right_n_bits(2);
 4895     if (vlen == 8) {
 4896       uint y_idx = ($idx$$constant >> 2) & 1;
 4897       int vlen_enc = Assembler::AVX_256bit;
 4898       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4899       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4900       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4901     } else {
 4902       assert(vlen == 16, "sanity");
 4903       uint y_idx = ($idx$$constant >> 2) & 3;
 4904       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4905       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4906       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4907     }
 4908   %}
 4909   ins_pipe( pipe_slow );
 4910 %}
 4911 
 4912 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4913   predicate(Matcher::vector_length(n) == 2);
 4914   match(Set dst (VectorInsert (Binary dst val) idx));
 4915   effect(TEMP tmp);
 4916   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4917   ins_encode %{
 4918     assert(UseSSE >= 4, "sanity");
 4919     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4920     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4921 
 4922     __ movq($tmp$$Register, $val$$XMMRegister);
 4923     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4924   %}
 4925   ins_pipe( pipe_slow );
 4926 %}
 4927 
 4928 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4929   predicate(Matcher::vector_length(n) == 4);
 4930   match(Set dst (VectorInsert (Binary src val) idx));
 4931   effect(TEMP vtmp, TEMP tmp);
 4932   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4933   ins_encode %{
 4934     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4935     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4936 
 4937     uint x_idx = $idx$$constant & right_n_bits(1);
 4938     uint y_idx = ($idx$$constant >> 1) & 1;
 4940     __ movq($tmp$$Register, $val$$XMMRegister);
 4941     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4942     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4943     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4944   %}
 4945   ins_pipe( pipe_slow );
 4946 %}
 4947 
 4948 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 4949   predicate(Matcher::vector_length(n) == 8);
 4950   match(Set dst (VectorInsert (Binary src val) idx));
 4951   effect(TEMP tmp, TEMP vtmp);
 4952   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4953   ins_encode %{
 4954     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4955     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4956 
 4957     uint x_idx = $idx$$constant & right_n_bits(1);
 4958     uint y_idx = ($idx$$constant >> 1) & 3;
 4959     __ movq($tmp$$Register, $val$$XMMRegister);
 4960     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4961     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4962     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4963   %}
 4964   ins_pipe( pipe_slow );
 4965 %}
 4966 
 4967 // ====================REDUCTION ARITHMETIC=======================================
 4968 
 4969 // =======================Int Reduction==========================================
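      // These rules reduce all lanes of src2 into a single scalar, folding in the
      // scalar input src1. Roughly (an illustrative sketch only; the real sequence
      // is emitted by the reduceI() helper):
      //   int acc = src1;
      //   for (int i = 0; i < vlen; i++) { acc = op(acc, src2[i]); }
      //   dst = acc;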
 4970 
 4971 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4972   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4973   match(Set dst (AddReductionVI src1 src2));
 4974   match(Set dst (MulReductionVI src1 src2));
 4975   match(Set dst (AndReductionV  src1 src2));
 4976   match(Set dst ( OrReductionV  src1 src2));
 4977   match(Set dst (XorReductionV  src1 src2));
 4978   match(Set dst (MinReductionV  src1 src2));
 4979   match(Set dst (MaxReductionV  src1 src2));
 4980   effect(TEMP vtmp1, TEMP vtmp2);
 4981   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4982   ins_encode %{
 4983     int opcode = this->ideal_Opcode();
 4984     int vlen = Matcher::vector_length(this, $src2);
 4985     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4986   %}
 4987   ins_pipe( pipe_slow );
 4988 %}
 4989 
 4990 // =======================Long Reduction==========================================
 4991 
 4992 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4993   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4994   match(Set dst (AddReductionVL src1 src2));
 4995   match(Set dst (MulReductionVL src1 src2));
 4996   match(Set dst (AndReductionV  src1 src2));
 4997   match(Set dst ( OrReductionV  src1 src2));
 4998   match(Set dst (XorReductionV  src1 src2));
 4999   match(Set dst (MinReductionV  src1 src2));
 5000   match(Set dst (MaxReductionV  src1 src2));
 5001   effect(TEMP vtmp1, TEMP vtmp2);
 5002   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5003   ins_encode %{
 5004     int opcode = this->ideal_Opcode();
 5005     int vlen = Matcher::vector_length(this, $src2);
 5006     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5007   %}
 5008   ins_pipe( pipe_slow );
 5009 %}
 5010 
 5011 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5012   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5013   match(Set dst (AddReductionVL src1 src2));
 5014   match(Set dst (MulReductionVL src1 src2));
 5015   match(Set dst (AndReductionV  src1 src2));
 5016   match(Set dst ( OrReductionV  src1 src2));
 5017   match(Set dst (XorReductionV  src1 src2));
 5018   match(Set dst (MinReductionV  src1 src2));
 5019   match(Set dst (MaxReductionV  src1 src2));
 5020   effect(TEMP vtmp1, TEMP vtmp2);
 5021   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5022   ins_encode %{
 5023     int opcode = this->ideal_Opcode();
 5024     int vlen = Matcher::vector_length(this, $src2);
 5025     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5026   %}
 5027   ins_pipe( pipe_slow );
 5028 %}
 5029 
 5030 // =======================Float Reduction==========================================
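      // Strictly ordered variants (requires_strict_order) accumulate the lanes in
      // order, starting from the value already in dst, since FP add/mul is not
      // associative. Illustrative sketch only (reduce_fp emits the real sequence):
      //   float acc = dst;
      //   for (int i = 0; i < vlen; i++) { acc = op(acc, src[i]); }
      // The unordered variants further below are for the Vector API and may combine
      // lanes in any order, starting from the identity value passed in src1.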
 5031 
 5032 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5033   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5034   match(Set dst (AddReductionVF dst src));
 5035   match(Set dst (MulReductionVF dst src));
 5036   effect(TEMP dst, TEMP vtmp);
 5037   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5038   ins_encode %{
 5039     int opcode = this->ideal_Opcode();
 5040     int vlen = Matcher::vector_length(this, $src);
 5041     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5042   %}
 5043   ins_pipe( pipe_slow );
 5044 %}
 5045 
 5046 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5047   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5048   match(Set dst (AddReductionVF dst src));
 5049   match(Set dst (MulReductionVF dst src));
 5050   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5051   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5052   ins_encode %{
 5053     int opcode = this->ideal_Opcode();
 5054     int vlen = Matcher::vector_length(this, $src);
 5055     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5056   %}
 5057   ins_pipe( pipe_slow );
 5058 %}
 5059 
 5060 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5061   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5062   match(Set dst (AddReductionVF dst src));
 5063   match(Set dst (MulReductionVF dst src));
 5064   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5065   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5066   ins_encode %{
 5067     int opcode = this->ideal_Opcode();
 5068     int vlen = Matcher::vector_length(this, $src);
 5069     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5070   %}
 5071   ins_pipe( pipe_slow );
 5072 %}
 5073 
 5074 
 5075 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5076   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5077   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5078   // src1 contains reduction identity
 5079   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5080   match(Set dst (AddReductionVF src1 src2));
 5081   match(Set dst (MulReductionVF src1 src2));
 5082   effect(TEMP dst);
 5083   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5084   ins_encode %{
 5085     int opcode = this->ideal_Opcode();
 5086     int vlen = Matcher::vector_length(this, $src2);
 5087     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5088   %}
 5089   ins_pipe( pipe_slow );
 5090 %}
 5091 
 5092 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5093   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5094   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5095   // src1 contains reduction identity
 5096   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5097   match(Set dst (AddReductionVF src1 src2));
 5098   match(Set dst (MulReductionVF src1 src2));
 5099   effect(TEMP dst, TEMP vtmp);
 5100   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5101   ins_encode %{
 5102     int opcode = this->ideal_Opcode();
 5103     int vlen = Matcher::vector_length(this, $src2);
 5104     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5105   %}
 5106   ins_pipe( pipe_slow );
 5107 %}
 5108 
 5109 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5110   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5111   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5112   // src1 contains reduction identity
 5113   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5114   match(Set dst (AddReductionVF src1 src2));
 5115   match(Set dst (MulReductionVF src1 src2));
 5116   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5117   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5118   ins_encode %{
 5119     int opcode = this->ideal_Opcode();
 5120     int vlen = Matcher::vector_length(this, $src2);
 5121     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5122   %}
 5123   ins_pipe( pipe_slow );
 5124 %}
 5125 
 5126 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5127   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5128   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5129   // src1 contains reduction identity
 5130   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5131   match(Set dst (AddReductionVF src1 src2));
 5132   match(Set dst (MulReductionVF src1 src2));
 5133   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5134   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5135   ins_encode %{
 5136     int opcode = this->ideal_Opcode();
 5137     int vlen = Matcher::vector_length(this, $src2);
 5138     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5139   %}
 5140   ins_pipe( pipe_slow );
 5141 %}
 5142 
 5143 // =======================Double Reduction==========================================
 5144 
 5145 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5146   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5147   match(Set dst (AddReductionVD dst src));
 5148   match(Set dst (MulReductionVD dst src));
 5149   effect(TEMP dst, TEMP vtmp);
 5150   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5151   ins_encode %{
 5152     int opcode = this->ideal_Opcode();
 5153     int vlen = Matcher::vector_length(this, $src);
 5154     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5155   %}
 5156   ins_pipe( pipe_slow );
 5157 %}
 5158 
 5159 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5160   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5161   match(Set dst (AddReductionVD dst src));
 5162   match(Set dst (MulReductionVD dst src));
 5163   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5164   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5165   ins_encode %{
 5166     int opcode = this->ideal_Opcode();
 5167     int vlen = Matcher::vector_length(this, $src);
 5168     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5169   %}
 5170   ins_pipe( pipe_slow );
 5171 %}
 5172 
 5173 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5174   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5175   match(Set dst (AddReductionVD dst src));
 5176   match(Set dst (MulReductionVD dst src));
 5177   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5178   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5179   ins_encode %{
 5180     int opcode = this->ideal_Opcode();
 5181     int vlen = Matcher::vector_length(this, $src);
 5182     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5183   %}
 5184   ins_pipe( pipe_slow );
 5185 %}
 5186 
 5187 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5188   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5189   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5190   // src1 contains reduction identity
 5191   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5192   match(Set dst (AddReductionVD src1 src2));
 5193   match(Set dst (MulReductionVD src1 src2));
 5194   effect(TEMP dst);
 5195   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5196   ins_encode %{
 5197     int opcode = this->ideal_Opcode();
 5198     int vlen = Matcher::vector_length(this, $src2);
 5199     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5200   %}
 5201   ins_pipe( pipe_slow );
 5202 %}
 5203 
 5204 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5205   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5206   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5207   // src1 contains reduction identity
 5208   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5209   match(Set dst (AddReductionVD src1 src2));
 5210   match(Set dst (MulReductionVD src1 src2));
 5211   effect(TEMP dst, TEMP vtmp);
 5212   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5213   ins_encode %{
 5214     int opcode = this->ideal_Opcode();
 5215     int vlen = Matcher::vector_length(this, $src2);
 5216     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5217   %}
 5218   ins_pipe( pipe_slow );
 5219 %}
 5220 
 5221 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5222   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5223   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5224   // src1 contains reduction identity
 5225   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5226   match(Set dst (AddReductionVD src1 src2));
 5227   match(Set dst (MulReductionVD src1 src2));
 5228   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5229   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5230   ins_encode %{
 5231     int opcode = this->ideal_Opcode();
 5232     int vlen = Matcher::vector_length(this, $src2);
 5233     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5234   %}
 5235   ins_pipe( pipe_slow );
 5236 %}
 5237 
 5238 // =======================Byte Reduction==========================================
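      // Two variants (a descriptive note): without AVX-512BW the byte lowering must
      // stay in legacy vector registers (legVec, i.e. xmm0-xmm15); with AVX-512BW
      // the full EVEX register file can be used.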
 5239 
 5240 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5241   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5242   match(Set dst (AddReductionVI src1 src2));
 5243   match(Set dst (AndReductionV  src1 src2));
 5244   match(Set dst ( OrReductionV  src1 src2));
 5245   match(Set dst (XorReductionV  src1 src2));
 5246   match(Set dst (MinReductionV  src1 src2));
 5247   match(Set dst (MaxReductionV  src1 src2));
 5248   effect(TEMP vtmp1, TEMP vtmp2);
 5249   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5250   ins_encode %{
 5251     int opcode = this->ideal_Opcode();
 5252     int vlen = Matcher::vector_length(this, $src2);
 5253     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5254   %}
 5255   ins_pipe( pipe_slow );
 5256 %}
 5257 
 5258 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5259   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5260   match(Set dst (AddReductionVI src1 src2));
 5261   match(Set dst (AndReductionV  src1 src2));
 5262   match(Set dst ( OrReductionV  src1 src2));
 5263   match(Set dst (XorReductionV  src1 src2));
 5264   match(Set dst (MinReductionV  src1 src2));
 5265   match(Set dst (MaxReductionV  src1 src2));
 5266   effect(TEMP vtmp1, TEMP vtmp2);
 5267   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5268   ins_encode %{
 5269     int opcode = this->ideal_Opcode();
 5270     int vlen = Matcher::vector_length(this, $src2);
 5271     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5272   %}
 5273   ins_pipe( pipe_slow );
 5274 %}
 5275 
 5276 // =======================Short Reduction==========================================
 5277 
 5278 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5279   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5280   match(Set dst (AddReductionVI src1 src2));
 5281   match(Set dst (MulReductionVI src1 src2));
 5282   match(Set dst (AndReductionV  src1 src2));
 5283   match(Set dst ( OrReductionV  src1 src2));
 5284   match(Set dst (XorReductionV  src1 src2));
 5285   match(Set dst (MinReductionV  src1 src2));
 5286   match(Set dst (MaxReductionV  src1 src2));
 5287   effect(TEMP vtmp1, TEMP vtmp2);
 5288   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5289   ins_encode %{
 5290     int opcode = this->ideal_Opcode();
 5291     int vlen = Matcher::vector_length(this, $src2);
 5292     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5293   %}
 5294   ins_pipe( pipe_slow );
 5295 %}
 5296 
 5297 // =======================Mul Reduction==========================================
 5298 
 5299 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5300   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5301             Matcher::vector_length(n->in(2)) <= 32); // src2
 5302   match(Set dst (MulReductionVI src1 src2));
 5303   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5304   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5305   ins_encode %{
 5306     int opcode = this->ideal_Opcode();
 5307     int vlen = Matcher::vector_length(this, $src2);
 5308     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5309   %}
 5310   ins_pipe( pipe_slow );
 5311 %}
 5312 
 5313 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5314   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5315             Matcher::vector_length(n->in(2)) == 64); // src2
 5316   match(Set dst (MulReductionVI src1 src2));
 5317   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5318   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5319   ins_encode %{
 5320     int opcode = this->ideal_Opcode();
 5321     int vlen = Matcher::vector_length(this, $src2);
 5322     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5323   %}
 5324   ins_pipe( pipe_slow );
 5325 %}
 5326 
 5327 //--------------------Min/Max Float Reduction --------------------
 5328 // Float Min/Max Reduction
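      // Min/max FP reductions need several temporaries and a flags kill largely
      // because they must follow Java Math.min/max semantics (NaN propagation,
      // -0.0 ordered below +0.0), which plain minps/maxps do not give. The rules
      // taking a scalar immF input require it to be the identity value (+Inf for
      // min, -Inf for max); the *_av variants fold in the value already in dst.
      // The exact sequence is emitted by reduceFloatMinMax.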
 5329 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5330                             legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5331   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5332             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5333              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5334             Matcher::vector_length(n->in(2)) == 2);
 5335   match(Set dst (MinReductionV src1 src2));
 5336   match(Set dst (MaxReductionV src1 src2));
 5337   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5338   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5339   ins_encode %{
 5340     assert(UseAVX > 0, "sanity");
 5341 
 5342     int opcode = this->ideal_Opcode();
 5343     int vlen = Matcher::vector_length(this, $src2);
 5344     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5345                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5346   %}
 5347   ins_pipe( pipe_slow );
 5348 %}
 5349 
 5350 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5351                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5352   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5353             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5354              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5355             Matcher::vector_length(n->in(2)) >= 4);
 5356   match(Set dst (MinReductionV src1 src2));
 5357   match(Set dst (MaxReductionV src1 src2));
 5358   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5359   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5360   ins_encode %{
 5361     assert(UseAVX > 0, "sanity");
 5362 
 5363     int opcode = this->ideal_Opcode();
 5364     int vlen = Matcher::vector_length(this, $src2);
 5365     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5366                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5367   %}
 5368   ins_pipe( pipe_slow );
 5369 %}
 5370 
 5371 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, legVec atmp,
 5372                                legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5373   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5374             Matcher::vector_length(n->in(2)) == 2);
 5375   match(Set dst (MinReductionV dst src));
 5376   match(Set dst (MaxReductionV dst src));
 5377   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5378   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5379   ins_encode %{
 5380     assert(UseAVX > 0, "sanity");
 5381 
 5382     int opcode = this->ideal_Opcode();
 5383     int vlen = Matcher::vector_length(this, $src);
 5384     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5385                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5386   %}
 5387   ins_pipe( pipe_slow );
 5388 %}
 5389 
 5390 
 5391 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, legVec atmp, legVec btmp,
 5392                               legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5393   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5394             Matcher::vector_length(n->in(2)) >= 4);
 5395   match(Set dst (MinReductionV dst src));
 5396   match(Set dst (MaxReductionV dst src));
 5397   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5398   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5399   ins_encode %{
 5400     assert(UseAVX > 0, "sanity");
 5401 
 5402     int opcode = this->ideal_Opcode();
 5403     int vlen = Matcher::vector_length(this, $src);
 5404     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5405                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5406   %}
 5407   ins_pipe( pipe_slow );
 5408 %}
 5409 
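      // The AVX10.2 variants below need fewer temporaries and no flags kill,
      // presumably because the AVX10.2 minmax instructions provide the required NaN
      // and signed-zero ordering directly (a summary of the rule split only; the
      // instruction sequence itself is emitted by reduceFloatMinMax).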
 5410 instruct minmax_reduction2F_avx10(regF dst, immF src1, vec src2, vec xtmp1) %{
 5411   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5412             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5413              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5414             Matcher::vector_length(n->in(2)) == 2);
 5415   match(Set dst (MinReductionV src1 src2));
 5416   match(Set dst (MaxReductionV src1 src2));
 5417   effect(TEMP dst, TEMP xtmp1);
 5418   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 as TEMP" %}
 5419   ins_encode %{
 5420     int opcode = this->ideal_Opcode();
 5421     int vlen = Matcher::vector_length(this, $src2);
 5422     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5423                          xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5424   %}
 5425   ins_pipe( pipe_slow );
 5426 %}
 5427 
 5428 instruct minmax_reductionF_avx10(regF dst, immF src1, vec src2, vec xtmp1, vec xtmp2) %{
 5429   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5430             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5431              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5432             Matcher::vector_length(n->in(2)) >= 4);
 5433   match(Set dst (MinReductionV src1 src2));
 5434   match(Set dst (MaxReductionV src1 src2));
 5435   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5436   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5437   ins_encode %{
 5438     int opcode = this->ideal_Opcode();
 5439     int vlen = Matcher::vector_length(this, $src2);
 5440     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5441                          xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5442   %}
 5443   ins_pipe( pipe_slow );
 5444 %}
 5445 
 5446 instruct minmax_reduction2F_avx10_av(regF dst, vec src, vec xtmp1) %{
 5447   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5448             Matcher::vector_length(n->in(2)) == 2);
 5449   match(Set dst (MinReductionV dst src));
 5450   match(Set dst (MaxReductionV dst src));
 5451   effect(TEMP dst, TEMP xtmp1);
 5452   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 as TEMP" %}
 5453   ins_encode %{
 5454     int opcode = this->ideal_Opcode();
 5455     int vlen = Matcher::vector_length(this, $src);
 5456     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5457                          $xtmp1$$XMMRegister);
 5458   %}
 5459   ins_pipe( pipe_slow );
 5460 %}
 5461 
 5462 instruct minmax_reductionF_avx10_av(regF dst, vec src, vec xtmp1, vec xtmp2) %{
 5463   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5464             Matcher::vector_length(n->in(2)) >= 4);
 5465   match(Set dst (MinReductionV dst src));
 5466   match(Set dst (MaxReductionV dst src));
 5467   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5468   format %{ "vector_minmaxF_reduction $dst, $src \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5469   ins_encode %{
 5470     int opcode = this->ideal_Opcode();
 5471     int vlen = Matcher::vector_length(this, $src);
 5472     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5473                          $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5474   %}
 5475   ins_pipe( pipe_slow );
 5476 %}
 5477 
 5478 //--------------------Min/Max Double Reduction --------------------
 5479 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5480                             legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5481   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5482             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5483              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5484             Matcher::vector_length(n->in(2)) == 2);
 5485   match(Set dst (MinReductionV src1 src2));
 5486   match(Set dst (MaxReductionV src1 src2));
 5487   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5488   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5489   ins_encode %{
 5490     assert(UseAVX > 0, "sanity");
 5491 
 5492     int opcode = this->ideal_Opcode();
 5493     int vlen = Matcher::vector_length(this, $src2);
 5494     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5495                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5496   %}
 5497   ins_pipe( pipe_slow );
 5498 %}
 5499 
 5500 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5501                            legVec tmp3, legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5502   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5503             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5504              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5505             Matcher::vector_length(n->in(2)) >= 4);
 5506   match(Set dst (MinReductionV src1 src2));
 5507   match(Set dst (MaxReductionV src1 src2));
 5508   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5509   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5510   ins_encode %{
 5511     assert(UseAVX > 0, "sanity");
 5512 
 5513     int opcode = this->ideal_Opcode();
 5514     int vlen = Matcher::vector_length(this, $src2);
 5515     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5516                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5517   %}
 5518   ins_pipe( pipe_slow );
 5519 %}
 5520 
 5521 
 5522 instruct minmax_reduction2D_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2,
 5523                                legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5524   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5525             Matcher::vector_length(n->in(2)) == 2);
 5526   match(Set dst (MinReductionV dst src));
 5527   match(Set dst (MaxReductionV dst src));
 5528   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5529   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5530   ins_encode %{
 5531     assert(UseAVX > 0, "sanity");
 5532 
 5533     int opcode = this->ideal_Opcode();
 5534     int vlen = Matcher::vector_length(this, $src);
 5535     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5536                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5537   %}
 5538   ins_pipe( pipe_slow );
 5539 %}
 5540 
 5541 instruct minmax_reductionD_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2, legVec tmp3,
 5542                               legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5543   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5544             Matcher::vector_length(n->in(2)) >= 4);
 5545   match(Set dst (MinReductionV dst src));
 5546   match(Set dst (MaxReductionV dst src));
 5547   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5548   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5549   ins_encode %{
 5550     assert(UseAVX > 0, "sanity");
 5551 
 5552     int opcode = this->ideal_Opcode();
 5553     int vlen = Matcher::vector_length(this, $src);
 5554     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5555                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5556   %}
 5557   ins_pipe( pipe_slow );
 5558 %}
 5559 
 5560 instruct minmax_reduction2D_avx10(regD dst, immD src1, vec src2, vec xtmp1) %{
 5561   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5562             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5563              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5564             Matcher::vector_length(n->in(2)) == 2);
 5565   match(Set dst (MinReductionV src1 src2));
 5566   match(Set dst (MaxReductionV src1 src2));
 5567   effect(TEMP dst, TEMP xtmp1);
 5568   format %{ "vector_minmax2D_reduction $dst, $src1, $src2 ; using $xtmp1 as TEMP" %}
 5569   ins_encode %{
 5570     int opcode = this->ideal_Opcode();
 5571     int vlen = Matcher::vector_length(this, $src2);
 5572     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg,
 5573                           xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5574   %}
 5575   ins_pipe( pipe_slow );
 5576 %}
 5577 
 5578 instruct minmax_reductionD_avx10(regD dst, immD src1, vec src2, vec xtmp1, vec xtmp2) %{
 5579   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5580             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5581              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5582             Matcher::vector_length(n->in(2)) >= 4);
 5583   match(Set dst (MinReductionV src1 src2));
 5584   match(Set dst (MaxReductionV src1 src2));
 5585   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5586   format %{ "vector_minmaxD_reduction $dst, $src1, $src2 ; using $xtmp1 and $xtmp2 as TEMP" %}
 5587   ins_encode %{
 5588     int opcode = this->ideal_Opcode();
 5589     int vlen = Matcher::vector_length(this, $src2);
 5590     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5591                           xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5592   %}
 5593   ins_pipe( pipe_slow );
 5594 %}
 5595 
 5596 
 5597 instruct minmax_reduction2D_av_avx10(regD dst, vec src, vec xtmp1) %{
 5598   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5599             Matcher::vector_length(n->in(2)) == 2);
 5600   match(Set dst (MinReductionV dst src));
 5601   match(Set dst (MaxReductionV dst src));
 5602   effect(TEMP dst, TEMP xtmp1);
 5603   format %{ "vector_minmax2D_reduction $dst, $src ; using $xtmp1 as TEMP" %}
 5604   ins_encode %{
 5605     int opcode = this->ideal_Opcode();
 5606     int vlen = Matcher::vector_length(this, $src);
 5607     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5608                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5609   %}
 5610   ins_pipe( pipe_slow );
 5611 %}
 5612 
 5613 instruct minmax_reductionD_av_avx10(regD dst, vec src, vec xtmp1, vec xtmp2) %{
 5614   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5615             Matcher::vector_length(n->in(2)) >= 4);
 5616   match(Set dst (MinReductionV dst src));
 5617   match(Set dst (MaxReductionV dst src));
 5618   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5619   format %{ "vector_minmaxD_reduction $dst, $src ; using $xtmp1 and $xtmp2 as TEMP" %}
 5620   ins_encode %{
 5621     int opcode = this->ideal_Opcode();
 5622     int vlen = Matcher::vector_length(this, $src);
 5623     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5624                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5625   %}
 5626   ins_pipe( pipe_slow );
 5627 %}
 5628 
 5629 // ====================VECTOR ARITHMETIC=======================================
 5630 
 5631 // --------------------------------- ADD --------------------------------------
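      // Each element type gets three rule shapes below: a two-operand SSE form that
      // updates dst in place (UseAVX == 0), a three-operand AVX register form, and
      // an AVX form with a memory operand for vectors wider than 8 bytes.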
 5632 
 5633 // Bytes vector add
 5634 instruct vaddB(vec dst, vec src) %{
 5635   predicate(UseAVX == 0);
 5636   match(Set dst (AddVB dst src));
 5637   format %{ "paddb   $dst,$src\t! add packedB" %}
 5638   ins_encode %{
 5639     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5640   %}
 5641   ins_pipe( pipe_slow );
 5642 %}
 5643 
 5644 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5645   predicate(UseAVX > 0);
 5646   match(Set dst (AddVB src1 src2));
 5647   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5648   ins_encode %{
 5649     int vlen_enc = vector_length_encoding(this);
 5650     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5651   %}
 5652   ins_pipe( pipe_slow );
 5653 %}
 5654 
 5655 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5656   predicate((UseAVX > 0) &&
 5657             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5658   match(Set dst (AddVB src (LoadVector mem)));
 5659   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5660   ins_encode %{
 5661     int vlen_enc = vector_length_encoding(this);
 5662     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5663   %}
 5664   ins_pipe( pipe_slow );
 5665 %}
 5666 
 5667 // Shorts/Chars vector add
 5668 instruct vaddS(vec dst, vec src) %{
 5669   predicate(UseAVX == 0);
 5670   match(Set dst (AddVS dst src));
 5671   format %{ "paddw   $dst,$src\t! add packedS" %}
 5672   ins_encode %{
 5673     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5674   %}
 5675   ins_pipe( pipe_slow );
 5676 %}
 5677 
 5678 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5679   predicate(UseAVX > 0);
 5680   match(Set dst (AddVS src1 src2));
 5681   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5682   ins_encode %{
 5683     int vlen_enc = vector_length_encoding(this);
 5684     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5685   %}
 5686   ins_pipe( pipe_slow );
 5687 %}
 5688 
 5689 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5690   predicate((UseAVX > 0) &&
 5691             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5692   match(Set dst (AddVS src (LoadVector mem)));
 5693   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5694   ins_encode %{
 5695     int vlen_enc = vector_length_encoding(this);
 5696     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5697   %}
 5698   ins_pipe( pipe_slow );
 5699 %}
 5700 
 5701 // Integers vector add
 5702 instruct vaddI(vec dst, vec src) %{
 5703   predicate(UseAVX == 0);
 5704   match(Set dst (AddVI dst src));
 5705   format %{ "paddd   $dst,$src\t! add packedI" %}
 5706   ins_encode %{
 5707     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5708   %}
 5709   ins_pipe( pipe_slow );
 5710 %}
 5711 
 5712 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5713   predicate(UseAVX > 0);
 5714   match(Set dst (AddVI src1 src2));
 5715   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5716   ins_encode %{
 5717     int vlen_enc = vector_length_encoding(this);
 5718     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5719   %}
 5720   ins_pipe( pipe_slow );
 5721 %}
 5722 
 5723 
 5724 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5725   predicate((UseAVX > 0) &&
 5726             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5727   match(Set dst (AddVI src (LoadVector mem)));
 5728   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5729   ins_encode %{
 5730     int vlen_enc = vector_length_encoding(this);
 5731     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5732   %}
 5733   ins_pipe( pipe_slow );
 5734 %}
 5735 
 5736 // Longs vector add
 5737 instruct vaddL(vec dst, vec src) %{
 5738   predicate(UseAVX == 0);
 5739   match(Set dst (AddVL dst src));
 5740   format %{ "paddq   $dst,$src\t! add packedL" %}
 5741   ins_encode %{
 5742     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5743   %}
 5744   ins_pipe( pipe_slow );
 5745 %}
 5746 
 5747 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5748   predicate(UseAVX > 0);
 5749   match(Set dst (AddVL src1 src2));
 5750   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5751   ins_encode %{
 5752     int vlen_enc = vector_length_encoding(this);
 5753     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5754   %}
 5755   ins_pipe( pipe_slow );
 5756 %}
 5757 
 5758 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5759   predicate((UseAVX > 0) &&
 5760             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5761   match(Set dst (AddVL src (LoadVector mem)));
 5762   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5763   ins_encode %{
 5764     int vlen_enc = vector_length_encoding(this);
 5765     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5766   %}
 5767   ins_pipe( pipe_slow );
 5768 %}
 5769 
 5770 // Floats vector add
 5771 instruct vaddF(vec dst, vec src) %{
 5772   predicate(UseAVX == 0);
 5773   match(Set dst (AddVF dst src));
 5774   format %{ "addps   $dst,$src\t! add packedF" %}
 5775   ins_encode %{
 5776     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5777   %}
 5778   ins_pipe( pipe_slow );
 5779 %}
 5780 
 5781 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5782   predicate(UseAVX > 0);
 5783   match(Set dst (AddVF src1 src2));
 5784   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5785   ins_encode %{
 5786     int vlen_enc = vector_length_encoding(this);
 5787     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5788   %}
 5789   ins_pipe( pipe_slow );
 5790 %}
 5791 
 5792 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5793   predicate((UseAVX > 0) &&
 5794             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5795   match(Set dst (AddVF src (LoadVector mem)));
 5796   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5797   ins_encode %{
 5798     int vlen_enc = vector_length_encoding(this);
 5799     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5800   %}
 5801   ins_pipe( pipe_slow );
 5802 %}
 5803 
 5804 // Doubles vector add
 5805 instruct vaddD(vec dst, vec src) %{
 5806   predicate(UseAVX == 0);
 5807   match(Set dst (AddVD dst src));
 5808   format %{ "addpd   $dst,$src\t! add packedD" %}
 5809   ins_encode %{
 5810     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5811   %}
 5812   ins_pipe( pipe_slow );
 5813 %}
 5814 
 5815 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5816   predicate(UseAVX > 0);
 5817   match(Set dst (AddVD src1 src2));
 5818   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5819   ins_encode %{
 5820     int vlen_enc = vector_length_encoding(this);
 5821     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5822   %}
 5823   ins_pipe( pipe_slow );
 5824 %}
 5825 
 5826 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5827   predicate((UseAVX > 0) &&
 5828             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5829   match(Set dst (AddVD src (LoadVector mem)));
 5830   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5831   ins_encode %{
 5832     int vlen_enc = vector_length_encoding(this);
 5833     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5834   %}
 5835   ins_pipe( pipe_slow );
 5836 %}
 5837 
 5838 // --------------------------------- SUB --------------------------------------
 5839 
 5840 // Bytes vector sub
 5841 instruct vsubB(vec dst, vec src) %{
 5842   predicate(UseAVX == 0);
 5843   match(Set dst (SubVB dst src));
 5844   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5845   ins_encode %{
 5846     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5847   %}
 5848   ins_pipe( pipe_slow );
 5849 %}
 5850 
 5851 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5852   predicate(UseAVX > 0);
 5853   match(Set dst (SubVB src1 src2));
 5854   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5855   ins_encode %{
 5856     int vlen_enc = vector_length_encoding(this);
 5857     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5858   %}
 5859   ins_pipe( pipe_slow );
 5860 %}
 5861 
 5862 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5863   predicate((UseAVX > 0) &&
 5864             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5865   match(Set dst (SubVB src (LoadVector mem)));
 5866   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5867   ins_encode %{
 5868     int vlen_enc = vector_length_encoding(this);
 5869     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5870   %}
 5871   ins_pipe( pipe_slow );
 5872 %}
 5873 
 5874 // Shorts/Chars vector sub
 5875 instruct vsubS(vec dst, vec src) %{
 5876   predicate(UseAVX == 0);
 5877   match(Set dst (SubVS dst src));
 5878   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5879   ins_encode %{
 5880     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5881   %}
 5882   ins_pipe( pipe_slow );
 5883 %}
 5884 
 5885 
 5886 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5887   predicate(UseAVX > 0);
 5888   match(Set dst (SubVS src1 src2));
 5889   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5890   ins_encode %{
 5891     int vlen_enc = vector_length_encoding(this);
 5892     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5893   %}
 5894   ins_pipe( pipe_slow );
 5895 %}
 5896 
 5897 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5898   predicate((UseAVX > 0) &&
 5899             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5900   match(Set dst (SubVS src (LoadVector mem)));
 5901   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5902   ins_encode %{
 5903     int vlen_enc = vector_length_encoding(this);
 5904     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5905   %}
 5906   ins_pipe( pipe_slow );
 5907 %}
 5908 
 5909 // Integers vector sub
 5910 instruct vsubI(vec dst, vec src) %{
 5911   predicate(UseAVX == 0);
 5912   match(Set dst (SubVI dst src));
 5913   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5914   ins_encode %{
 5915     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5916   %}
 5917   ins_pipe( pipe_slow );
 5918 %}
 5919 
 5920 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5921   predicate(UseAVX > 0);
 5922   match(Set dst (SubVI src1 src2));
 5923   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5924   ins_encode %{
 5925     int vlen_enc = vector_length_encoding(this);
 5926     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5927   %}
 5928   ins_pipe( pipe_slow );
 5929 %}
 5930 
 5931 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5932   predicate((UseAVX > 0) &&
 5933             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5934   match(Set dst (SubVI src (LoadVector mem)));
 5935   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5936   ins_encode %{
 5937     int vlen_enc = vector_length_encoding(this);
 5938     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5939   %}
 5940   ins_pipe( pipe_slow );
 5941 %}
 5942 
 5943 // Longs vector sub
 5944 instruct vsubL(vec dst, vec src) %{
 5945   predicate(UseAVX == 0);
 5946   match(Set dst (SubVL dst src));
 5947   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5948   ins_encode %{
 5949     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5950   %}
 5951   ins_pipe( pipe_slow );
 5952 %}
 5953 
 5954 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5955   predicate(UseAVX > 0);
 5956   match(Set dst (SubVL src1 src2));
 5957   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5958   ins_encode %{
 5959     int vlen_enc = vector_length_encoding(this);
 5960     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5961   %}
 5962   ins_pipe( pipe_slow );
 5963 %}
 5964 
 5965 
 5966 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5967   predicate((UseAVX > 0) &&
 5968             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5969   match(Set dst (SubVL src (LoadVector mem)));
 5970   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5971   ins_encode %{
 5972     int vlen_enc = vector_length_encoding(this);
 5973     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5974   %}
 5975   ins_pipe( pipe_slow );
 5976 %}
 5977 
 5978 // Floats vector sub
 5979 instruct vsubF(vec dst, vec src) %{
 5980   predicate(UseAVX == 0);
 5981   match(Set dst (SubVF dst src));
 5982   format %{ "subps   $dst,$src\t! sub packedF" %}
 5983   ins_encode %{
 5984     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5985   %}
 5986   ins_pipe( pipe_slow );
 5987 %}
 5988 
 5989 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5990   predicate(UseAVX > 0);
 5991   match(Set dst (SubVF src1 src2));
 5992   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5993   ins_encode %{
 5994     int vlen_enc = vector_length_encoding(this);
 5995     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5996   %}
 5997   ins_pipe( pipe_slow );
 5998 %}
 5999 
 6000 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 6001   predicate((UseAVX > 0) &&
 6002             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6003   match(Set dst (SubVF src (LoadVector mem)));
 6004   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 6005   ins_encode %{
 6006     int vlen_enc = vector_length_encoding(this);
 6007     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6008   %}
 6009   ins_pipe( pipe_slow );
 6010 %}
 6011 
 6012 // Doubles vector sub
 6013 instruct vsubD(vec dst, vec src) %{
 6014   predicate(UseAVX == 0);
 6015   match(Set dst (SubVD dst src));
 6016   format %{ "subpd   $dst,$src\t! sub packedD" %}
 6017   ins_encode %{
 6018     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 6019   %}
 6020   ins_pipe( pipe_slow );
 6021 %}
 6022 
 6023 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 6024   predicate(UseAVX > 0);
 6025   match(Set dst (SubVD src1 src2));
 6026   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 6027   ins_encode %{
 6028     int vlen_enc = vector_length_encoding(this);
 6029     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6030   %}
 6031   ins_pipe( pipe_slow );
 6032 %}
 6033 
 6034 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 6035   predicate((UseAVX > 0) &&
 6036             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6037   match(Set dst (SubVD src (LoadVector mem)));
 6038   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6039   ins_encode %{
 6040     int vlen_enc = vector_length_encoding(this);
 6041     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6042   %}
 6043   ins_pipe( pipe_slow );
 6044 %}
 6045 
 6046 // --------------------------------- MUL --------------------------------------
 6047 
 6048 // Byte vector mul
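      // x86 has no packed byte multiply, so bytes are widened to 16-bit words,
      // multiplied with (v)pmullw, and the low byte of each product is recombined.
      // The 8-byte case sign-extends both operands; the wider cases multiply the
      // even and odd byte positions separately and merge the results.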
 6049 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6050   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6051   match(Set dst (MulVB src1 src2));
 6052   effect(TEMP dst, TEMP xtmp);
 6053   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6054   ins_encode %{
 6055     assert(UseSSE > 3, "required");
 6056     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6057     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6058     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6059     __ psllw($dst$$XMMRegister, 8);
 6060     __ psrlw($dst$$XMMRegister, 8);
 6061     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6062   %}
 6063   ins_pipe( pipe_slow );
 6064 %}
 6065 
 6066 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6067   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6068   match(Set dst (MulVB src1 src2));
 6069   effect(TEMP dst, TEMP xtmp);
 6070   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6071   ins_encode %{
 6072     assert(UseSSE > 3, "required");
 6073     // Odd-index elements
 6074     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6075     __ psrlw($dst$$XMMRegister, 8);
 6076     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6077     __ psrlw($xtmp$$XMMRegister, 8);
 6078     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6079     __ psllw($dst$$XMMRegister, 8);
 6080     // Even-index elements
 6081     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6082     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6083     __ psllw($xtmp$$XMMRegister, 8);
 6084     __ psrlw($xtmp$$XMMRegister, 8);
 6085     // Combine
 6086     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6087   %}
 6088   ins_pipe( pipe_slow );
 6089 %}
 6090 
 6091 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6092   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6093   match(Set dst (MulVB src1 src2));
 6094   effect(TEMP xtmp1, TEMP xtmp2);
 6095   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6096   ins_encode %{
 6097     int vlen_enc = vector_length_encoding(this);
 6098     // Odd-index elements
 6099     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6100     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6101     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6102     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6103     // Even-index elements
 6104     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6105     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6106     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6107     // Combine
 6108     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6109   %}
 6110   ins_pipe( pipe_slow );
 6111 %}
 6112 
 6113 // Shorts/Chars vector mul
 6114 instruct vmulS(vec dst, vec src) %{
 6115   predicate(UseAVX == 0);
 6116   match(Set dst (MulVS dst src));
 6117   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6118   ins_encode %{
 6119     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6120   %}
 6121   ins_pipe( pipe_slow );
 6122 %}
 6123 
 6124 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6125   predicate(UseAVX > 0);
 6126   match(Set dst (MulVS src1 src2));
 6127   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6128   ins_encode %{
 6129     int vlen_enc = vector_length_encoding(this);
 6130     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6131   %}
 6132   ins_pipe( pipe_slow );
 6133 %}
 6134 
 6135 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6136   predicate((UseAVX > 0) &&
 6137             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6138   match(Set dst (MulVS src (LoadVector mem)));
 6139   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6140   ins_encode %{
 6141     int vlen_enc = vector_length_encoding(this);
 6142     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6143   %}
 6144   ins_pipe( pipe_slow );
 6145 %}
 6146 
 6147 // Integers vector mul
 6148 instruct vmulI(vec dst, vec src) %{
 6149   predicate(UseAVX == 0);
 6150   match(Set dst (MulVI dst src));
 6151   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6152   ins_encode %{
 6153     assert(UseSSE > 3, "required");
 6154     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6155   %}
 6156   ins_pipe( pipe_slow );
 6157 %}
 6158 
 6159 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6160   predicate(UseAVX > 0);
 6161   match(Set dst (MulVI src1 src2));
 6162   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6163   ins_encode %{
 6164     int vlen_enc = vector_length_encoding(this);
 6165     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6166   %}
 6167   ins_pipe( pipe_slow );
 6168 %}
 6169 
 6170 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6171   predicate((UseAVX > 0) &&
 6172             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6173   match(Set dst (MulVI src (LoadVector mem)));
 6174   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6175   ins_encode %{
 6176     int vlen_enc = vector_length_encoding(this);
 6177     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6178   %}
 6179   ins_pipe( pipe_slow );
 6180 %}
 6181 
 6182 // Longs vector mul
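// Without AVX512DQ's vpmullq, the vmulL/vmulL_reg variants below split each
// 64-bit lane into 32-bit halves. Writing a = a_hi:a_lo and b = b_hi:b_lo,
//
//   a*b (mod 2^64) = a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32)
//
// so the cross terms are formed with 32-bit multiplies, summed and shifted,
// and added to the full 64-bit a_lo*b_lo product produced by pmuludq.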
 6183 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6184   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6185              VM_Version::supports_avx512dq()) ||
 6186             VM_Version::supports_avx512vldq());
 6187   match(Set dst (MulVL src1 src2));
 6188   ins_cost(500);
 6189   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6190   ins_encode %{
 6191     assert(UseAVX > 2, "required");
 6192     int vlen_enc = vector_length_encoding(this);
 6193     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6194   %}
 6195   ins_pipe( pipe_slow );
 6196 %}
 6197 
 6198 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6199   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6200              VM_Version::supports_avx512dq()) ||
 6201             (Matcher::vector_length_in_bytes(n) > 8 &&
 6202              VM_Version::supports_avx512vldq()));
 6203   match(Set dst (MulVL src (LoadVector mem)));
 6204   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6205   ins_cost(500);
 6206   ins_encode %{
 6207     assert(UseAVX > 2, "required");
 6208     int vlen_enc = vector_length_encoding(this);
 6209     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6210   %}
 6211   ins_pipe( pipe_slow );
 6212 %}
 6213 
 6214 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6215   predicate(UseAVX == 0);
 6216   match(Set dst (MulVL src1 src2));
 6217   ins_cost(500);
 6218   effect(TEMP dst, TEMP xtmp);
 6219   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6220   ins_encode %{
 6221     assert(VM_Version::supports_sse4_1(), "required");
    // Get the cross products (lo*hi and hi*lo); only their low 32 bits are needed
 6223     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6224     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6225     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6226     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6227     __ psllq($dst$$XMMRegister, 32);
 6228     // Get the lo-lo products
 6229     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6230     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6231     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6232   %}
 6233   ins_pipe( pipe_slow );
 6234 %}
 6235 
 6236 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6237   predicate(UseAVX > 0 &&
 6238             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6239               !VM_Version::supports_avx512dq()) ||
 6240              (Matcher::vector_length_in_bytes(n) < 64 &&
 6241               !VM_Version::supports_avx512vldq())));
 6242   match(Set dst (MulVL src1 src2));
 6243   effect(TEMP xtmp1, TEMP xtmp2);
 6244   ins_cost(500);
 6245   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6246   ins_encode %{
 6247     int vlen_enc = vector_length_encoding(this);
    // Get the cross products (lo*hi and hi*lo); only their low 32 bits are needed
 6249     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6250     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6251     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6252     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6253     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6254     // Get the lo-lo products
 6255     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6256     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6257   %}
 6258   ins_pipe( pipe_slow );
 6259 %}
 6260 
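// vpmuludq/vpmuldq multiply the low 32 bits of each 64-bit lane into a full
// 64-bit product (unsigned resp. signed). The two patterns below appear to be
// selected when MulVL's inputs are known zero-/sign-extended 32-bit values
// (has_uint_inputs()/has_int_inputs()), so that single multiply already yields
// the exact result; hence the lower ins_cost.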
 6261 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6262   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6263   match(Set dst (MulVL src1 src2));
 6264   ins_cost(100);
 6265   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6266   ins_encode %{
 6267     int vlen_enc = vector_length_encoding(this);
 6268     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6269   %}
 6270   ins_pipe( pipe_slow );
 6271 %}
 6272 
 6273 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6274   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6275   match(Set dst (MulVL src1 src2));
 6276   ins_cost(100);
 6277   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6278   ins_encode %{
 6279     int vlen_enc = vector_length_encoding(this);
 6280     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6281   %}
 6282   ins_pipe( pipe_slow );
 6283 %}
 6284 
 6285 // Floats vector mul
 6286 instruct vmulF(vec dst, vec src) %{
 6287   predicate(UseAVX == 0);
 6288   match(Set dst (MulVF dst src));
 6289   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6290   ins_encode %{
 6291     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6292   %}
 6293   ins_pipe( pipe_slow );
 6294 %}
 6295 
 6296 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6297   predicate(UseAVX > 0);
 6298   match(Set dst (MulVF src1 src2));
 6299   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6300   ins_encode %{
 6301     int vlen_enc = vector_length_encoding(this);
 6302     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6303   %}
 6304   ins_pipe( pipe_slow );
 6305 %}
 6306 
 6307 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6308   predicate((UseAVX > 0) &&
 6309             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6310   match(Set dst (MulVF src (LoadVector mem)));
 6311   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6312   ins_encode %{
 6313     int vlen_enc = vector_length_encoding(this);
 6314     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6315   %}
 6316   ins_pipe( pipe_slow );
 6317 %}
 6318 
 6319 // Doubles vector mul
 6320 instruct vmulD(vec dst, vec src) %{
 6321   predicate(UseAVX == 0);
 6322   match(Set dst (MulVD dst src));
 6323   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6324   ins_encode %{
 6325     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6326   %}
 6327   ins_pipe( pipe_slow );
 6328 %}
 6329 
 6330 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6331   predicate(UseAVX > 0);
 6332   match(Set dst (MulVD src1 src2));
 6333   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6334   ins_encode %{
 6335     int vlen_enc = vector_length_encoding(this);
 6336     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6337   %}
 6338   ins_pipe( pipe_slow );
 6339 %}
 6340 
 6341 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6342   predicate((UseAVX > 0) &&
 6343             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6344   match(Set dst (MulVD src (LoadVector mem)));
 6345   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6346   ins_encode %{
 6347     int vlen_enc = vector_length_encoding(this);
 6348     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6349   %}
 6350   ins_pipe( pipe_slow );
 6351 %}
 6352 
 6353 // --------------------------------- DIV --------------------------------------
 6354 
 6355 // Floats vector div
 6356 instruct vdivF(vec dst, vec src) %{
 6357   predicate(UseAVX == 0);
 6358   match(Set dst (DivVF dst src));
 6359   format %{ "divps   $dst,$src\t! div packedF" %}
 6360   ins_encode %{
 6361     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6362   %}
 6363   ins_pipe( pipe_slow );
 6364 %}
 6365 
 6366 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6367   predicate(UseAVX > 0);
 6368   match(Set dst (DivVF src1 src2));
 6369   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6370   ins_encode %{
 6371     int vlen_enc = vector_length_encoding(this);
 6372     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6373   %}
 6374   ins_pipe( pipe_slow );
 6375 %}
 6376 
 6377 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6378   predicate((UseAVX > 0) &&
 6379             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6380   match(Set dst (DivVF src (LoadVector mem)));
 6381   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6382   ins_encode %{
 6383     int vlen_enc = vector_length_encoding(this);
 6384     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6385   %}
 6386   ins_pipe( pipe_slow );
 6387 %}
 6388 
 6389 // Doubles vector div
 6390 instruct vdivD(vec dst, vec src) %{
 6391   predicate(UseAVX == 0);
 6392   match(Set dst (DivVD dst src));
 6393   format %{ "divpd   $dst,$src\t! div packedD" %}
 6394   ins_encode %{
 6395     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6396   %}
 6397   ins_pipe( pipe_slow );
 6398 %}
 6399 
 6400 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6401   predicate(UseAVX > 0);
 6402   match(Set dst (DivVD src1 src2));
 6403   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6404   ins_encode %{
 6405     int vlen_enc = vector_length_encoding(this);
 6406     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6407   %}
 6408   ins_pipe( pipe_slow );
 6409 %}
 6410 
 6411 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6412   predicate((UseAVX > 0) &&
 6413             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6414   match(Set dst (DivVD src (LoadVector mem)));
 6415   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6416   ins_encode %{
 6417     int vlen_enc = vector_length_encoding(this);
 6418     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6419   %}
 6420   ins_pipe( pipe_slow );
 6421 %}
 6422 
 6423 // ------------------------------ MinMax ---------------------------------------
 6424 
 6425 // Byte, Short, Int vector Min/Max
 6426 instruct minmax_reg_sse(vec dst, vec src) %{
 6427   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6428             UseAVX == 0);
 6429   match(Set dst (MinV dst src));
 6430   match(Set dst (MaxV dst src));
 6431   format %{ "vector_minmax  $dst,$src\t!  " %}
 6432   ins_encode %{
 6433     assert(UseSSE >= 4, "required");
 6434 
 6435     int opcode = this->ideal_Opcode();
 6436     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6437     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6438   %}
 6439   ins_pipe( pipe_slow );
 6440 %}
 6441 
 6442 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6443   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6444             UseAVX > 0);
 6445   match(Set dst (MinV src1 src2));
 6446   match(Set dst (MaxV src1 src2));
 6447   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6448   ins_encode %{
 6449     int opcode = this->ideal_Opcode();
 6450     int vlen_enc = vector_length_encoding(this);
 6451     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6452 
 6453     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6454   %}
 6455   ins_pipe( pipe_slow );
 6456 %}
 6457 
 6458 // Long vector Min/Max
 6459 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6460   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6461             UseAVX == 0);
 6462   match(Set dst (MinV dst src));
 6463   match(Set dst (MaxV src dst));
 6464   effect(TEMP dst, TEMP tmp);
 6465   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6466   ins_encode %{
 6467     assert(UseSSE >= 4, "required");
 6468 
 6469     int opcode = this->ideal_Opcode();
 6470     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6471     assert(elem_bt == T_LONG, "sanity");
 6472 
 6473     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6474   %}
 6475   ins_pipe( pipe_slow );
 6476 %}
 6477 
 6478 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6479   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6480             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6481   match(Set dst (MinV src1 src2));
 6482   match(Set dst (MaxV src1 src2));
 6483   effect(TEMP dst);
 6484   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6485   ins_encode %{
 6486     int vlen_enc = vector_length_encoding(this);
 6487     int opcode = this->ideal_Opcode();
 6488     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6489     assert(elem_bt == T_LONG, "sanity");
 6490 
 6491     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6492   %}
 6493   ins_pipe( pipe_slow );
 6494 %}
 6495 
 6496 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6497   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6498             Matcher::vector_element_basic_type(n) == T_LONG);
 6499   match(Set dst (MinV src1 src2));
 6500   match(Set dst (MaxV src1 src2));
 6501   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6502   ins_encode %{
 6503     assert(UseAVX > 2, "required");
 6504 
 6505     int vlen_enc = vector_length_encoding(this);
 6506     int opcode = this->ideal_Opcode();
 6507     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6508     assert(elem_bt == T_LONG, "sanity");
 6509 
 6510     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6511   %}
 6512   ins_pipe( pipe_slow );
 6513 %}
 6514 
 6515 // Float/Double vector Min/Max
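// Java's Math.min/max must propagate NaN and order -0.0 below +0.0, which a
// bare minps/maxps does not guarantee, so the pre-AVX10.2 patterns below need
// extra blend steps and temporaries; with AVX10.2 the new VMINMAX forms are
// assumed to provide the required semantics directly.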
 6516 instruct minmaxFP_avx10_reg(vec dst, vec a, vec b) %{
 6517   predicate(VM_Version::supports_avx10_2() &&
 6518             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6519   match(Set dst (MinV a b));
 6520   match(Set dst (MaxV a b));
 6521   format %{ "vector_minmaxFP  $dst, $a, $b" %}
 6522   ins_encode %{
 6523     int vlen_enc = vector_length_encoding(this);
 6524     int opcode = this->ideal_Opcode();
 6525     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6526     __ vminmax_fp(opcode, elem_bt, $dst$$XMMRegister, k0, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6527   %}
 6528   ins_pipe( pipe_slow );
 6529 %}
 6530 
 6531 // Float/Double vector Min/Max
 6532 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6533   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) <= 32 &&
 6534             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6535             UseAVX > 0);
 6536   match(Set dst (MinV a b));
 6537   match(Set dst (MaxV a b));
 6538   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6539   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6540   ins_encode %{
 6541     assert(UseAVX > 0, "required");
 6542 
 6543     int opcode = this->ideal_Opcode();
 6544     int vlen_enc = vector_length_encoding(this);
 6545     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6546 
 6547     __ vminmax_fp(opcode, elem_bt,
 6548                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6549                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6550   %}
 6551   ins_pipe( pipe_slow );
 6552 %}
 6553 
 6554 instruct evminmaxFP_reg_evex(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6555   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) == 64 &&
 6556             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6557   match(Set dst (MinV a b));
 6558   match(Set dst (MaxV a b));
 6559   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6560   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6561   ins_encode %{
 6562     assert(UseAVX > 2, "required");
 6563 
 6564     int opcode = this->ideal_Opcode();
 6565     int vlen_enc = vector_length_encoding(this);
 6566     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6567 
 6568     __ evminmax_fp(opcode, elem_bt,
 6569                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6570                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6571   %}
 6572   ins_pipe( pipe_slow );
 6573 %}
 6574 
 6575 // ------------------------------ Unsigned vector Min/Max ----------------------
 6576 
 6577 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6578   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6579   match(Set dst (UMinV a b));
 6580   match(Set dst (UMaxV a b));
 6581   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6582   ins_encode %{
 6583     int opcode = this->ideal_Opcode();
 6584     int vlen_enc = vector_length_encoding(this);
 6585     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6586     assert(is_integral_type(elem_bt), "");
 6587     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6588   %}
 6589   ins_pipe( pipe_slow );
 6590 %}
 6591 
 6592 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6593   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6594   match(Set dst (UMinV a (LoadVector b)));
 6595   match(Set dst (UMaxV a (LoadVector b)));
 6596   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6597   ins_encode %{
 6598     int opcode = this->ideal_Opcode();
 6599     int vlen_enc = vector_length_encoding(this);
 6600     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6601     assert(is_integral_type(elem_bt), "");
 6602     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6603   %}
 6604   ins_pipe( pipe_slow );
 6605 %}
 6606 
 6607 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6608   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6609   match(Set dst (UMinV a b));
 6610   match(Set dst (UMaxV a b));
 6611   effect(TEMP xtmp1, TEMP xtmp2);
 6612   format %{ "vector_uminmaxq $dst,$a,$b\t! using xtmp1 and xtmp2 as TEMP" %}
 6613   ins_encode %{
 6614     int opcode = this->ideal_Opcode();
 6615     int vlen_enc = vector_length_encoding(this);
 6616     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6617   %}
 6618   ins_pipe( pipe_slow );
 6619 %}
 6620 
 6621 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6622   match(Set dst (UMinV (Binary dst src2) mask));
 6623   match(Set dst (UMaxV (Binary dst src2) mask));
 6624   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6625   ins_encode %{
 6626     int vlen_enc = vector_length_encoding(this);
 6627     BasicType bt = Matcher::vector_element_basic_type(this);
 6628     int opc = this->ideal_Opcode();
 6629     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6630                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6631   %}
 6632   ins_pipe( pipe_slow );
 6633 %}
 6634 
 6635 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6636   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6637   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6638   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6639   ins_encode %{
 6640     int vlen_enc = vector_length_encoding(this);
 6641     BasicType bt = Matcher::vector_element_basic_type(this);
 6642     int opc = this->ideal_Opcode();
 6643     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6644                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6645   %}
 6646   ins_pipe( pipe_slow );
 6647 %}
 6648 
 6649 // --------------------------------- Signum/CopySign ---------------------------
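// For reference, Math.signum(x) is NaN for NaN inputs, +/-0.0 for +/-0.0, and
// otherwise +/-1.0 with the sign of x; the $zero and $one operands below supply
// the constants used to build that result.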
 6650 
 6651 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6652   match(Set dst (SignumF dst (Binary zero one)));
 6653   effect(KILL cr);
 6654   format %{ "signumF $dst, $dst" %}
 6655   ins_encode %{
 6656     int opcode = this->ideal_Opcode();
 6657     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6658   %}
 6659   ins_pipe( pipe_slow );
 6660 %}
 6661 
 6662 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6663   match(Set dst (SignumD dst (Binary zero one)));
 6664   effect(KILL cr);
 6665   format %{ "signumD $dst, $dst" %}
 6666   ins_encode %{
 6667     int opcode = this->ideal_Opcode();
 6668     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6669   %}
 6670   ins_pipe( pipe_slow );
 6671 %}
 6672 
 6673 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6674   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6675   match(Set dst (SignumVF src (Binary zero one)));
 6676   match(Set dst (SignumVD src (Binary zero one)));
 6677   effect(TEMP dst, TEMP xtmp1);
 6678   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6679   ins_encode %{
 6680     int opcode = this->ideal_Opcode();
 6681     int vec_enc = vector_length_encoding(this);
 6682     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6683                          $xtmp1$$XMMRegister, vec_enc);
 6684   %}
 6685   ins_pipe( pipe_slow );
 6686 %}
 6687 
 6688 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6689   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6690   match(Set dst (SignumVF src (Binary zero one)));
 6691   match(Set dst (SignumVD src (Binary zero one)));
 6692   effect(TEMP dst, TEMP ktmp1);
 6693   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6694   ins_encode %{
 6695     int opcode = this->ideal_Opcode();
 6696     int vec_enc = vector_length_encoding(this);
 6697     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6698                           $ktmp1$$KRegister, vec_enc);
 6699   %}
 6700   ins_pipe( pipe_slow );
 6701 %}
 6702 
 6703 // ---------------------------------------
// For copySign use 0xE4 as the truth-table immediate (imm8) for vpternlog
 6705 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6706 // C (xmm2) is set to 0x7FFFFFFF
 6707 // Wherever xmm2 is 0, we want to pick from B (sign)
 6708 // Wherever xmm2 is 1, we want to pick from A (src)
 6709 //
 6710 // A B C Result
 6711 // 0 0 0 0
 6712 // 0 0 1 0
 6713 // 0 1 0 1
 6714 // 0 1 1 0
 6715 // 1 0 0 0
 6716 // 1 0 1 1
 6717 // 1 1 0 1
 6718 // 1 1 1 1
 6719 //
// Result going from the high bit to the low bit is 0b11100100 = 0xE4
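//
// A minimal C sketch (illustrative only) deriving the same immediate from the
// bitwise select "C ? A : B":
//
//   uint8_t imm = 0;
//   for (int abc = 0; abc < 8; abc++) {
//     int A = (abc >> 2) & 1, B = (abc >> 1) & 1, C = abc & 1;
//     imm |= (C ? A : B) << abc;   // result bit for input pattern A,B,C
//   }
//   // imm == 0xE4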
 6721 // ---------------------------------------
 6722 
 6723 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6724   match(Set dst (CopySignF dst src));
 6725   effect(TEMP tmp1, TEMP tmp2);
 6726   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6727   ins_encode %{
 6728     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6729     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6730     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6731   %}
 6732   ins_pipe( pipe_slow );
 6733 %}
 6734 
 6735 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6736   match(Set dst (CopySignD dst (Binary src zero)));
 6737   ins_cost(100);
 6738   effect(TEMP tmp1, TEMP tmp2);
 6739   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6740   ins_encode %{
 6741     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6742     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6743     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6744   %}
 6745   ins_pipe( pipe_slow );
 6746 %}
 6747 
 6748 //----------------------------- CompressBits/ExpandBits ------------------------
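// These map directly to the BMI2 pext/pdep instructions. A small worked example
// of the bit semantics (values chosen purely for illustration):
//
//   pext(src = 0b1100, mask = 0b1010) == 0b10    // bits of src at set mask
//                                                //   positions, packed to LSB
//   pdep(src = 0b10,   mask = 0b1010) == 0b1000  // low bits of src scattered
//                                                //   to the set mask positions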
 6749 
 6750 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6751   predicate(n->bottom_type()->isa_int());
 6752   match(Set dst (CompressBits src mask));
 6753   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6754   ins_encode %{
 6755     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6756   %}
 6757   ins_pipe( pipe_slow );
 6758 %}
 6759 
 6760 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6761   predicate(n->bottom_type()->isa_int());
 6762   match(Set dst (ExpandBits src mask));
 6763   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6764   ins_encode %{
 6765     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6766   %}
 6767   ins_pipe( pipe_slow );
 6768 %}
 6769 
 6770 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6771   predicate(n->bottom_type()->isa_int());
 6772   match(Set dst (CompressBits src (LoadI mask)));
 6773   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6774   ins_encode %{
 6775     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6776   %}
 6777   ins_pipe( pipe_slow );
 6778 %}
 6779 
 6780 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6781   predicate(n->bottom_type()->isa_int());
 6782   match(Set dst (ExpandBits src (LoadI mask)));
 6783   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6784   ins_encode %{
 6785     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6786   %}
 6787   ins_pipe( pipe_slow );
 6788 %}
 6789 
 6790 // --------------------------------- Sqrt --------------------------------------
 6791 
 6792 instruct vsqrtF_reg(vec dst, vec src) %{
 6793   match(Set dst (SqrtVF src));
 6794   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6795   ins_encode %{
 6796     assert(UseAVX > 0, "required");
 6797     int vlen_enc = vector_length_encoding(this);
 6798     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6799   %}
 6800   ins_pipe( pipe_slow );
 6801 %}
 6802 
 6803 instruct vsqrtF_mem(vec dst, memory mem) %{
 6804   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6805   match(Set dst (SqrtVF (LoadVector mem)));
 6806   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6807   ins_encode %{
 6808     assert(UseAVX > 0, "required");
 6809     int vlen_enc = vector_length_encoding(this);
 6810     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6811   %}
 6812   ins_pipe( pipe_slow );
 6813 %}
 6814 
// Doubles vector sqrt
 6816 instruct vsqrtD_reg(vec dst, vec src) %{
 6817   match(Set dst (SqrtVD src));
 6818   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6819   ins_encode %{
 6820     assert(UseAVX > 0, "required");
 6821     int vlen_enc = vector_length_encoding(this);
 6822     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6823   %}
 6824   ins_pipe( pipe_slow );
 6825 %}
 6826 
 6827 instruct vsqrtD_mem(vec dst, memory mem) %{
 6828   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6829   match(Set dst (SqrtVD (LoadVector mem)));
 6830   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6831   ins_encode %{
 6832     assert(UseAVX > 0, "required");
 6833     int vlen_enc = vector_length_encoding(this);
 6834     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6835   %}
 6836   ins_pipe( pipe_slow );
 6837 %}
 6838 
 6839 // ------------------------------ Shift ---------------------------------------
 6840 
 6841 // Left and right shift count vectors are the same on x86
 6842 // (only lowest bits of xmm reg are used for count).
 6843 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6844   match(Set dst (LShiftCntV cnt));
 6845   match(Set dst (RShiftCntV cnt));
 6846   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6847   ins_encode %{
 6848     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6849   %}
 6850   ins_pipe( pipe_slow );
 6851 %}
 6852 
 6853 // Byte vector shift
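// x86 has no packed byte shift, so the patterns below widen the bytes to
// 16-bit lanes (zero-extended for logical right shifts, sign-extended
// otherwise), shift as words, mask the results back to the low byte of each
// word, and pack the bytes together again.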
 6854 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6855   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6856   match(Set dst ( LShiftVB src shift));
 6857   match(Set dst ( RShiftVB src shift));
 6858   match(Set dst (URShiftVB src shift));
 6859   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6860   format %{"vector_byte_shift $dst,$src,$shift" %}
 6861   ins_encode %{
 6862     assert(UseSSE > 3, "required");
 6863     int opcode = this->ideal_Opcode();
 6864     bool sign = (opcode != Op_URShiftVB);
 6865     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6866     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6867     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6868     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6869     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6870   %}
 6871   ins_pipe( pipe_slow );
 6872 %}
 6873 
 6874 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6875   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6876             UseAVX <= 1);
 6877   match(Set dst ( LShiftVB src shift));
 6878   match(Set dst ( RShiftVB src shift));
 6879   match(Set dst (URShiftVB src shift));
 6880   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6881   format %{"vector_byte_shift $dst,$src,$shift" %}
 6882   ins_encode %{
 6883     assert(UseSSE > 3, "required");
 6884     int opcode = this->ideal_Opcode();
 6885     bool sign = (opcode != Op_URShiftVB);
 6886     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6887     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6888     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6889     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6890     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6891     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6892     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6893     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6894     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6895   %}
 6896   ins_pipe( pipe_slow );
 6897 %}
 6898 
 6899 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6900   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6901             UseAVX > 1);
 6902   match(Set dst ( LShiftVB src shift));
 6903   match(Set dst ( RShiftVB src shift));
 6904   match(Set dst (URShiftVB src shift));
 6905   effect(TEMP dst, TEMP tmp);
 6906   format %{"vector_byte_shift $dst,$src,$shift" %}
 6907   ins_encode %{
 6908     int opcode = this->ideal_Opcode();
 6909     bool sign = (opcode != Op_URShiftVB);
 6910     int vlen_enc = Assembler::AVX_256bit;
 6911     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6912     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6913     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6914     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6915     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6916   %}
 6917   ins_pipe( pipe_slow );
 6918 %}
 6919 
 6920 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6921   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6922   match(Set dst ( LShiftVB src shift));
 6923   match(Set dst ( RShiftVB src shift));
 6924   match(Set dst (URShiftVB src shift));
 6925   effect(TEMP dst, TEMP tmp);
 6926   format %{"vector_byte_shift $dst,$src,$shift" %}
 6927   ins_encode %{
 6928     assert(UseAVX > 1, "required");
 6929     int opcode = this->ideal_Opcode();
 6930     bool sign = (opcode != Op_URShiftVB);
 6931     int vlen_enc = Assembler::AVX_256bit;
 6932     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6933     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6934     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6935     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6936     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6937     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6938     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6939     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6940     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6941   %}
 6942   ins_pipe( pipe_slow );
 6943 %}
 6944 
 6945 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6946   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6947   match(Set dst ( LShiftVB src shift));
 6948   match(Set dst  (RShiftVB src shift));
 6949   match(Set dst (URShiftVB src shift));
 6950   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6951   format %{"vector_byte_shift $dst,$src,$shift" %}
 6952   ins_encode %{
 6953     assert(UseAVX > 2, "required");
 6954     int opcode = this->ideal_Opcode();
 6955     bool sign = (opcode != Op_URShiftVB);
 6956     int vlen_enc = Assembler::AVX_512bit;
 6957     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6958     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6959     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6960     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6961     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6962     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6963     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6964     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6965     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6966     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6967     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6968     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6969   %}
 6970   ins_pipe( pipe_slow );
 6971 %}
 6972 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data because Java code converts a short value into an int with
// sign extension before shifting. But char vectors are fine since chars are
// unsigned values.
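// For example, with short s = -1 (0xFFFF) and a shift of 2, Java evaluates
// (short)(s >>> 2) on the sign-extended int 0xFFFFFFFF, giving 0x3FFFFFFF,
// whose low 16 bits are 0xFFFF (-1 again), whereas a 16-bit logical shift
// would produce 0x3FFF (16383).
//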
// Shorts/Chars vector shift
 6978 instruct vshiftS(vec dst, vec src, vec shift) %{
 6979   predicate(!n->as_ShiftV()->is_var_shift());
 6980   match(Set dst ( LShiftVS src shift));
 6981   match(Set dst ( RShiftVS src shift));
 6982   match(Set dst (URShiftVS src shift));
 6983   effect(TEMP dst, USE src, USE shift);
 6984   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6985   ins_encode %{
 6986     int opcode = this->ideal_Opcode();
 6987     if (UseAVX > 0) {
 6988       int vlen_enc = vector_length_encoding(this);
 6989       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6990     } else {
 6991       int vlen = Matcher::vector_length(this);
 6992       if (vlen == 2) {
 6993         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6994         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6995       } else if (vlen == 4) {
 6996         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6997         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6998       } else {
 6999         assert (vlen == 8, "sanity");
 7000         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7001         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7002       }
 7003     }
 7004   %}
 7005   ins_pipe( pipe_slow );
 7006 %}
 7007 
// Integers vector shift
 7009 instruct vshiftI(vec dst, vec src, vec shift) %{
 7010   predicate(!n->as_ShiftV()->is_var_shift());
 7011   match(Set dst ( LShiftVI src shift));
 7012   match(Set dst ( RShiftVI src shift));
 7013   match(Set dst (URShiftVI src shift));
 7014   effect(TEMP dst, USE src, USE shift);
 7015   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 7016   ins_encode %{
 7017     int opcode = this->ideal_Opcode();
 7018     if (UseAVX > 0) {
 7019       int vlen_enc = vector_length_encoding(this);
 7020       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7021     } else {
 7022       int vlen = Matcher::vector_length(this);
 7023       if (vlen == 2) {
 7024         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7025         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7026       } else {
 7027         assert(vlen == 4, "sanity");
 7028         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7029         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7030       }
 7031     }
 7032   %}
 7033   ins_pipe( pipe_slow );
 7034 %}
 7035 
// Integers vector constant shift
 7037 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 7038   match(Set dst (LShiftVI src (LShiftCntV shift)));
 7039   match(Set dst (RShiftVI src (RShiftCntV shift)));
 7040   match(Set dst (URShiftVI src (RShiftCntV shift)));
 7041   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 7042   ins_encode %{
 7043     int opcode = this->ideal_Opcode();
 7044     if (UseAVX > 0) {
 7045       int vector_len = vector_length_encoding(this);
 7046       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7047     } else {
 7048       int vlen = Matcher::vector_length(this);
 7049       if (vlen == 2) {
 7050         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7051         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7052       } else {
 7053         assert(vlen == 4, "sanity");
 7054         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7055         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7056       }
 7057     }
 7058   %}
 7059   ins_pipe( pipe_slow );
 7060 %}
 7061 
 7062 // Longs vector shift
 7063 instruct vshiftL(vec dst, vec src, vec shift) %{
 7064   predicate(!n->as_ShiftV()->is_var_shift());
 7065   match(Set dst ( LShiftVL src shift));
 7066   match(Set dst (URShiftVL src shift));
 7067   effect(TEMP dst, USE src, USE shift);
 7068   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 7069   ins_encode %{
 7070     int opcode = this->ideal_Opcode();
 7071     if (UseAVX > 0) {
 7072       int vlen_enc = vector_length_encoding(this);
 7073       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7074     } else {
 7075       assert(Matcher::vector_length(this) == 2, "");
 7076       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7077       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7078     }
 7079   %}
 7080   ins_pipe( pipe_slow );
 7081 %}
 7082 
 7083 // Longs vector constant shift
 7084 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 7085   match(Set dst (LShiftVL src (LShiftCntV shift)));
 7086   match(Set dst (URShiftVL src (RShiftCntV shift)));
 7087   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 7088   ins_encode %{
 7089     int opcode = this->ideal_Opcode();
 7090     if (UseAVX > 0) {
 7091       int vector_len = vector_length_encoding(this);
 7092       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7093     } else {
 7094       assert(Matcher::vector_length(this) == 2, "");
 7095       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7096       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7097     }
 7098   %}
 7099   ins_pipe( pipe_slow );
 7100 %}
 7101 
// ------------------- Arithmetic Right Shift ---------------------------
 7103 // Long vector arithmetic right shift
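// Before AVX-512 there is no vpsraq, so the arithmetic shift is emulated from
// a logical one using the sign-extension identity, sketched here in C for a
// single lane (illustrative only):
//
//   uint64_t m = 0x8000000000000000ULL >> s;               // sign bit after the shift
//   int64_t  r = (int64_t)((((uint64_t)x >> s) ^ m) - m);  // == x >> s (arithmetic)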
 7104 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 7105   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 7106   match(Set dst (RShiftVL src shift));
 7107   effect(TEMP dst, TEMP tmp);
 7108   format %{ "vshiftq $dst,$src,$shift" %}
 7109   ins_encode %{
 7110     uint vlen = Matcher::vector_length(this);
 7111     if (vlen == 2) {
 7112       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7113       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7114       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7115       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7116       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7117       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7118     } else {
 7119       assert(vlen == 4, "sanity");
 7120       assert(UseAVX > 1, "required");
 7121       int vlen_enc = Assembler::AVX_256bit;
 7122       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7123       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7124       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7125       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7126       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7127     }
 7128   %}
 7129   ins_pipe( pipe_slow );
 7130 %}
 7131 
 7132 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7133   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7134   match(Set dst (RShiftVL src shift));
 7135   format %{ "vshiftq $dst,$src,$shift" %}
 7136   ins_encode %{
 7137     int vlen_enc = vector_length_encoding(this);
 7138     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7139   %}
 7140   ins_pipe( pipe_slow );
 7141 %}
 7142 
 7143 // ------------------- Variable Shift -----------------------------
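// A "variable" shift has a per-lane shift count (one count per element) rather
// than a single scalar count, so these patterns use the AVX2/EVEX
// vpsllv/vpsrlv/vpsrav family; byte lanes (and short lanes without AVX512BW)
// have no such instruction and are emulated by widening to wider lanes first.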
 7144 // Byte variable shift
 7145 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7146   predicate(Matcher::vector_length(n) <= 8 &&
 7147             n->as_ShiftV()->is_var_shift() &&
 7148             !VM_Version::supports_avx512bw());
 7149   match(Set dst ( LShiftVB src shift));
 7150   match(Set dst ( RShiftVB src shift));
 7151   match(Set dst (URShiftVB src shift));
 7152   effect(TEMP dst, TEMP vtmp);
 7153   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7154   ins_encode %{
 7155     assert(UseAVX >= 2, "required");
 7156 
 7157     int opcode = this->ideal_Opcode();
 7158     int vlen_enc = Assembler::AVX_128bit;
 7159     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7160     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7161   %}
 7162   ins_pipe( pipe_slow );
 7163 %}
 7164 
 7165 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7166   predicate(Matcher::vector_length(n) == 16 &&
 7167             n->as_ShiftV()->is_var_shift() &&
 7168             !VM_Version::supports_avx512bw());
 7169   match(Set dst ( LShiftVB src shift));
 7170   match(Set dst ( RShiftVB src shift));
 7171   match(Set dst (URShiftVB src shift));
 7172   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7173   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7174   ins_encode %{
 7175     assert(UseAVX >= 2, "required");
 7176 
 7177     int opcode = this->ideal_Opcode();
 7178     int vlen_enc = Assembler::AVX_128bit;
 7179     // Shift lower half and get word result in dst
 7180     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7181 
 7182     // Shift upper half and get word result in vtmp1
 7183     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7184     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7185     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7186 
 7187     // Merge and down convert the two word results to byte in dst
 7188     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7189   %}
 7190   ins_pipe( pipe_slow );
 7191 %}
 7192 
 7193 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7194   predicate(Matcher::vector_length(n) == 32 &&
 7195             n->as_ShiftV()->is_var_shift() &&
 7196             !VM_Version::supports_avx512bw());
 7197   match(Set dst ( LShiftVB src shift));
 7198   match(Set dst ( RShiftVB src shift));
 7199   match(Set dst (URShiftVB src shift));
 7200   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7201   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7202   ins_encode %{
 7203     assert(UseAVX >= 2, "required");
 7204 
 7205     int opcode = this->ideal_Opcode();
 7206     int vlen_enc = Assembler::AVX_128bit;
 7207     // Process lower 128 bits and get result in dst
 7208     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7209     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7210     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7211     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7212     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7213 
 7214     // Process higher 128 bits and get result in vtmp3
 7215     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7216     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7217     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7218     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7219     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7220     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7221     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7222 
 7223     // Merge the two results in dst
 7224     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7225   %}
 7226   ins_pipe( pipe_slow );
 7227 %}
 7228 
 7229 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7230   predicate(Matcher::vector_length(n) <= 32 &&
 7231             n->as_ShiftV()->is_var_shift() &&
 7232             VM_Version::supports_avx512bw());
 7233   match(Set dst ( LShiftVB src shift));
 7234   match(Set dst ( RShiftVB src shift));
 7235   match(Set dst (URShiftVB src shift));
 7236   effect(TEMP dst, TEMP vtmp);
 7237   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7238   ins_encode %{
 7239     assert(UseAVX > 2, "required");
 7240 
 7241     int opcode = this->ideal_Opcode();
 7242     int vlen_enc = vector_length_encoding(this);
 7243     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7244   %}
 7245   ins_pipe( pipe_slow );
 7246 %}
 7247 
 7248 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7249   predicate(Matcher::vector_length(n) == 64 &&
 7250             n->as_ShiftV()->is_var_shift() &&
 7251             VM_Version::supports_avx512bw());
 7252   match(Set dst ( LShiftVB src shift));
 7253   match(Set dst ( RShiftVB src shift));
 7254   match(Set dst (URShiftVB src shift));
 7255   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7256   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7257   ins_encode %{
 7258     assert(UseAVX > 2, "required");
 7259 
 7260     int opcode = this->ideal_Opcode();
 7261     int vlen_enc = Assembler::AVX_256bit;
 7262     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7263     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7264     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7265     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7266     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7267   %}
 7268   ins_pipe( pipe_slow );
 7269 %}
 7270 
 7271 // Short variable shift
 7272 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7273   predicate(Matcher::vector_length(n) <= 8 &&
 7274             n->as_ShiftV()->is_var_shift() &&
 7275             !VM_Version::supports_avx512bw());
 7276   match(Set dst ( LShiftVS src shift));
 7277   match(Set dst ( RShiftVS src shift));
 7278   match(Set dst (URShiftVS src shift));
 7279   effect(TEMP dst, TEMP vtmp);
 7280   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 7281   ins_encode %{
 7282     assert(UseAVX >= 2, "required");
 7283 
 7284     int opcode = this->ideal_Opcode();
 7285     bool sign = (opcode != Op_URShiftVS);
 7286     int vlen_enc = Assembler::AVX_256bit;
 7287     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7288     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7289     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7290     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7291     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7292     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7293   %}
 7294   ins_pipe( pipe_slow );
 7295 %}
 7296 
 7297 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7298   predicate(Matcher::vector_length(n) == 16 &&
 7299             n->as_ShiftV()->is_var_shift() &&
 7300             !VM_Version::supports_avx512bw());
 7301   match(Set dst ( LShiftVS src shift));
 7302   match(Set dst ( RShiftVS src shift));
 7303   match(Set dst (URShiftVS src shift));
 7304   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7305   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 7306   ins_encode %{
 7307     assert(UseAVX >= 2, "required");
 7308 
 7309     int opcode = this->ideal_Opcode();
 7310     bool sign = (opcode != Op_URShiftVS);
 7311     int vlen_enc = Assembler::AVX_256bit;
 7312     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7313     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7314     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7315     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7316     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7317 
 7318     // Shift upper half, with result in dst using vtmp1 as TEMP
 7319     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7320     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7321     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7322     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7323     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7324     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7325 
 7326     // Merge lower and upper half result into dst
 7327     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7328     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7329   %}
 7330   ins_pipe( pipe_slow );
 7331 %}
 7332 
 7333 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7334   predicate(n->as_ShiftV()->is_var_shift() &&
 7335             VM_Version::supports_avx512bw());
 7336   match(Set dst ( LShiftVS src shift));
 7337   match(Set dst ( RShiftVS src shift));
 7338   match(Set dst (URShiftVS src shift));
 7339   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7340   ins_encode %{
 7341     assert(UseAVX > 2, "required");
 7342 
 7343     int opcode = this->ideal_Opcode();
 7344     int vlen_enc = vector_length_encoding(this);
 7345     if (!VM_Version::supports_avx512vl()) {
 7346       vlen_enc = Assembler::AVX_512bit;
 7347     }
 7348     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7349   %}
 7350   ins_pipe( pipe_slow );
 7351 %}
 7352 
// Integer variable shift
 7354 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7355   predicate(n->as_ShiftV()->is_var_shift());
 7356   match(Set dst ( LShiftVI src shift));
 7357   match(Set dst ( RShiftVI src shift));
 7358   match(Set dst (URShiftVI src shift));
 7359   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7360   ins_encode %{
 7361     assert(UseAVX >= 2, "required");
 7362 
 7363     int opcode = this->ideal_Opcode();
 7364     int vlen_enc = vector_length_encoding(this);
 7365     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7366   %}
 7367   ins_pipe( pipe_slow );
 7368 %}
 7369 
 7370 // Long variable shift
 7371 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7372   predicate(n->as_ShiftV()->is_var_shift());
 7373   match(Set dst ( LShiftVL src shift));
 7374   match(Set dst (URShiftVL src shift));
 7375   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7376   ins_encode %{
 7377     assert(UseAVX >= 2, "required");
 7378 
 7379     int opcode = this->ideal_Opcode();
 7380     int vlen_enc = vector_length_encoding(this);
 7381     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7382   %}
 7383   ins_pipe( pipe_slow );
 7384 %}
 7385 
 7386 // Long variable arithmetic right shift
 7387 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7388   predicate(Matcher::vector_length(n) <= 4 &&
 7389             n->as_ShiftV()->is_var_shift() &&
 7390             UseAVX == 2);
 7391   match(Set dst (RShiftVL src shift));
 7392   effect(TEMP dst, TEMP vtmp);
 7393   format %{ "vector_varshift_long $dst,$src,$shift\t! using $vtmp as TEMP" %}
 7394   ins_encode %{
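          // AVX2 has no variable arithmetic right shift for 64-bit lanes (vpsravq is AVX-512 only),
          // so this form passes $vtmp to let the macro assembler emulate it.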
 7395     int opcode = this->ideal_Opcode();
 7396     int vlen_enc = vector_length_encoding(this);
 7397     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7398                  $vtmp$$XMMRegister);
 7399   %}
 7400   ins_pipe( pipe_slow );
 7401 %}
 7402 
 7403 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7404   predicate(n->as_ShiftV()->is_var_shift() &&
 7405             UseAVX > 2);
 7406   match(Set dst (RShiftVL src shift));
 7407   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7408   ins_encode %{
 7409     int opcode = this->ideal_Opcode();
 7410     int vlen_enc = vector_length_encoding(this);
 7411     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7412   %}
 7413   ins_pipe( pipe_slow );
 7414 %}
 7415 
 7416 // --------------------------------- AND --------------------------------------
 7417 
 7418 instruct vand(vec dst, vec src) %{
 7419   predicate(UseAVX == 0);
 7420   match(Set dst (AndV dst src));
 7421   format %{ "pand    $dst,$src\t! and vectors" %}
 7422   ins_encode %{
 7423     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7424   %}
 7425   ins_pipe( pipe_slow );
 7426 %}
 7427 
 7428 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7429   predicate(UseAVX > 0);
 7430   match(Set dst (AndV src1 src2));
 7431   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7432   ins_encode %{
 7433     int vlen_enc = vector_length_encoding(this);
 7434     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7435   %}
 7436   ins_pipe( pipe_slow );
 7437 %}
 7438 
 7439 instruct vand_mem(vec dst, vec src, memory mem) %{
 7440   predicate((UseAVX > 0) &&
 7441             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7442   match(Set dst (AndV src (LoadVector mem)));
 7443   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7444   ins_encode %{
 7445     int vlen_enc = vector_length_encoding(this);
 7446     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7447   %}
 7448   ins_pipe( pipe_slow );
 7449 %}
 7450 
 7451 // --------------------------------- OR ---------------------------------------
 7452 
 7453 instruct vor(vec dst, vec src) %{
 7454   predicate(UseAVX == 0);
 7455   match(Set dst (OrV dst src));
 7456   format %{ "por     $dst,$src\t! or vectors" %}
 7457   ins_encode %{
 7458     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7459   %}
 7460   ins_pipe( pipe_slow );
 7461 %}
 7462 
 7463 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7464   predicate(UseAVX > 0);
 7465   match(Set dst (OrV src1 src2));
 7466   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7467   ins_encode %{
 7468     int vlen_enc = vector_length_encoding(this);
 7469     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7470   %}
 7471   ins_pipe( pipe_slow );
 7472 %}
 7473 
 7474 instruct vor_mem(vec dst, vec src, memory mem) %{
 7475   predicate((UseAVX > 0) &&
 7476             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7477   match(Set dst (OrV src (LoadVector mem)));
 7478   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7479   ins_encode %{
 7480     int vlen_enc = vector_length_encoding(this);
 7481     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7482   %}
 7483   ins_pipe( pipe_slow );
 7484 %}
 7485 
 7486 // --------------------------------- XOR --------------------------------------
 7487 
 7488 instruct vxor(vec dst, vec src) %{
 7489   predicate(UseAVX == 0);
 7490   match(Set dst (XorV dst src));
 7491   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7492   ins_encode %{
 7493     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7494   %}
 7495   ins_pipe( pipe_slow );
 7496 %}
 7497 
 7498 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7499   predicate(UseAVX > 0);
 7500   match(Set dst (XorV src1 src2));
 7501   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7502   ins_encode %{
 7503     int vlen_enc = vector_length_encoding(this);
 7504     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7505   %}
 7506   ins_pipe( pipe_slow );
 7507 %}
 7508 
 7509 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7510   predicate((UseAVX > 0) &&
 7511             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7512   match(Set dst (XorV src (LoadVector mem)));
 7513   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7514   ins_encode %{
 7515     int vlen_enc = vector_length_encoding(this);
 7516     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7517   %}
 7518   ins_pipe( pipe_slow );
 7519 %}
 7520 
 7521 // --------------------------------- VectorCast --------------------------------------
 7522 
 7523 instruct vcastBtoX(vec dst, vec src) %{
 7524   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7525   match(Set dst (VectorCastB2X src));
 7526   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7527   ins_encode %{
 7528     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7529     int vlen_enc = vector_length_encoding(this);
 7530     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7531   %}
 7532   ins_pipe( pipe_slow );
 7533 %}
 7534 
 7535 instruct vcastBtoD(legVec dst, legVec src) %{
 7536   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7537   match(Set dst (VectorCastB2X src));
 7538   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7539   ins_encode %{
 7540     int vlen_enc = vector_length_encoding(this);
 7541     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7542   %}
 7543   ins_pipe( pipe_slow );
 7544 %}
 7545 
 7546 instruct castStoX(vec dst, vec src) %{
 7547   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7548             Matcher::vector_length(n->in(1)) <= 8 && // src
 7549             Matcher::vector_element_basic_type(n) == T_BYTE);
 7550   match(Set dst (VectorCastS2X src));
 7551   format %{ "vector_cast_s2x $dst,$src" %}
 7552   ins_encode %{
 7553     assert(UseAVX > 0, "required");
 7554 
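          // Mask each short down to its low byte, then pack the shorts into bytes.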
 7555     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7556     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7557   %}
 7558   ins_pipe( pipe_slow );
 7559 %}
 7560 
 7561 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7562   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7563             Matcher::vector_length(n->in(1)) == 16 && // src
 7564             Matcher::vector_element_basic_type(n) == T_BYTE);
 7565   effect(TEMP dst, TEMP vtmp);
 7566   match(Set dst (VectorCastS2X src));
 7567   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7568   ins_encode %{
 7569     assert(UseAVX > 0, "required");
 7570 
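          // Mask the 16 shorts down to their low bytes, fold the upper 128-bit lane into $vtmp,
          // then pack both halves into the low 128 bits of $dst.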
 7571     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7572     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7573     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7574     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7575   %}
 7576   ins_pipe( pipe_slow );
 7577 %}
 7578 
 7579 instruct vcastStoX_evex(vec dst, vec src) %{
 7580   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7581             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7582   match(Set dst (VectorCastS2X src));
 7583   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7584   ins_encode %{
 7585     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7586     int src_vlen_enc = vector_length_encoding(this, $src);
 7587     int vlen_enc = vector_length_encoding(this);
 7588     switch (to_elem_bt) {
 7589       case T_BYTE:
 7590         if (!VM_Version::supports_avx512vl()) {
 7591           vlen_enc = Assembler::AVX_512bit;
 7592         }
 7593         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7594         break;
 7595       case T_INT:
 7596         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7597         break;
 7598       case T_FLOAT:
 7599         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7600         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7601         break;
 7602       case T_LONG:
 7603         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7604         break;
 7605       case T_DOUBLE: {
 7606         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
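                // Shorts widen 4x when cast to double: sign-extend to ints at half the destination
                // width, then convert those ints to doubles at full width.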
 7607         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7608         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7609         break;
 7610       }
 7611       default:
 7612         ShouldNotReachHere();
 7613     }
 7614   %}
 7615   ins_pipe( pipe_slow );
 7616 %}
 7617 
 7618 instruct castItoX(vec dst, vec src) %{
 7619   predicate(UseAVX <= 2 &&
 7620             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7621             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7622   match(Set dst (VectorCastI2X src));
 7623   format %{ "vector_cast_i2x $dst,$src" %}
 7624   ins_encode %{
 7625     assert(UseAVX > 0, "required");
 7626 
 7627     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7628     int vlen_enc = vector_length_encoding(this, $src);
 7629 
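          // Narrow by masking the low bytes/shorts of each int and packing: twice for int->byte
          // (dword->word, then word->byte), once for int->short.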
 7630     if (to_elem_bt == T_BYTE) {
 7631       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7632       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7633       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7634     } else {
 7635       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7636       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7637       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7638     }
 7639   %}
 7640   ins_pipe( pipe_slow );
 7641 %}
 7642 
 7643 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7644   predicate(UseAVX <= 2 &&
 7645             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7646             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7647   match(Set dst (VectorCastI2X src));
 7648   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7649   effect(TEMP dst, TEMP vtmp);
 7650   ins_encode %{
 7651     assert(UseAVX > 0, "required");
 7652 
 7653     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7654     int vlen_enc = vector_length_encoding(this, $src);
 7655 
 7656     if (to_elem_bt == T_BYTE) {
 7657       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7658       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7659       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7660       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7661     } else {
 7662       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7663       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7664       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7665       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7666     }
 7667   %}
 7668   ins_pipe( pipe_slow );
 7669 %}
 7670 
 7671 instruct vcastItoX_evex(vec dst, vec src) %{
 7672   predicate(UseAVX > 2 ||
 7673             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7674   match(Set dst (VectorCastI2X src));
 7675   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7676   ins_encode %{
 7677     assert(UseAVX > 0, "required");
 7678 
 7679     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7680     int src_vlen_enc = vector_length_encoding(this, $src);
 7681     int dst_vlen_enc = vector_length_encoding(this);
 7682     switch (dst_elem_bt) {
 7683       case T_BYTE:
 7684         if (!VM_Version::supports_avx512vl()) {
 7685           src_vlen_enc = Assembler::AVX_512bit;
 7686         }
 7687         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7688         break;
 7689       case T_SHORT:
 7690         if (!VM_Version::supports_avx512vl()) {
 7691           src_vlen_enc = Assembler::AVX_512bit;
 7692         }
 7693         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7694         break;
 7695       case T_FLOAT:
 7696         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7697         break;
 7698       case T_LONG:
 7699         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7700         break;
 7701       case T_DOUBLE:
 7702         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7703         break;
 7704       default:
 7705         ShouldNotReachHere();
 7706     }
 7707   %}
 7708   ins_pipe( pipe_slow );
 7709 %}
 7710 
 7711 instruct vcastLtoBS(vec dst, vec src) %{
 7712   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7713             UseAVX <= 2);
 7714   match(Set dst (VectorCastL2X src));
 7715   format %{ "vector_cast_l2x  $dst,$src" %}
 7716   ins_encode %{
 7717     assert(UseAVX > 0, "required");
 7718 
 7719     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7720     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7721     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7722                                                       : ExternalAddress(vector_int_to_short_mask());
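          // Gather the low dword of each long into the bottom of the register, then narrow those
          // ints with the usual mask-and-pack sequence.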
 7723     if (vlen <= 16) {
 7724       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7725       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7726       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7727     } else {
 7728       assert(vlen <= 32, "required");
 7729       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7730       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7731       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7732       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7733     }
 7734     if (to_elem_bt == T_BYTE) {
 7735       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7736     }
 7737   %}
 7738   ins_pipe( pipe_slow );
 7739 %}
 7740 
 7741 instruct vcastLtoX_evex(vec dst, vec src) %{
 7742   predicate(UseAVX > 2 ||
 7743             (Matcher::vector_element_basic_type(n) == T_INT ||
 7744              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7745              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7746   match(Set dst (VectorCastL2X src));
 7747   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7748   ins_encode %{
 7749     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7750     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7751     int vlen_enc = vector_length_encoding(this, $src);
 7752     switch (to_elem_bt) {
 7753       case T_BYTE:
 7754         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7755           vlen_enc = Assembler::AVX_512bit;
 7756         }
 7757         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7758         break;
 7759       case T_SHORT:
 7760         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7761           vlen_enc = Assembler::AVX_512bit;
 7762         }
 7763         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7764         break;
 7765       case T_INT:
 7766         if (vlen == 8) {
 7767           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7768             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7769           }
 7770         } else if (vlen == 16) {
 7771           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7772         } else if (vlen == 32) {
 7773           if (UseAVX > 2) {
 7774             if (!VM_Version::supports_avx512vl()) {
 7775               vlen_enc = Assembler::AVX_512bit;
 7776             }
 7777             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7778           } else {
 7779             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7780             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7781           }
 7782         } else { // vlen == 64
 7783           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7784         }
 7785         break;
 7786       case T_FLOAT:
 7787         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7788         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7789         break;
 7790       case T_DOUBLE:
 7791         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7792         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7793         break;
 7794 
 7795       default: assert(false, "%s", type2name(to_elem_bt));
 7796     }
 7797   %}
 7798   ins_pipe( pipe_slow );
 7799 %}
 7800 
 7801 instruct vcastFtoD_reg(vec dst, vec src) %{
 7802   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7803   match(Set dst (VectorCastF2X src));
 7804   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7805   ins_encode %{
 7806     int vlen_enc = vector_length_encoding(this);
 7807     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7808   %}
 7809   ins_pipe( pipe_slow );
 7810 %}
 7811 
 7812 
 7813 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7814   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7815             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7816   match(Set dst (VectorCastF2X src));
 7817   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7818   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7819   ins_encode %{
 7820     int vlen_enc = vector_length_encoding(this, $src);
 7821     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7822     // JDK-8292878 removed the need for an explicit scratch register to load addresses wider
 7823     // than 32 bits for register-indirect addressing, since stub constants live in the code
 7824     // cache and ReservedCodeCacheSize is currently capped at 2G. Targets are free to raise
 7825     // that limit, but a code cache larger than 2G is unrealistic in practice; on the flip
 7826     // side, keeping the cap saves a temporary register allocation, which in the limiting
 7827     // case can prevent spilling in blocks with high register pressure.
 7829     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7830                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7831                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7832   %}
 7833   ins_pipe( pipe_slow );
 7834 %}
 7835 
 7836 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7837   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7838             is_integral_type(Matcher::vector_element_basic_type(n)));
 7839   match(Set dst (VectorCastF2X src));
 7840   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7841   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7842   ins_encode %{
 7843     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7844     if (to_elem_bt == T_LONG) {
 7845       int vlen_enc = vector_length_encoding(this);
 7846       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7847                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7848                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7849     } else {
 7850       int vlen_enc = vector_length_encoding(this, $src);
 7851       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7852                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7853                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7854     }
 7855   %}
 7856   ins_pipe( pipe_slow );
 7857 %}
 7858 
 7859 instruct vcastDtoF_reg(vec dst, vec src) %{
 7860   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7861   match(Set dst (VectorCastD2X src));
 7862   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7863   ins_encode %{
 7864     int vlen_enc = vector_length_encoding(this, $src);
 7865     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7866   %}
 7867   ins_pipe( pipe_slow );
 7868 %}
 7869 
 7870 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7871   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7872             is_integral_type(Matcher::vector_element_basic_type(n)));
 7873   match(Set dst (VectorCastD2X src));
 7874   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7875   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7876   ins_encode %{
 7877     int vlen_enc = vector_length_encoding(this, $src);
 7878     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7879     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7880                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7881                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7882   %}
 7883   ins_pipe( pipe_slow );
 7884 %}
 7885 
 7886 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7887   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7888             is_integral_type(Matcher::vector_element_basic_type(n)));
 7889   match(Set dst (VectorCastD2X src));
 7890   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7891   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7892   ins_encode %{
 7893     int vlen_enc = vector_length_encoding(this, $src);
 7894     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7895     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7896                               ExternalAddress(vector_float_signflip());
 7897     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7898                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7899   %}
 7900   ins_pipe( pipe_slow );
 7901 %}
 7902 
 7903 instruct vucast(vec dst, vec src) %{
 7904   match(Set dst (VectorUCastB2X src));
 7905   match(Set dst (VectorUCastS2X src));
 7906   match(Set dst (VectorUCastI2X src));
 7907   format %{ "vector_ucast $dst,$src\t!" %}
 7908   ins_encode %{
 7909     assert(UseAVX > 0, "required");
 7910 
 7911     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7912     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7913     int vlen_enc = vector_length_encoding(this);
 7914     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7915   %}
 7916   ins_pipe( pipe_slow );
 7917 %}
 7918 
 7919 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7920   predicate(!VM_Version::supports_avx512vl() &&
 7921             Matcher::vector_length_in_bytes(n) < 64 &&
 7922             Matcher::vector_element_basic_type(n) == T_INT);
 7923   match(Set dst (RoundVF src));
 7924   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7925   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7926   ins_encode %{
 7927     int vlen_enc = vector_length_encoding(this);
 7928     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7929     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7930                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7931                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7932   %}
 7933   ins_pipe( pipe_slow );
 7934 %}
 7935 
 7936 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7937   predicate((VM_Version::supports_avx512vl() ||
 7938              Matcher::vector_length_in_bytes(n) == 64) &&
 7939              Matcher::vector_element_basic_type(n) == T_INT);
 7940   match(Set dst (RoundVF src));
 7941   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7942   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7943   ins_encode %{
 7944     int vlen_enc = vector_length_encoding(this);
 7945     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7946     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7947                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7948                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7949   %}
 7950   ins_pipe( pipe_slow );
 7951 %}
 7952 
 7953 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7954   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7955   match(Set dst (RoundVD src));
 7956   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7957   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7958   ins_encode %{
 7959     int vlen_enc = vector_length_encoding(this);
 7960     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7961     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7962                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7963                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7964   %}
 7965   ins_pipe( pipe_slow );
 7966 %}
 7967 
 7968 // --------------------------------- VectorMaskCmp --------------------------------------
 7969 
 7970 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7971   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7972             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7973             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7974             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7975   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7976   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7977   ins_encode %{
 7978     int vlen_enc = vector_length_encoding(this, $src1);
 7979     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7980     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7981       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7982     } else {
 7983       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7984     }
 7985   %}
 7986   ins_pipe( pipe_slow );
 7987 %}
 7988 
 7989 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7990   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7991             n->bottom_type()->isa_vectmask() == nullptr &&
 7992             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7993   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7994   effect(TEMP ktmp);
 7995   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7996   ins_encode %{
 7997     int vlen_enc = Assembler::AVX_512bit;
 7998     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7999     KRegister mask = k0; // The comparison itself is not being masked.
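          // The EVEX compare writes a k-register; materialize it as a vector mask by loading
          // vector_all_bits_set under that mask, zeroing the unselected lanes.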
 8000     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 8001       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8002       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 8003     } else {
 8004       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8005       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 8006     }
 8007   %}
 8008   ins_pipe( pipe_slow );
 8009 %}
 8010 
 8011 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 8012   predicate(n->bottom_type()->isa_vectmask() &&
 8013             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 8014   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8015   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8016   ins_encode %{
 8017     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8018     int vlen_enc = vector_length_encoding(this, $src1);
 8019     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 8020     KRegister mask = k0; // The comparison itself is not being masked.
 8021     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 8022       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8023     } else {
 8024       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8025     }
 8026   %}
 8027   ins_pipe( pipe_slow );
 8028 %}
 8029 
 8030 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 8031   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8032             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8033             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8034             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8035             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8036             (n->in(2)->get_int() == BoolTest::eq ||
 8037              n->in(2)->get_int() == BoolTest::lt ||
 8038              n->in(2)->get_int() == BoolTest::gt)); // cond
 8039   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8040   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 8041   ins_encode %{
 8042     int vlen_enc = vector_length_encoding(this, $src1);
 8043     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8044     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8045     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 8046   %}
 8047   ins_pipe( pipe_slow );
 8048 %}
 8049 
 8050 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8051   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8052             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8053             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8054             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8055             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8056             (n->in(2)->get_int() == BoolTest::ne ||
 8057              n->in(2)->get_int() == BoolTest::le ||
 8058              n->in(2)->get_int() == BoolTest::ge)); // cond
 8059   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8060   effect(TEMP dst, TEMP xtmp);
 8061   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8062   ins_encode %{
 8063     int vlen_enc = vector_length_encoding(this, $src1);
 8064     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8065     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8066     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8067   %}
 8068   ins_pipe( pipe_slow );
 8069 %}
 8070 
 8071 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8072   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8073             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8074             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8075             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8076             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8077   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8078   effect(TEMP dst, TEMP xtmp);
 8079   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8080   ins_encode %{
 8081     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 8082     int vlen_enc = vector_length_encoding(this, $src1);
 8083     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8084     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8085 
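          // There are no unsigned vector compares before AVX-512, so flip the sign bit of both
          // operands (broadcast the per-type high bit and XOR) and use a signed compare instead.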
 8086     if (vlen_enc == Assembler::AVX_128bit) {
 8087       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8088     } else {
 8089       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8090     }
 8091     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8092     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8093     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8094   %}
 8095   ins_pipe( pipe_slow );
 8096 %}
 8097 
 8098 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8099   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8100              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8101              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8102   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8103   effect(TEMP ktmp);
 8104   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8105   ins_encode %{
 8106     assert(UseAVX > 2, "required");
 8107 
 8108     int vlen_enc = vector_length_encoding(this, $src1);
 8109     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8110     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8111     KRegister mask = k0; // The comparison itself is not being masked.
 8112     bool merge = false;
 8113     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8114 
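          // As with the FP case above, expand the k-register compare result into an all-ones/zero
          // vector via a zero-masked load of vector_all_bits_set.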
 8115     switch (src1_elem_bt) {
 8116       case T_INT: {
 8117         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8118         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8119         break;
 8120       }
 8121       case T_LONG: {
 8122         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8123         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8124         break;
 8125       }
 8126       default: assert(false, "%s", type2name(src1_elem_bt));
 8127     }
 8128   %}
 8129   ins_pipe( pipe_slow );
 8130 %}
 8131 
 8132 
 8133 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8134   predicate(n->bottom_type()->isa_vectmask() &&
 8135             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8136   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8137   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 8138   ins_encode %{
 8139     assert(UseAVX > 2, "required");
 8140     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8141 
 8142     int vlen_enc = vector_length_encoding(this, $src1);
 8143     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8144     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8145     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8146 
 8147     // Compare into the destination mask register, switching on src1's element type.
 8148     switch (src1_elem_bt) {
 8149       case T_BYTE: {
 8150         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8151         break;
 8152       }
 8153       case T_SHORT: {
 8154         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8155         break;
 8156       }
 8157       case T_INT: {
 8158         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8159         break;
 8160       }
 8161       case T_LONG: {
 8162         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8163         break;
 8164       }
 8165       default: assert(false, "%s", type2name(src1_elem_bt));
 8166     }
 8167   %}
 8168   ins_pipe( pipe_slow );
 8169 %}
 8170 
 8171 // Extract
 8172 
 8173 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8174   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8175   match(Set dst (ExtractI src idx));
 8176   match(Set dst (ExtractS src idx));
 8177   match(Set dst (ExtractB src idx));
 8178   format %{ "extractI $dst,$src,$idx\t!" %}
 8179   ins_encode %{
 8180     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8181 
 8182     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8183     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8184   %}
 8185   ins_pipe( pipe_slow );
 8186 %}
 8187 
 8188 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8189   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8190             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8191   match(Set dst (ExtractI src idx));
 8192   match(Set dst (ExtractS src idx));
 8193   match(Set dst (ExtractB src idx));
 8194   effect(TEMP vtmp);
 8195   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8196   ins_encode %{
 8197     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8198 
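          // For 256/512-bit sources, first isolate the 128-bit lane that holds the element, then
          // extract the element from that lane.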
 8199     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8200     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8201     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8202   %}
 8203   ins_pipe( pipe_slow );
 8204 %}
 8205 
 8206 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8207   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8208   match(Set dst (ExtractL src idx));
 8209   format %{ "extractL $dst,$src,$idx\t!" %}
 8210   ins_encode %{
 8211     assert(UseSSE >= 4, "required");
 8212     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8213 
 8214     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8215   %}
 8216   ins_pipe( pipe_slow );
 8217 %}
 8218 
 8219 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8220   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8221             Matcher::vector_length(n->in(1)) == 8);  // src
 8222   match(Set dst (ExtractL src idx));
 8223   effect(TEMP vtmp);
 8224   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8225   ins_encode %{
 8226     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8227 
 8228     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8229     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8230   %}
 8231   ins_pipe( pipe_slow );
 8232 %}
 8233 
 8234 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8235   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8236   match(Set dst (ExtractF src idx));
 8237   effect(TEMP dst, TEMP vtmp);
 8238   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8239   ins_encode %{
 8240     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8241 
 8242     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8243   %}
 8244   ins_pipe( pipe_slow );
 8245 %}
 8246 
 8247 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8248   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8249             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8250   match(Set dst (ExtractF src idx));
 8251   effect(TEMP vtmp);
 8252   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8253   ins_encode %{
 8254     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8255 
 8256     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8257     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8258   %}
 8259   ins_pipe( pipe_slow );
 8260 %}
 8261 
 8262 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8263   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8264   match(Set dst (ExtractD src idx));
 8265   format %{ "extractD $dst,$src,$idx\t!" %}
 8266   ins_encode %{
 8267     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8268 
 8269     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8270   %}
 8271   ins_pipe( pipe_slow );
 8272 %}
 8273 
 8274 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8275   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8276             Matcher::vector_length(n->in(1)) == 8);  // src
 8277   match(Set dst (ExtractD src idx));
 8278   effect(TEMP vtmp);
 8279   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8280   ins_encode %{
 8281     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8282 
 8283     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8284     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8285   %}
 8286   ins_pipe( pipe_slow );
 8287 %}
 8288 
 8289 // --------------------------------- Vector Blend --------------------------------------
 8290 
 8291 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8292   predicate(UseAVX == 0);
 8293   match(Set dst (VectorBlend (Binary dst src) mask));
 8294   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8295   effect(TEMP tmp);
 8296   ins_encode %{
 8297     assert(UseSSE >= 4, "required");
 8298 
 8299     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8300       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8301     }
 8302     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8303   %}
 8304   ins_pipe( pipe_slow );
 8305 %}
 8306 
 8307 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8308   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8309             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8310             Matcher::vector_length_in_bytes(n) <= 32 &&
 8311             is_integral_type(Matcher::vector_element_basic_type(n)));
 8312   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8313   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8314   ins_encode %{
 8315     int vlen_enc = vector_length_encoding(this);
 8316     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8317   %}
 8318   ins_pipe( pipe_slow );
 8319 %}
 8320 
 8321 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8322   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8323             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8324             Matcher::vector_length_in_bytes(n) <= 32 &&
 8325             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8326   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8327   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8328   ins_encode %{
 8329     int vlen_enc = vector_length_encoding(this);
 8330     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8331   %}
 8332   ins_pipe( pipe_slow );
 8333 %}
 8334 
 8335 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8336   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8337             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8338             Matcher::vector_length_in_bytes(n) <= 32);
 8339   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8340   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8341   effect(TEMP vtmp, TEMP dst);
 8342   ins_encode %{
 8343     int vlen_enc = vector_length_encoding(this);
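          // Blend without vpblendvb: dst = (src2 & mask) | (src1 & ~mask).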
 8344     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8345     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8346     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8347   %}
 8348   ins_pipe( pipe_slow );
 8349 %}
 8350 
 8351 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8352   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8353             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8354   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8355   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8356   effect(TEMP ktmp);
 8357   ins_encode %{
 8358     int vlen_enc = Assembler::AVX_512bit;
 8359     BasicType elem_bt = Matcher::vector_element_basic_type(this);
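          // Turn the all-ones/zero vector mask into a k-register, then do a merging blend under it.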
 8360     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8361     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8362   %}
 8363   ins_pipe( pipe_slow );
 8364 %}
 8365 
 8366 
 8367 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8368   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8369             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8370              VM_Version::supports_avx512bw()));
 8371   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8372   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8373   ins_encode %{
 8374     int vlen_enc = vector_length_encoding(this);
 8375     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8376     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8377   %}
 8378   ins_pipe( pipe_slow );
 8379 %}
 8380 
 8381 // --------------------------------- ABS --------------------------------------
 8382 // a = |a|
 8383 instruct vabsB_reg(vec dst, vec src) %{
 8384   match(Set dst (AbsVB  src));
 8385   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8386   ins_encode %{
 8387     uint vlen = Matcher::vector_length(this);
 8388     if (vlen <= 16) {
 8389       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8390     } else {
 8391       int vlen_enc = vector_length_encoding(this);
 8392       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8393     }
 8394   %}
 8395   ins_pipe( pipe_slow );
 8396 %}
 8397 
 8398 instruct vabsS_reg(vec dst, vec src) %{
 8399   match(Set dst (AbsVS  src));
 8400   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8401   ins_encode %{
 8402     uint vlen = Matcher::vector_length(this);
 8403     if (vlen <= 8) {
 8404       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8405     } else {
 8406       int vlen_enc = vector_length_encoding(this);
 8407       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8408     }
 8409   %}
 8410   ins_pipe( pipe_slow );
 8411 %}
 8412 
 8413 instruct vabsI_reg(vec dst, vec src) %{
 8414   match(Set dst (AbsVI  src));
 8415   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8416   ins_encode %{
 8417     uint vlen = Matcher::vector_length(this);
 8418     if (vlen <= 4) {
 8419       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8420     } else {
 8421       int vlen_enc = vector_length_encoding(this);
 8422       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8423     }
 8424   %}
 8425   ins_pipe( pipe_slow );
 8426 %}
 8427 
 8428 instruct vabsL_reg(vec dst, vec src) %{
 8429   match(Set dst (AbsVL  src));
 8430   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8431   ins_encode %{
 8432     assert(UseAVX > 2, "required");
 8433     int vlen_enc = vector_length_encoding(this);
 8434     if (!VM_Version::supports_avx512vl()) {
 8435       vlen_enc = Assembler::AVX_512bit;
 8436     }
 8437     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8438   %}
 8439   ins_pipe( pipe_slow );
 8440 %}
 8441 
 8442 // --------------------------------- ABSNEG --------------------------------------
 8443 
 8444 instruct vabsnegF(vec dst, vec src) %{
 8445   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8446   match(Set dst (AbsVF src));
 8447   match(Set dst (NegVF src));
 8448   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8449   ins_cost(150);
 8450   ins_encode %{
 8451     int opcode = this->ideal_Opcode();
 8452     int vlen = Matcher::vector_length(this);
 8453     if (vlen == 2) {
 8454       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8455     } else {
 8456       assert(vlen == 8 || vlen == 16, "required");
 8457       int vlen_enc = vector_length_encoding(this);
 8458       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8459     }
 8460   %}
 8461   ins_pipe( pipe_slow );
 8462 %}
 8463 
 8464 instruct vabsneg4F(vec dst) %{
 8465   predicate(Matcher::vector_length(n) == 4);
 8466   match(Set dst (AbsVF dst));
 8467   match(Set dst (NegVF dst));
 8468   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8469   ins_cost(150);
 8470   ins_encode %{
 8471     int opcode = this->ideal_Opcode();
 8472     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8473   %}
 8474   ins_pipe( pipe_slow );
 8475 %}
 8476 
 8477 instruct vabsnegD(vec dst, vec src) %{
 8478   match(Set dst (AbsVD  src));
 8479   match(Set dst (NegVD  src));
 8480   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8481   ins_encode %{
 8482     int opcode = this->ideal_Opcode();
 8483     uint vlen = Matcher::vector_length(this);
 8484     if (vlen == 2) {
 8485       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8486     } else {
 8487       int vlen_enc = vector_length_encoding(this);
 8488       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8489     }
 8490   %}
 8491   ins_pipe( pipe_slow );
 8492 %}
 8493 
 8494 //------------------------------------- VectorTest --------------------------------------------
 8495 
 8496 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8497   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8498   match(Set cr (VectorTest src1 src2));
 8499   effect(TEMP vtmp);
 8500   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8501   ins_encode %{
 8502     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8503     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8504     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8505   %}
 8506   ins_pipe( pipe_slow );
 8507 %}
 8508 
 8509 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8510   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8511   match(Set cr (VectorTest src1 src2));
 8512   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8513   ins_encode %{
 8514     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8515     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8516     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8517   %}
 8518   ins_pipe( pipe_slow );
 8519 %}
 8520 
 8521 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8522   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8523              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8524             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8525   match(Set cr (VectorTest src1 src2));
 8526   effect(TEMP tmp);
 8527   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8528   ins_encode %{
 8529     uint masklen = Matcher::vector_length(this, $src1);
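          // Move the mask bits into a GPR, keep only the low masklen bits, and compare against
          // all-ones; the resulting flags say whether every lane was set.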
 8530     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8531     __ andl($tmp$$Register, (1 << masklen) - 1);
 8532     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8533   %}
 8534   ins_pipe( pipe_slow );
 8535 %}
 8536 
 8537 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8538   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8539              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8540             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8541   match(Set cr (VectorTest src1 src2));
 8542   effect(TEMP tmp);
 8543   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8544   ins_encode %{
 8545     uint masklen = Matcher::vector_length(this, $src1);
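    // At least one of the masklen lanes is true iff the low masklen bits are not all
    // zero; the andl leaves ZF clear exactly in that case.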
 8546     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8547     __ andl($tmp$$Register, (1 << masklen) - 1);
 8548   %}
 8549   ins_pipe( pipe_slow );
 8550 %}
 8551 
 8552 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8553   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8554             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8555   match(Set cr (VectorTest src1 src2));
 8556   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8557   ins_encode %{
 8558     uint masklen = Matcher::vector_length(this, $src1);
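    // OR the mask register with itself: kortest sets CF when all masklen bits are
    // one (alltrue) and ZF when none are set (anytrue).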
 8559     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8560   %}
 8561   ins_pipe( pipe_slow );
 8562 %}
 8563 
 8564 //------------------------------------- LoadMask --------------------------------------------
 8565 
 8566 instruct loadMask(legVec dst, legVec src) %{
 8567   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8568   match(Set dst (VectorLoadMask src));
 8569   effect(TEMP dst);
 8570   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8571   ins_encode %{
 8572     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8573     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8574     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8575   %}
 8576   ins_pipe( pipe_slow );
 8577 %}
 8578 
 8579 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8580   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8581   match(Set dst (VectorLoadMask src));
 8582   effect(TEMP xtmp);
 8583   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8584   ins_encode %{
 8585     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8586                         true, Assembler::AVX_512bit);
 8587   %}
 8588   ins_pipe( pipe_slow );
 8589 %}
 8590 
 8591 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8592   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8593   match(Set dst (VectorLoadMask src));
 8594   effect(TEMP xtmp);
 8595   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8596   ins_encode %{
 8597     int vlen_enc = vector_length_encoding(in(1));
 8598     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8599                         false, vlen_enc);
 8600   %}
 8601   ins_pipe( pipe_slow );
 8602 %}
 8603 
 8604 //------------------------------------- StoreMask --------------------------------------------
 8605 
 8606 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8607   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8608   match(Set dst (VectorStoreMask src size));
 8609   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8610   ins_encode %{
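    // Mask lanes arrive as 0 or -1 per byte; taking the absolute value turns them
    // into the 0/1 boolean bytes that VectorStoreMask produces.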
 8611     int vlen = Matcher::vector_length(this);
 8612     if (vlen <= 16 && UseAVX <= 2) {
 8613       assert(UseSSE >= 3, "required");
 8614       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8615     } else {
 8616       assert(UseAVX > 0, "required");
 8617       int src_vlen_enc = vector_length_encoding(this, $src);
 8618       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8619     }
 8620   %}
 8621   ins_pipe( pipe_slow );
 8622 %}
 8623 
 8624 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8625   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8626   match(Set dst (VectorStoreMask src size));
 8627   effect(TEMP_DEF dst, TEMP xtmp);
 8628   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8629   ins_encode %{
 8630     int vlen_enc = Assembler::AVX_128bit;
 8631     int vlen = Matcher::vector_length(this);
 8632     if (vlen <= 8) {
 8633       assert(UseSSE >= 3, "required");
 8634       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8635       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8636       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8637     } else {
 8638       assert(UseAVX > 0, "required");
 8639       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8640       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8641       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8642     }
 8643   %}
 8644   ins_pipe( pipe_slow );
 8645 %}
 8646 
 8647 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8648   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8649   match(Set dst (VectorStoreMask src size));
 8650   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8651   effect(TEMP_DEF dst, TEMP xtmp);
 8652   ins_encode %{
 8653     int vlen_enc = Assembler::AVX_128bit;
 8654     int vlen = Matcher::vector_length(this);
 8655     if (vlen <= 4) {
 8656       assert(UseSSE >= 3, "required");
 8657       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8658       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8659       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8660       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8661     } else {
 8662       assert(UseAVX > 0, "required");
 8663       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8664       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8665       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8666       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8667       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8668     }
 8669   %}
 8670   ins_pipe( pipe_slow );
 8671 %}
 8672 
 8673 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8674   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8675   match(Set dst (VectorStoreMask src size));
 8676   effect(TEMP_DEF dst, TEMP xtmp);
 8677   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8678   ins_encode %{
 8679     assert(UseSSE >= 3, "required");
 8680     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8681     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8682     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8683     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8684     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8685   %}
 8686   ins_pipe( pipe_slow );
 8687 %}
 8688 
 8689 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8690   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8691   match(Set dst (VectorStoreMask src size));
 8692   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8693   effect(TEMP_DEF dst, TEMP vtmp);
 8694   ins_encode %{
 8695     int vlen_enc = Assembler::AVX_128bit;
 8696     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8697     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8698     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8699     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8700     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8701     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8702     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8703   %}
 8704   ins_pipe( pipe_slow );
 8705 %}
 8706 
 8707 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8708   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8709   match(Set dst (VectorStoreMask src size));
 8710   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8711   ins_encode %{
 8712     int src_vlen_enc = vector_length_encoding(this, $src);
 8713     int dst_vlen_enc = vector_length_encoding(this);
 8714     if (!VM_Version::supports_avx512vl()) {
 8715       src_vlen_enc = Assembler::AVX_512bit;
 8716     }
 8717     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8718     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8719   %}
 8720   ins_pipe( pipe_slow );
 8721 %}
 8722 
 8723 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8724   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8725   match(Set dst (VectorStoreMask src size));
 8726   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8727   ins_encode %{
 8728     int src_vlen_enc = vector_length_encoding(this, $src);
 8729     int dst_vlen_enc = vector_length_encoding(this);
 8730     if (!VM_Version::supports_avx512vl()) {
 8731       src_vlen_enc = Assembler::AVX_512bit;
 8732     }
 8733     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8734     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8735   %}
 8736   ins_pipe( pipe_slow );
 8737 %}
 8738 
 8739 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8740   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8741   match(Set dst (VectorStoreMask mask size));
 8742   effect(TEMP_DEF dst);
 8743   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8744   ins_encode %{
 8745     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8746     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8747                  false, Assembler::AVX_512bit, noreg);
 8748     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8749   %}
 8750   ins_pipe( pipe_slow );
 8751 %}
 8752 
 8753 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8754   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8755   match(Set dst (VectorStoreMask mask size));
 8756   effect(TEMP_DEF dst);
 8757   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8758   ins_encode %{
 8759     int dst_vlen_enc = vector_length_encoding(this);
 8760     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8761     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8762   %}
 8763   ins_pipe( pipe_slow );
 8764 %}
 8765 
 8766 instruct vmaskcast_evex(kReg dst) %{
 8767   match(Set dst (VectorMaskCast dst));
 8768   ins_cost(0);
 8769   format %{ "vector_mask_cast $dst" %}
 8770   ins_encode %{
 8771     // empty
 8772   %}
 8773   ins_pipe(empty);
 8774 %}
 8775 
 8776 instruct vmaskcast(vec dst) %{
 8777   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8778   match(Set dst (VectorMaskCast dst));
 8779   ins_cost(0);
 8780   format %{ "vector_mask_cast $dst" %}
 8781   ins_encode %{
 8782     // empty
 8783   %}
 8784   ins_pipe(empty);
 8785 %}
 8786 
 8787 instruct vmaskcast_avx(vec dst, vec src) %{
 8788   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8789   match(Set dst (VectorMaskCast src));
 8790   format %{ "vector_mask_cast $dst, $src" %}
 8791   ins_encode %{
 8792     int vlen = Matcher::vector_length(this);
 8793     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8794     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8795     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8796   %}
 8797   ins_pipe(pipe_slow);
 8798 %}
 8799 
 8800 //-------------------------------- Load Iota Indices ----------------------------------
 8801 
 8802 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8803   match(Set dst (VectorLoadConst src));
 8804   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8805   ins_encode %{
 8806      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8807      BasicType bt = Matcher::vector_element_basic_type(this);
 8808      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8809   %}
 8810   ins_pipe( pipe_slow );
 8811 %}
 8812 
 8813 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8814   match(Set dst (PopulateIndex src1 src2));
 8815   effect(TEMP dst, TEMP vtmp);
 8816   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8817   ins_encode %{
 8818      assert($src2$$constant == 1, "required");
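     // Broadcast the start value, load the iota constant {0, 1, 2, ...} and add,
     // so lane i of $dst becomes $src1 + i.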
 8819      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8820      int vlen_enc = vector_length_encoding(this);
 8821      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8822      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8823      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8824      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8825   %}
 8826   ins_pipe( pipe_slow );
 8827 %}
 8828 
 8829 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8830   match(Set dst (PopulateIndex src1 src2));
 8831   effect(TEMP dst, TEMP vtmp);
 8832   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8833   ins_encode %{
 8834      assert($src2$$constant == 1, "required");
 8835      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8836      int vlen_enc = vector_length_encoding(this);
 8837      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8838      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8839      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8840      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8841   %}
 8842   ins_pipe( pipe_slow );
 8843 %}
 8844 
 8845 //-------------------------------- Rearrange ----------------------------------
 8846 
 8847 // LoadShuffle/Rearrange for Byte
 8848 instruct rearrangeB(vec dst, vec shuffle) %{
 8849   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8850             Matcher::vector_length(n) < 32);
 8851   match(Set dst (VectorRearrange dst shuffle));
 8852   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8853   ins_encode %{
 8854     assert(UseSSE >= 4, "required");
 8855     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8856   %}
 8857   ins_pipe( pipe_slow );
 8858 %}
 8859 
 8860 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8861   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8862             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8863   match(Set dst (VectorRearrange src shuffle));
 8864   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8865   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8866   ins_encode %{
 8867     assert(UseAVX >= 2, "required");
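    // vpshufb only permutes bytes within each 128-bit lane, so the 256-bit rearrange
    // is built from two shuffles (of the original and of the lane-swapped src) that
    // are then blended together.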
 8868     // Swap src into vtmp1
 8869     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
 8875     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8876     // Perform the blend
 8877     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8878   %}
 8879   ins_pipe( pipe_slow );
 8880 %}
 8881 
 8882 
 8883 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8884   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8885             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8886   match(Set dst (VectorRearrange src shuffle));
 8887   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8889   ins_encode %{
 8890     int vlen_enc = vector_length_encoding(this);
 8891     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8892                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8893                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8894   %}
 8895   ins_pipe( pipe_slow );
 8896 %}
 8897 
 8898 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8899   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8900             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8901   match(Set dst (VectorRearrange src shuffle));
 8902   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8903   ins_encode %{
 8904     int vlen_enc = vector_length_encoding(this);
 8905     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8906   %}
 8907   ins_pipe( pipe_slow );
 8908 %}
 8909 
 8910 // LoadShuffle/Rearrange for Short
 8911 
 8912 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8913   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8914             !VM_Version::supports_avx512bw());
 8915   match(Set dst (VectorLoadShuffle src));
 8916   effect(TEMP dst, TEMP vtmp);
 8917   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8918   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
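    // Each short index s expands to the byte pair (2*s, 2*s + 1); e.g. a shuffle
    // index of 3 becomes the byte indices 6 and 7 for that element's two bytes.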
 8921     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8922     if (UseAVX == 0) {
 8923       assert(vlen_in_bytes <= 16, "required");
 8924       // Multiply each shuffle by two to get byte index
 8925       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8926       __ psllw($vtmp$$XMMRegister, 1);
 8927 
 8928       // Duplicate to create 2 copies of byte index
 8929       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8930       __ psllw($dst$$XMMRegister, 8);
 8931       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8932 
 8933       // Add one to get alternate byte index
 8934       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8935       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8936     } else {
 8937       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8938       int vlen_enc = vector_length_encoding(this);
 8939       // Multiply each shuffle by two to get byte index
 8940       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8941 
 8942       // Duplicate to create 2 copies of byte index
 8943       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8944       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8945 
 8946       // Add one to get alternate byte index
 8947       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8948     }
 8949   %}
 8950   ins_pipe( pipe_slow );
 8951 %}
 8952 
 8953 instruct rearrangeS(vec dst, vec shuffle) %{
 8954   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8955             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8956   match(Set dst (VectorRearrange dst shuffle));
 8957   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8958   ins_encode %{
 8959     assert(UseSSE >= 4, "required");
 8960     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8961   %}
 8962   ins_pipe( pipe_slow );
 8963 %}
 8964 
 8965 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8966   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8967             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8968   match(Set dst (VectorRearrange src shuffle));
 8969   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8970   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8971   ins_encode %{
 8972     assert(UseAVX >= 2, "required");
 8973     // Swap src into vtmp1
 8974     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
 8980     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8981     // Perform the blend
 8982     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8983   %}
 8984   ins_pipe( pipe_slow );
 8985 %}
 8986 
 8987 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8988   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8989             VM_Version::supports_avx512bw());
 8990   match(Set dst (VectorRearrange src shuffle));
 8991   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8992   ins_encode %{
 8993     int vlen_enc = vector_length_encoding(this);
 8994     if (!VM_Version::supports_avx512vl()) {
 8995       vlen_enc = Assembler::AVX_512bit;
 8996     }
 8997     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8998   %}
 8999   ins_pipe( pipe_slow );
 9000 %}
 9001 
 9002 // LoadShuffle/Rearrange for Integer and Float
 9003 
 9004 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 9005   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9006             Matcher::vector_length(n) == 4 && UseAVX == 0);
 9007   match(Set dst (VectorLoadShuffle src));
 9008   effect(TEMP dst, TEMP vtmp);
 9009   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9010   ins_encode %{
 9011     assert(UseSSE >= 4, "required");
 9012 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
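    // Each int index i expands to the four byte indices 4*i .. 4*i + 3;
    // e.g. a shuffle index of 2 becomes the byte indices 8, 9, 10 and 11.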
 9015 
 9016     // Duplicate and multiply each shuffle by 4
 9017     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 9018     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9019     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9020     __ psllw($vtmp$$XMMRegister, 2);
 9021 
 9022     // Duplicate again to create 4 copies of byte index
 9023     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 9024     __ psllw($dst$$XMMRegister, 8);
 9025     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 9026 
 9027     // Add 3,2,1,0 to get alternate byte index
 9028     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 9029     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 9030   %}
 9031   ins_pipe( pipe_slow );
 9032 %}
 9033 
 9034 instruct rearrangeI(vec dst, vec shuffle) %{
 9035   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9036             UseAVX == 0);
 9037   match(Set dst (VectorRearrange dst shuffle));
 9038   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 9039   ins_encode %{
 9040     assert(UseSSE >= 4, "required");
 9041     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 9042   %}
 9043   ins_pipe( pipe_slow );
 9044 %}
 9045 
 9046 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 9047   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9048             UseAVX > 0);
 9049   match(Set dst (VectorRearrange src shuffle));
 9050   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9051   ins_encode %{
 9052     int vlen_enc = vector_length_encoding(this);
 9053     BasicType bt = Matcher::vector_element_basic_type(this);
 9054     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9055   %}
 9056   ins_pipe( pipe_slow );
 9057 %}
 9058 
 9059 // LoadShuffle/Rearrange for Long and Double
 9060 
 9061 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 9062   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9063             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9064   match(Set dst (VectorLoadShuffle src));
 9065   effect(TEMP dst, TEMP vtmp);
 9066   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9067   ins_encode %{
 9068     assert(UseAVX >= 2, "required");
 9069 
 9070     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
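    // Each long index j expands to the double word pair (2*j, 2*j + 1);
    // e.g. a shuffle index of 1 becomes the double word indices 2 and 3.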
 9073 
 9074     // Multiply each shuffle by two to get double word index
 9075     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 9076 
 9077     // Duplicate each double word shuffle
 9078     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9079     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9080 
 9081     // Add one to get alternate double word index
 9082     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9083   %}
 9084   ins_pipe( pipe_slow );
 9085 %}
 9086 
 9087 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9088   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9089             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9090   match(Set dst (VectorRearrange src shuffle));
 9091   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9092   ins_encode %{
 9093     assert(UseAVX >= 2, "required");
 9094 
 9095     int vlen_enc = vector_length_encoding(this);
 9096     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9097   %}
 9098   ins_pipe( pipe_slow );
 9099 %}
 9100 
 9101 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9102   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9103             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9104   match(Set dst (VectorRearrange src shuffle));
 9105   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9106   ins_encode %{
 9107     assert(UseAVX > 2, "required");
 9108 
 9109     int vlen_enc = vector_length_encoding(this);
 9110     if (vlen_enc == Assembler::AVX_128bit) {
 9111       vlen_enc = Assembler::AVX_256bit;
 9112     }
 9113     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9114   %}
 9115   ins_pipe( pipe_slow );
 9116 %}
 9117 
 9118 // --------------------------------- FMA --------------------------------------
 9119 // a * b + c
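// The accumulator c is both an input and the result (Set c (FmaV* c (Binary a b))),
// matching the destructive x86 FMA forms where one source register is overwritten.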
 9120 
 9121 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9122   match(Set c (FmaVF  c (Binary a b)));
 9123   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9124   ins_cost(150);
 9125   ins_encode %{
 9126     assert(UseFMA, "not enabled");
 9127     int vlen_enc = vector_length_encoding(this);
 9128     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9129   %}
 9130   ins_pipe( pipe_slow );
 9131 %}
 9132 
 9133 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9134   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9135   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9136   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9137   ins_cost(150);
 9138   ins_encode %{
 9139     assert(UseFMA, "not enabled");
 9140     int vlen_enc = vector_length_encoding(this);
 9141     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9142   %}
 9143   ins_pipe( pipe_slow );
 9144 %}
 9145 
 9146 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9147   match(Set c (FmaVD  c (Binary a b)));
 9148   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9149   ins_cost(150);
 9150   ins_encode %{
 9151     assert(UseFMA, "not enabled");
 9152     int vlen_enc = vector_length_encoding(this);
 9153     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9154   %}
 9155   ins_pipe( pipe_slow );
 9156 %}
 9157 
 9158 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9159   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9160   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9161   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9162   ins_cost(150);
 9163   ins_encode %{
 9164     assert(UseFMA, "not enabled");
 9165     int vlen_enc = vector_length_encoding(this);
 9166     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9167   %}
 9168   ins_pipe( pipe_slow );
 9169 %}
 9170 
 9171 // --------------------------------- Vector Multiply Add --------------------------------------
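// MulAddVS2VI maps to pmaddwd: each 32-bit result lane is the sum of two adjacent
// signed 16-bit products, dst[i] = src1[2*i]*src2[2*i] + src1[2*i+1]*src2[2*i+1].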
 9172 
 9173 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9174   predicate(UseAVX == 0);
 9175   match(Set dst (MulAddVS2VI dst src1));
 9176   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9177   ins_encode %{
 9178     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9179   %}
 9180   ins_pipe( pipe_slow );
 9181 %}
 9182 
 9183 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9184   predicate(UseAVX > 0);
 9185   match(Set dst (MulAddVS2VI src1 src2));
 9186   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9187   ins_encode %{
 9188     int vlen_enc = vector_length_encoding(this);
 9189     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9190   %}
 9191   ins_pipe( pipe_slow );
 9192 %}
 9193 
 9194 // --------------------------------- Vector Multiply Add Add ----------------------------------
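// With AVX512_VNNI the multiply-add and the following vector add fuse into a single
// evpdpwssd, which accumulates the 16-bit pair products directly into dst.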
 9195 
 9196 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9197   predicate(VM_Version::supports_avx512_vnni());
 9198   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9199   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9200   ins_encode %{
 9201     assert(UseAVX > 2, "required");
 9202     int vlen_enc = vector_length_encoding(this);
 9203     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9204   %}
 9205   ins_pipe( pipe_slow );
 9206   ins_cost(10);
 9207 %}
 9208 
 9209 // --------------------------------- PopCount --------------------------------------
 9210 
 9211 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9212   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9213   match(Set dst (PopCountVI src));
 9214   match(Set dst (PopCountVL src));
 9215   format %{ "vector_popcount_integral $dst, $src" %}
 9216   ins_encode %{
 9217     int opcode = this->ideal_Opcode();
 9218     int vlen_enc = vector_length_encoding(this, $src);
 9219     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9220     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9221   %}
 9222   ins_pipe( pipe_slow );
 9223 %}
 9224 
 9225 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9226   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9227   match(Set dst (PopCountVI src mask));
 9228   match(Set dst (PopCountVL src mask));
 9229   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9230   ins_encode %{
 9231     int vlen_enc = vector_length_encoding(this, $src);
 9232     BasicType bt = Matcher::vector_element_basic_type(this, $src);
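    // Copy $src into $dst first so that lanes cleared in $mask keep their original
    // source values across the merge-masked popcount below.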
 9233     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9234     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9235   %}
 9236   ins_pipe( pipe_slow );
 9237 %}
 9238 
 9239 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9240   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9241   match(Set dst (PopCountVI src));
 9242   match(Set dst (PopCountVL src));
 9243   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9244   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9245   ins_encode %{
 9246     int opcode = this->ideal_Opcode();
 9247     int vlen_enc = vector_length_encoding(this, $src);
 9248     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9249     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9250                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9251   %}
 9252   ins_pipe( pipe_slow );
 9253 %}
 9254 
 9255 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9256 
 9257 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9258   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9259                                               Matcher::vector_length_in_bytes(n->in(1))));
 9260   match(Set dst (CountTrailingZerosV src));
 9261   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9262   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9264   ins_encode %{
 9265     int vlen_enc = vector_length_encoding(this, $src);
 9266     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9267     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9268                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9269   %}
 9270   ins_pipe( pipe_slow );
 9271 %}
 9272 
 9273 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9274   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9275             VM_Version::supports_avx512cd() &&
 9276             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9277   match(Set dst (CountTrailingZerosV src));
 9278   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9279   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9281   ins_encode %{
 9282     int vlen_enc = vector_length_encoding(this, $src);
 9283     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9284     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9285                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9286   %}
 9287   ins_pipe( pipe_slow );
 9288 %}
 9289 
 9290 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9291   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9292   match(Set dst (CountTrailingZerosV src));
 9293   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9294   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9296   ins_encode %{
 9297     int vlen_enc = vector_length_encoding(this, $src);
 9298     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9299     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9300                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9301                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9302   %}
 9303   ins_pipe( pipe_slow );
 9304 %}
 9305 
 9306 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9307   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9308   match(Set dst (CountTrailingZerosV src));
 9309   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9310   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9311   ins_encode %{
 9312     int vlen_enc = vector_length_encoding(this, $src);
 9313     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9314     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9315                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9316   %}
 9317   ins_pipe( pipe_slow );
 9318 %}
 9319 
 9320 
 9321 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9322 
 9323 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9324   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9325   effect(TEMP dst);
 9326   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9327   ins_encode %{
 9328     int vector_len = vector_length_encoding(this);
 9329     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9330   %}
 9331   ins_pipe( pipe_slow );
 9332 %}
 9333 
 9334 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9335   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9336   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9337   effect(TEMP dst);
 9338   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9339   ins_encode %{
 9340     int vector_len = vector_length_encoding(this);
 9341     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9342   %}
 9343   ins_pipe( pipe_slow );
 9344 %}
 9345 
 9346 // --------------------------------- Rotation Operations ----------------------------------
 9347 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9348   match(Set dst (RotateLeftV src shift));
 9349   match(Set dst (RotateRightV src shift));
 9350   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9351   ins_encode %{
 9352     int opcode      = this->ideal_Opcode();
 9353     int vector_len  = vector_length_encoding(this);
 9354     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9355     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9356   %}
 9357   ins_pipe( pipe_slow );
 9358 %}
 9359 
instruct vprotate_var(vec dst, vec src, vec shift) %{
 9361   match(Set dst (RotateLeftV src shift));
 9362   match(Set dst (RotateRightV src shift));
 9363   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9364   ins_encode %{
 9365     int opcode      = this->ideal_Opcode();
 9366     int vector_len  = vector_length_encoding(this);
 9367     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9368     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9369   %}
 9370   ins_pipe( pipe_slow );
 9371 %}
 9372 
 9373 // ---------------------------------- Masked Operations ------------------------------------
 9374 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9375   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9376   match(Set dst (LoadVectorMasked mem mask));
 9377   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9378   ins_encode %{
 9379     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9380     int vlen_enc = vector_length_encoding(this);
 9381     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9382   %}
 9383   ins_pipe( pipe_slow );
 9384 %}
 9385 
 9386 
 9387 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9388   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9389   match(Set dst (LoadVectorMasked mem mask));
 9390   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9391   ins_encode %{
 9392     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9393     int vector_len = vector_length_encoding(this);
 9394     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9395   %}
 9396   ins_pipe( pipe_slow );
 9397 %}
 9398 
 9399 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9400   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9401   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9402   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9403   ins_encode %{
 9404     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9405     int vlen_enc = vector_length_encoding(src_node);
 9406     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9407     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9408   %}
 9409   ins_pipe( pipe_slow );
 9410 %}
 9411 
 9412 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9413   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9414   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9415   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9416   ins_encode %{
 9417     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9418     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9419     int vlen_enc = vector_length_encoding(src_node);
 9420     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9421   %}
 9422   ins_pipe( pipe_slow );
 9423 %}
 9424 
 9425 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9426   match(Set addr (VerifyVectorAlignment addr mask));
 9427   effect(KILL cr);
 9428   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9429   ins_encode %{
 9430     Label Lskip;
 9431     // check if masked bits of addr are zero
 9432     __ testq($addr$$Register, $mask$$constant);
 9433     __ jccb(Assembler::equal, Lskip);
 9434     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9435     __ bind(Lskip);
 9436   %}
 9437   ins_pipe(pipe_slow);
 9438 %}
 9439 
 9440 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9441   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9442   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9443   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9444   ins_encode %{
 9445     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9446     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9447 
 9448     Label DONE;
 9449     int vlen_enc = vector_length_encoding(this, $src1);
 9450     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
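    // $ktmp1 collects the lanes under $mask whose elements compare equal. If
    // $ktmp1 | ~$mask covers every lane, the masked ranges match and $dst stays -1;
    // otherwise the lowest clear bit of $ktmp1 gives the returned index.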
 9451 
 9452     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9453     __ mov64($dst$$Register, -1L);
 9454     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9455     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9456     __ jccb(Assembler::carrySet, DONE);
 9457     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9458     __ notq($dst$$Register);
 9459     __ tzcntq($dst$$Register, $dst$$Register);
 9460     __ bind(DONE);
 9461   %}
 9462   ins_pipe( pipe_slow );
 9463 %}
 9464 
 9465 
 9466 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9467   match(Set dst (VectorMaskGen len));
 9468   effect(TEMP temp, KILL cr);
 9469   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9470   ins_encode %{
 9471     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9472   %}
 9473   ins_pipe( pipe_slow );
 9474 %}
 9475 
 9476 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9477   match(Set dst (VectorMaskGen len));
 9478   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9479   effect(TEMP temp);
 9480   ins_encode %{
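    // Build a mask with the low $len bits set: shifting all-ones right by (64 - len)
    // leaves exactly len set bits, e.g. len == 5 yields 0x1F.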
 9481     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 -$len$$constant)));
 9482     __ kmovql($dst$$KRegister, $temp$$Register);
 9483   %}
 9484   ins_pipe( pipe_slow );
 9485 %}
 9486 
 9487 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9488   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9489   match(Set dst (VectorMaskToLong mask));
 9490   effect(TEMP dst, KILL cr);
 9491   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9492   ins_encode %{
 9493     int opcode = this->ideal_Opcode();
 9494     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9495     int mask_len = Matcher::vector_length(this, $mask);
 9496     int mask_size = mask_len * type2aelembytes(mbt);
 9497     int vlen_enc = vector_length_encoding(this, $mask);
 9498     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9499                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9500   %}
 9501   ins_pipe( pipe_slow );
 9502 %}
 9503 
 9504 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9505   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9506   match(Set dst (VectorMaskToLong mask));
 9507   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9508   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9509   ins_encode %{
 9510     int opcode = this->ideal_Opcode();
 9511     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9512     int mask_len = Matcher::vector_length(this, $mask);
 9513     int vlen_enc = vector_length_encoding(this, $mask);
 9514     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9515                              $dst$$Register, mask_len, mbt, vlen_enc);
 9516   %}
 9517   ins_pipe( pipe_slow );
 9518 %}
 9519 
 9520 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9521   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9522   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9523   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9524   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9525   ins_encode %{
 9526     int opcode = this->ideal_Opcode();
 9527     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9528     int mask_len = Matcher::vector_length(this, $mask);
 9529     int vlen_enc = vector_length_encoding(this, $mask);
 9530     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9531                              $dst$$Register, mask_len, mbt, vlen_enc);
 9532   %}
 9533   ins_pipe( pipe_slow );
 9534 %}
 9535 
 9536 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9537   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9538   match(Set dst (VectorMaskTrueCount mask));
 9539   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9540   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9541   ins_encode %{
 9542     int opcode = this->ideal_Opcode();
 9543     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9544     int mask_len = Matcher::vector_length(this, $mask);
 9545     int mask_size = mask_len * type2aelembytes(mbt);
 9546     int vlen_enc = vector_length_encoding(this, $mask);
 9547     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9548                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9549   %}
 9550   ins_pipe( pipe_slow );
 9551 %}
 9552 
 9553 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9554   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9555   match(Set dst (VectorMaskTrueCount mask));
 9556   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9557   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9558   ins_encode %{
 9559     int opcode = this->ideal_Opcode();
 9560     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9561     int mask_len = Matcher::vector_length(this, $mask);
 9562     int vlen_enc = vector_length_encoding(this, $mask);
 9563     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9564                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9565   %}
 9566   ins_pipe( pipe_slow );
 9567 %}
 9568 
 9569 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9570   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9571   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9572   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9573   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9574   ins_encode %{
 9575     int opcode = this->ideal_Opcode();
 9576     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9577     int mask_len = Matcher::vector_length(this, $mask);
 9578     int vlen_enc = vector_length_encoding(this, $mask);
 9579     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9580                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9581   %}
 9582   ins_pipe( pipe_slow );
 9583 %}
 9584 
 9585 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9586   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9587   match(Set dst (VectorMaskFirstTrue mask));
 9588   match(Set dst (VectorMaskLastTrue mask));
 9589   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9590   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9591   ins_encode %{
 9592     int opcode = this->ideal_Opcode();
 9593     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9594     int mask_len = Matcher::vector_length(this, $mask);
 9595     int mask_size = mask_len * type2aelembytes(mbt);
 9596     int vlen_enc = vector_length_encoding(this, $mask);
 9597     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9598                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9599   %}
 9600   ins_pipe( pipe_slow );
 9601 %}
 9602 
 9603 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9604   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9605   match(Set dst (VectorMaskFirstTrue mask));
 9606   match(Set dst (VectorMaskLastTrue mask));
 9607   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9608   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9609   ins_encode %{
 9610     int opcode = this->ideal_Opcode();
 9611     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9612     int mask_len = Matcher::vector_length(this, $mask);
 9613     int vlen_enc = vector_length_encoding(this, $mask);
 9614     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9615                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9616   %}
 9617   ins_pipe( pipe_slow );
 9618 %}
 9619 
 9620 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9621   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9622   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9623   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9624   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9625   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9626   ins_encode %{
 9627     int opcode = this->ideal_Opcode();
 9628     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9629     int mask_len = Matcher::vector_length(this, $mask);
 9630     int vlen_enc = vector_length_encoding(this, $mask);
 9631     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9632                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9633   %}
 9634   ins_pipe( pipe_slow );
 9635 %}
 9636 
 9637 // --------------------------------- Compress/Expand Operations ---------------------------
 9638 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9639   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9640   match(Set dst (CompressV src mask));
 9641   match(Set dst (ExpandV src mask));
 9642   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
  format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9644   ins_encode %{
 9645     int opcode = this->ideal_Opcode();
 9646     int vlen_enc = vector_length_encoding(this);
 9647     BasicType bt  = Matcher::vector_element_basic_type(this);
 9648     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9649                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9650   %}
 9651   ins_pipe( pipe_slow );
 9652 %}
 9653 
 9654 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9655   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9656   match(Set dst (CompressV src mask));
 9657   match(Set dst (ExpandV src mask));
 9658   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9659   ins_encode %{
 9660     int opcode = this->ideal_Opcode();
 9661     int vector_len = vector_length_encoding(this);
 9662     BasicType bt  = Matcher::vector_element_basic_type(this);
 9663     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9664   %}
 9665   ins_pipe( pipe_slow );
 9666 %}
 9667 
 9668 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9669   match(Set dst (CompressM mask));
 9670   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9671   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9672   ins_encode %{
 9673     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9674     int mask_len = Matcher::vector_length(this);
 9675     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9676   %}
 9677   ins_pipe( pipe_slow );
 9678 %}
 9679 
 9680 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9681 
 9682 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9683   predicate(!VM_Version::supports_gfni());
 9684   match(Set dst (ReverseV src));
 9685   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9687   ins_encode %{
 9688     int vec_enc = vector_length_encoding(this);
 9689     BasicType bt = Matcher::vector_element_basic_type(this);
 9690     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9691                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9692   %}
 9693   ins_pipe( pipe_slow );
 9694 %}
 9695 
 9696 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9697   predicate(VM_Version::supports_gfni());
 9698   match(Set dst (ReverseV src));
 9699   effect(TEMP dst, TEMP xtmp);
  format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9701   ins_encode %{
 9702     int vec_enc = vector_length_encoding(this);
 9703     BasicType bt  = Matcher::vector_element_basic_type(this);
 9704     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9705     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9706                                $xtmp$$XMMRegister);
 9707   %}
 9708   ins_pipe( pipe_slow );
 9709 %}
 9710 
 9711 instruct vreverse_byte_reg(vec dst, vec src) %{
 9712   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9713   match(Set dst (ReverseBytesV src));
 9714   effect(TEMP dst);
 9715   format %{ "vector_reverse_byte $dst, $src" %}
 9716   ins_encode %{
 9717     int vec_enc = vector_length_encoding(this);
 9718     BasicType bt = Matcher::vector_element_basic_type(this);
 9719     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9720   %}
 9721   ins_pipe( pipe_slow );
 9722 %}
 9723 
 9724 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9725   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9726   match(Set dst (ReverseBytesV src));
 9727   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9729   ins_encode %{
 9730     int vec_enc = vector_length_encoding(this);
 9731     BasicType bt = Matcher::vector_element_basic_type(this);
 9732     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9733                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9734   %}
 9735   ins_pipe( pipe_slow );
 9736 %}
 9737 
 9738 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
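// CountLeadingZerosV. For int/long elements on AVX512CD targets the EVEX
// VPLZCNT instructions are used directly; subword elements and pre-AVX512VL
// targets are emulated in the macro assembler using temporary registers.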
 9739 
 9740 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9741   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9742                                               Matcher::vector_length_in_bytes(n->in(1))));
 9743   match(Set dst (CountLeadingZerosV src));
 9744   format %{ "vector_count_leading_zeros $dst, $src" %}
 9745   ins_encode %{
 9746      int vlen_enc = vector_length_encoding(this, $src);
 9747      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9748      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9749                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9750   %}
 9751   ins_pipe( pipe_slow );
 9752 %}
 9753 
 9754 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9755   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9756                                               Matcher::vector_length_in_bytes(n->in(1))));
 9757   match(Set dst (CountLeadingZerosV src mask));
 9758   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9759   ins_encode %{
 9760     int vlen_enc = vector_length_encoding(this, $src);
 9761     BasicType bt = Matcher::vector_element_basic_type(this, $src);
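    // Preload dst with src so that, under merge masking, lanes whose mask bit
    // is clear keep the corresponding source element.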
 9762     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9763     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9764                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9765   %}
 9766   ins_pipe( pipe_slow );
 9767 %}
 9768 
 9769 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9770   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9771             VM_Version::supports_avx512cd() &&
 9772             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9773   match(Set dst (CountLeadingZerosV src));
 9774   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9776   ins_encode %{
 9777     int vlen_enc = vector_length_encoding(this, $src);
 9778     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9779     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9780                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9781   %}
 9782   ins_pipe( pipe_slow );
 9783 %}
 9784 
 9785 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9786   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9787   match(Set dst (CountLeadingZerosV src));
 9788   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9790   ins_encode %{
 9791     int vlen_enc = vector_length_encoding(this, $src);
 9792     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9793     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9794                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9795                                        $rtmp$$Register, true, vlen_enc);
 9796   %}
 9797   ins_pipe( pipe_slow );
 9798 %}
 9799 
 9800 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9801   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9802             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9803   match(Set dst (CountLeadingZerosV src));
 9804   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9805   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9806   ins_encode %{
 9807     int vlen_enc = vector_length_encoding(this, $src);
 9808     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9809     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9810                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9811   %}
 9812   ins_pipe( pipe_slow );
 9813 %}
 9814 
 9815 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9816   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9817             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9818   match(Set dst (CountLeadingZerosV src));
 9819   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9820   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9821   ins_encode %{
 9822     int vlen_enc = vector_length_encoding(this, $src);
 9823     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9824     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9825                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9826   %}
 9827   ins_pipe( pipe_slow );
 9828 %}
 9829 
 9830 // ---------------------------------- Vector Masked Operations ------------------------------------
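// The patterns below implement predicated (write-masked) vector operations for
// AVX512 targets. The kReg operand supplies the opmask; the boolean passed to
// evmasked_op is the merge flag, so with merge masking destination lanes whose
// mask bit is clear are left unchanged.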
 9831 
 9832 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9833   match(Set dst (AddVB (Binary dst src2) mask));
 9834   match(Set dst (AddVS (Binary dst src2) mask));
 9835   match(Set dst (AddVI (Binary dst src2) mask));
 9836   match(Set dst (AddVL (Binary dst src2) mask));
 9837   match(Set dst (AddVF (Binary dst src2) mask));
 9838   match(Set dst (AddVD (Binary dst src2) mask));
 9839   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9840   ins_encode %{
 9841     int vlen_enc = vector_length_encoding(this);
 9842     BasicType bt = Matcher::vector_element_basic_type(this);
 9843     int opc = this->ideal_Opcode();
 9844     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9845                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9846   %}
 9847   ins_pipe( pipe_slow );
 9848 %}
 9849 
 9850 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9851   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9852   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9853   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9854   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9855   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9856   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9857   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9858   ins_encode %{
 9859     int vlen_enc = vector_length_encoding(this);
 9860     BasicType bt = Matcher::vector_element_basic_type(this);
 9861     int opc = this->ideal_Opcode();
 9862     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9863                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9864   %}
 9865   ins_pipe( pipe_slow );
 9866 %}
 9867 
 9868 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9869   match(Set dst (XorV (Binary dst src2) mask));
 9870   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9871   ins_encode %{
 9872     int vlen_enc = vector_length_encoding(this);
 9873     BasicType bt = Matcher::vector_element_basic_type(this);
 9874     int opc = this->ideal_Opcode();
 9875     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9876                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9877   %}
 9878   ins_pipe( pipe_slow );
 9879 %}
 9880 
 9881 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9882   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9883   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9884   ins_encode %{
 9885     int vlen_enc = vector_length_encoding(this);
 9886     BasicType bt = Matcher::vector_element_basic_type(this);
 9887     int opc = this->ideal_Opcode();
 9888     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9889                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9890   %}
 9891   ins_pipe( pipe_slow );
 9892 %}
 9893 
 9894 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9895   match(Set dst (OrV (Binary dst src2) mask));
 9896   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9897   ins_encode %{
 9898     int vlen_enc = vector_length_encoding(this);
 9899     BasicType bt = Matcher::vector_element_basic_type(this);
 9900     int opc = this->ideal_Opcode();
 9901     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9902                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9903   %}
 9904   ins_pipe( pipe_slow );
 9905 %}
 9906 
 9907 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9908   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9909   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9910   ins_encode %{
 9911     int vlen_enc = vector_length_encoding(this);
 9912     BasicType bt = Matcher::vector_element_basic_type(this);
 9913     int opc = this->ideal_Opcode();
 9914     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9915                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9916   %}
 9917   ins_pipe( pipe_slow );
 9918 %}
 9919 
 9920 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9921   match(Set dst (AndV (Binary dst src2) mask));
 9922   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9923   ins_encode %{
 9924     int vlen_enc = vector_length_encoding(this);
 9925     BasicType bt = Matcher::vector_element_basic_type(this);
 9926     int opc = this->ideal_Opcode();
 9927     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9928                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9929   %}
 9930   ins_pipe( pipe_slow );
 9931 %}
 9932 
 9933 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9934   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9935   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9936   ins_encode %{
 9937     int vlen_enc = vector_length_encoding(this);
 9938     BasicType bt = Matcher::vector_element_basic_type(this);
 9939     int opc = this->ideal_Opcode();
 9940     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9941                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9942   %}
 9943   ins_pipe( pipe_slow );
 9944 %}
 9945 
 9946 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9947   match(Set dst (SubVB (Binary dst src2) mask));
 9948   match(Set dst (SubVS (Binary dst src2) mask));
 9949   match(Set dst (SubVI (Binary dst src2) mask));
 9950   match(Set dst (SubVL (Binary dst src2) mask));
 9951   match(Set dst (SubVF (Binary dst src2) mask));
 9952   match(Set dst (SubVD (Binary dst src2) mask));
 9953   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9954   ins_encode %{
 9955     int vlen_enc = vector_length_encoding(this);
 9956     BasicType bt = Matcher::vector_element_basic_type(this);
 9957     int opc = this->ideal_Opcode();
 9958     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9959                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9960   %}
 9961   ins_pipe( pipe_slow );
 9962 %}
 9963 
 9964 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9965   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9966   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9967   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9968   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9969   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9970   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9971   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9972   ins_encode %{
 9973     int vlen_enc = vector_length_encoding(this);
 9974     BasicType bt = Matcher::vector_element_basic_type(this);
 9975     int opc = this->ideal_Opcode();
 9976     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9977                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9978   %}
 9979   ins_pipe( pipe_slow );
 9980 %}
 9981 
 9982 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9983   match(Set dst (MulVS (Binary dst src2) mask));
 9984   match(Set dst (MulVI (Binary dst src2) mask));
 9985   match(Set dst (MulVL (Binary dst src2) mask));
 9986   match(Set dst (MulVF (Binary dst src2) mask));
 9987   match(Set dst (MulVD (Binary dst src2) mask));
 9988   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9989   ins_encode %{
 9990     int vlen_enc = vector_length_encoding(this);
 9991     BasicType bt = Matcher::vector_element_basic_type(this);
 9992     int opc = this->ideal_Opcode();
 9993     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9994                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9995   %}
 9996   ins_pipe( pipe_slow );
 9997 %}
 9998 
 9999 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
10000   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
10001   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
10002   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
10003   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
10004   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
10005   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
10006   ins_encode %{
10007     int vlen_enc = vector_length_encoding(this);
10008     BasicType bt = Matcher::vector_element_basic_type(this);
10009     int opc = this->ideal_Opcode();
10010     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10011                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10012   %}
10013   ins_pipe( pipe_slow );
10014 %}
10015 
10016 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
10017   match(Set dst (SqrtVF dst mask));
10018   match(Set dst (SqrtVD dst mask));
10019   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
10020   ins_encode %{
10021     int vlen_enc = vector_length_encoding(this);
10022     BasicType bt = Matcher::vector_element_basic_type(this);
10023     int opc = this->ideal_Opcode();
10024     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10025                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10026   %}
10027   ins_pipe( pipe_slow );
10028 %}
10029 
10030 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
10031   match(Set dst (DivVF (Binary dst src2) mask));
10032   match(Set dst (DivVD (Binary dst src2) mask));
10033   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10034   ins_encode %{
10035     int vlen_enc = vector_length_encoding(this);
10036     BasicType bt = Matcher::vector_element_basic_type(this);
10037     int opc = this->ideal_Opcode();
10038     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10039                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10040   %}
10041   ins_pipe( pipe_slow );
10042 %}
10043 
10044 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
10045   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
10046   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
10047   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10048   ins_encode %{
10049     int vlen_enc = vector_length_encoding(this);
10050     BasicType bt = Matcher::vector_element_basic_type(this);
10051     int opc = this->ideal_Opcode();
10052     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10053                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10054   %}
10055   ins_pipe( pipe_slow );
10056 %}
10057 
10058 
10059 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
10060   match(Set dst (RotateLeftV (Binary dst shift) mask));
10061   match(Set dst (RotateRightV (Binary dst shift) mask));
10062   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
10063   ins_encode %{
10064     int vlen_enc = vector_length_encoding(this);
10065     BasicType bt = Matcher::vector_element_basic_type(this);
10066     int opc = this->ideal_Opcode();
10067     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10068                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10069   %}
10070   ins_pipe( pipe_slow );
10071 %}
10072 
10073 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
10074   match(Set dst (RotateLeftV (Binary dst src2) mask));
10075   match(Set dst (RotateRightV (Binary dst src2) mask));
10076   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
10077   ins_encode %{
10078     int vlen_enc = vector_length_encoding(this);
10079     BasicType bt = Matcher::vector_element_basic_type(this);
10080     int opc = this->ideal_Opcode();
10081     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10082                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10083   %}
10084   ins_pipe( pipe_slow );
10085 %}
10086 
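// Masked shifts come in three flavours: an immediate count (wrapped in
// LShiftCntV/RShiftCntV), a uniform register count (is_var_shift() false), and
// a per-lane variable count (is_var_shift() true). The trailing boolean passed
// to evmasked_op distinguishes the uniform and per-lane register forms.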
10087 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10088   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10089   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10090   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10091   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10092   ins_encode %{
10093     int vlen_enc = vector_length_encoding(this);
10094     BasicType bt = Matcher::vector_element_basic_type(this);
10095     int opc = this->ideal_Opcode();
10096     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10097                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10098   %}
10099   ins_pipe( pipe_slow );
10100 %}
10101 
10102 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10103   predicate(!n->as_ShiftV()->is_var_shift());
10104   match(Set dst (LShiftVS (Binary dst src2) mask));
10105   match(Set dst (LShiftVI (Binary dst src2) mask));
10106   match(Set dst (LShiftVL (Binary dst src2) mask));
10107   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10108   ins_encode %{
10109     int vlen_enc = vector_length_encoding(this);
10110     BasicType bt = Matcher::vector_element_basic_type(this);
10111     int opc = this->ideal_Opcode();
10112     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10113                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10114   %}
10115   ins_pipe( pipe_slow );
10116 %}
10117 
10118 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10119   predicate(n->as_ShiftV()->is_var_shift());
10120   match(Set dst (LShiftVS (Binary dst src2) mask));
10121   match(Set dst (LShiftVI (Binary dst src2) mask));
10122   match(Set dst (LShiftVL (Binary dst src2) mask));
10123   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10124   ins_encode %{
10125     int vlen_enc = vector_length_encoding(this);
10126     BasicType bt = Matcher::vector_element_basic_type(this);
10127     int opc = this->ideal_Opcode();
10128     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10129                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10130   %}
10131   ins_pipe( pipe_slow );
10132 %}
10133 
10134 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10135   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10136   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10137   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10138   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10139   ins_encode %{
10140     int vlen_enc = vector_length_encoding(this);
10141     BasicType bt = Matcher::vector_element_basic_type(this);
10142     int opc = this->ideal_Opcode();
10143     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10144                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10145   %}
10146   ins_pipe( pipe_slow );
10147 %}
10148 
10149 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10150   predicate(!n->as_ShiftV()->is_var_shift());
10151   match(Set dst (RShiftVS (Binary dst src2) mask));
10152   match(Set dst (RShiftVI (Binary dst src2) mask));
10153   match(Set dst (RShiftVL (Binary dst src2) mask));
10154   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10155   ins_encode %{
10156     int vlen_enc = vector_length_encoding(this);
10157     BasicType bt = Matcher::vector_element_basic_type(this);
10158     int opc = this->ideal_Opcode();
10159     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10160                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10161   %}
10162   ins_pipe( pipe_slow );
10163 %}
10164 
10165 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10166   predicate(n->as_ShiftV()->is_var_shift());
10167   match(Set dst (RShiftVS (Binary dst src2) mask));
10168   match(Set dst (RShiftVI (Binary dst src2) mask));
10169   match(Set dst (RShiftVL (Binary dst src2) mask));
10170   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10171   ins_encode %{
10172     int vlen_enc = vector_length_encoding(this);
10173     BasicType bt = Matcher::vector_element_basic_type(this);
10174     int opc = this->ideal_Opcode();
10175     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10176                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10177   %}
10178   ins_pipe( pipe_slow );
10179 %}
10180 
10181 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10182   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10183   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10184   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10185   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10186   ins_encode %{
10187     int vlen_enc = vector_length_encoding(this);
10188     BasicType bt = Matcher::vector_element_basic_type(this);
10189     int opc = this->ideal_Opcode();
10190     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10191                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10192   %}
10193   ins_pipe( pipe_slow );
10194 %}
10195 
10196 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10197   predicate(!n->as_ShiftV()->is_var_shift());
10198   match(Set dst (URShiftVS (Binary dst src2) mask));
10199   match(Set dst (URShiftVI (Binary dst src2) mask));
10200   match(Set dst (URShiftVL (Binary dst src2) mask));
10201   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10202   ins_encode %{
10203     int vlen_enc = vector_length_encoding(this);
10204     BasicType bt = Matcher::vector_element_basic_type(this);
10205     int opc = this->ideal_Opcode();
10206     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10207                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10208   %}
10209   ins_pipe( pipe_slow );
10210 %}
10211 
10212 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10213   predicate(n->as_ShiftV()->is_var_shift());
10214   match(Set dst (URShiftVS (Binary dst src2) mask));
10215   match(Set dst (URShiftVI (Binary dst src2) mask));
10216   match(Set dst (URShiftVL (Binary dst src2) mask));
10217   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10218   ins_encode %{
10219     int vlen_enc = vector_length_encoding(this);
10220     BasicType bt = Matcher::vector_element_basic_type(this);
10221     int opc = this->ideal_Opcode();
10222     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10223                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10224   %}
10225   ins_pipe( pipe_slow );
10226 %}
10227 
10228 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10229   match(Set dst (MaxV (Binary dst src2) mask));
10230   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10231   ins_encode %{
10232     int vlen_enc = vector_length_encoding(this);
10233     BasicType bt = Matcher::vector_element_basic_type(this);
10234     int opc = this->ideal_Opcode();
10235     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10236                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10237   %}
10238   ins_pipe( pipe_slow );
10239 %}
10240 
10241 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10242   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10243   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10244   ins_encode %{
10245     int vlen_enc = vector_length_encoding(this);
10246     BasicType bt = Matcher::vector_element_basic_type(this);
10247     int opc = this->ideal_Opcode();
10248     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10249                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10250   %}
10251   ins_pipe( pipe_slow );
10252 %}
10253 
10254 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10255   match(Set dst (MinV (Binary dst src2) mask));
10256   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10257   ins_encode %{
10258     int vlen_enc = vector_length_encoding(this);
10259     BasicType bt = Matcher::vector_element_basic_type(this);
10260     int opc = this->ideal_Opcode();
10261     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10262                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10263   %}
10264   ins_pipe( pipe_slow );
10265 %}
10266 
10267 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10268   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10269   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10270   ins_encode %{
10271     int vlen_enc = vector_length_encoding(this);
10272     BasicType bt = Matcher::vector_element_basic_type(this);
10273     int opc = this->ideal_Opcode();
10274     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10275                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10276   %}
10277   ins_pipe( pipe_slow );
10278 %}
10279 
10280 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10281   match(Set dst (VectorRearrange (Binary dst src2) mask));
10282   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10283   ins_encode %{
10284     int vlen_enc = vector_length_encoding(this);
10285     BasicType bt = Matcher::vector_element_basic_type(this);
10286     int opc = this->ideal_Opcode();
10287     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10288                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10289   %}
10290   ins_pipe( pipe_slow );
10291 %}
10292 
10293 instruct vabs_masked(vec dst, kReg mask) %{
10294   match(Set dst (AbsVB dst mask));
10295   match(Set dst (AbsVS dst mask));
10296   match(Set dst (AbsVI dst mask));
10297   match(Set dst (AbsVL dst mask));
10298   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10299   ins_encode %{
10300     int vlen_enc = vector_length_encoding(this);
10301     BasicType bt = Matcher::vector_element_basic_type(this);
10302     int opc = this->ideal_Opcode();
10303     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10304                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10305   %}
10306   ins_pipe( pipe_slow );
10307 %}
10308 
10309 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10310   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10311   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10312   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10313   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
10315     int vlen_enc = vector_length_encoding(this);
10316     BasicType bt = Matcher::vector_element_basic_type(this);
10317     int opc = this->ideal_Opcode();
10318     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10319                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10320   %}
10321   ins_pipe( pipe_slow );
10322 %}
10323 
10324 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10325   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10326   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10327   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10328   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
10330     int vlen_enc = vector_length_encoding(this);
10331     BasicType bt = Matcher::vector_element_basic_type(this);
10332     int opc = this->ideal_Opcode();
10333     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10334                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10335   %}
10336   ins_pipe( pipe_slow );
10337 %}
10338 
10339 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10340   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10341   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10342   ins_encode %{
10343     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10344     int vlen_enc = vector_length_encoding(this, $src1);
10345     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10346 
    // Dispatch the masked compare on the element type: integral types use the
    // EVEX integer compares (evpcmpb/w/d/q), floating-point types use
    // evcmpps/evcmppd.
10348     switch (src1_elem_bt) {
10349       case T_BYTE: {
10350         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10351         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10352         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10353         break;
10354       }
10355       case T_SHORT: {
10356         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10357         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10358         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10359         break;
10360       }
10361       case T_INT: {
10362         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10363         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10364         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10365         break;
10366       }
10367       case T_LONG: {
10368         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10369         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10370         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10371         break;
10372       }
10373       case T_FLOAT: {
10374         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10375         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10376         break;
10377       }
10378       case T_DOUBLE: {
10379         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10380         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10381         break;
10382       }
10383       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10384     }
10385   %}
10386   ins_pipe( pipe_slow );
10387 %}
10388 
10389 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10390   predicate(Matcher::vector_length(n) <= 32);
10391   match(Set dst (MaskAll src));
10392   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10393   ins_encode %{
10394     int mask_len = Matcher::vector_length(this);
10395     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10396   %}
10397   ins_pipe( pipe_slow );
10398 %}
10399 
10400 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10401   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10402   match(Set dst (XorVMask src (MaskAll cnt)));
10403   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10405   ins_encode %{
10406     uint masklen = Matcher::vector_length(this);
10407     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10408   %}
10409   ins_pipe( pipe_slow );
10410 %}
10411 
10412 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10413   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10414             (Matcher::vector_length(n) == 16) ||
10415             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10416   match(Set dst (XorVMask src (MaskAll cnt)));
10417   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10418   ins_encode %{
10419     uint masklen = Matcher::vector_length(this);
10420     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10421   %}
10422   ins_pipe( pipe_slow );
10423 %}
10424 
10425 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10426   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10427   match(Set dst (VectorLongToMask src));
10428   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10429   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10430   ins_encode %{
10431     int mask_len = Matcher::vector_length(this);
10432     int vec_enc  = vector_length_encoding(mask_len);
10433     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10434                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10435   %}
10436   ins_pipe( pipe_slow );
10437 %}
10438 
10439 
10440 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10441   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10442   match(Set dst (VectorLongToMask src));
10443   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2 and $xtmp1 as TEMP" %}
10445   ins_encode %{
10446     int mask_len = Matcher::vector_length(this);
10447     assert(mask_len <= 32, "invalid mask length");
10448     int vec_enc  = vector_length_encoding(mask_len);
10449     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10450                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10451   %}
10452   ins_pipe( pipe_slow );
10453 %}
10454 
10455 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10456   predicate(n->bottom_type()->isa_vectmask());
10457   match(Set dst (VectorLongToMask src));
10458   format %{ "long_to_mask_evex $dst, $src\t!" %}
10459   ins_encode %{
10460     __ kmov($dst$$KRegister, $src$$Register);
10461   %}
10462   ins_pipe( pipe_slow );
10463 %}
10464 
10465 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10466   match(Set dst (AndVMask src1 src2));
10467   match(Set dst (OrVMask src1 src2));
10468   match(Set dst (XorVMask src1 src2));
10469   effect(TEMP kscratch);
10470   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10471   ins_encode %{
10472     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10473     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10474     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10475     uint masklen = Matcher::vector_length(this);
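    // Without AVX512DQ the byte-granularity k-register logic instructions are
    // unavailable, so short masks are widened to 16 bits and the word forms
    // are used instead.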
10476     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10477     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10478   %}
10479   ins_pipe( pipe_slow );
10480 %}
10481 
10482 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10483   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10484   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10485   ins_encode %{
10486     int vlen_enc = vector_length_encoding(this);
10487     BasicType bt = Matcher::vector_element_basic_type(this);
10488     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10489                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10490   %}
10491   ins_pipe( pipe_slow );
10492 %}
10493 
10494 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10495   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10496   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10497   ins_encode %{
10498     int vlen_enc = vector_length_encoding(this);
10499     BasicType bt = Matcher::vector_element_basic_type(this);
10500     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10501                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10502   %}
10503   ins_pipe( pipe_slow );
10504 %}
10505 
10506 instruct castMM(kReg dst)
10507 %{
10508   match(Set dst (CastVV dst));
10509 
10510   size(0);
10511   format %{ "# castVV of $dst" %}
10512   ins_encode(/* empty encoding */);
10513   ins_cost(0);
10514   ins_pipe(empty);
10515 %}
10516 
10517 instruct castVV(vec dst)
10518 %{
10519   match(Set dst (CastVV dst));
10520 
10521   size(0);
10522   format %{ "# castVV of $dst" %}
10523   ins_encode(/* empty encoding */);
10524   ins_cost(0);
10525   ins_pipe(empty);
10526 %}
10527 
10528 instruct castVVLeg(legVec dst)
10529 %{
10530   match(Set dst (CastVV dst));
10531 
10532   size(0);
10533   format %{ "# castVV of $dst" %}
10534   ins_encode(/* empty encoding */);
10535   ins_cost(0);
10536   ins_pipe(empty);
10537 %}
10538 
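// IsInfiniteF/IsInfiniteD use VFPCLASSSS/VFPCLASSSD: immediate 0x18 selects the
// +infinity (bit 3) and -infinity (bit 4) classes, so the resulting mask bit is
// set exactly when the operand is infinite.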
10539 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10540 %{
10541   match(Set dst (IsInfiniteF src));
10542   effect(TEMP ktmp, KILL cr);
10543   format %{ "float_class_check $dst, $src" %}
10544   ins_encode %{
10545     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10546     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10547   %}
10548   ins_pipe(pipe_slow);
10549 %}
10550 
10551 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10552 %{
10553   match(Set dst (IsInfiniteD src));
10554   effect(TEMP ktmp, KILL cr);
10555   format %{ "double_class_check $dst, $src" %}
10556   ins_encode %{
10557     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10558     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10559   %}
10560   ins_pipe(pipe_slow);
10561 %}
10562 
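// Saturating vector add/sub. Byte and short elements map directly onto the
// packed saturating instructions; int and long elements have no hardware
// saturating forms, so the *_dq_* helpers below emulate saturation by
// detecting overflow and selecting the clamped result.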
10563 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10564 %{
10565   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10566             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10567   match(Set dst (SaturatingAddV src1 src2));
10568   match(Set dst (SaturatingSubV src1 src2));
10569   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10570   ins_encode %{
10571     int vlen_enc = vector_length_encoding(this);
10572     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10573     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10574                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10575   %}
10576   ins_pipe(pipe_slow);
10577 %}
10578 
10579 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10580 %{
10581   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10582             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10583   match(Set dst (SaturatingAddV src1 src2));
10584   match(Set dst (SaturatingSubV src1 src2));
10585   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10586   ins_encode %{
10587     int vlen_enc = vector_length_encoding(this);
10588     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10589     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10590                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10591   %}
10592   ins_pipe(pipe_slow);
10593 %}
10594 
10595 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10596 %{
10597   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10598             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10599             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10600   match(Set dst (SaturatingAddV src1 src2));
10601   match(Set dst (SaturatingSubV src1 src2));
10602   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10603   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10604   ins_encode %{
10605     int vlen_enc = vector_length_encoding(this);
10606     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10607     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10608                                         $src1$$XMMRegister, $src2$$XMMRegister,
10609                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10610                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10611   %}
10612   ins_pipe(pipe_slow);
10613 %}
10614 
10615 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10616 %{
10617   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10618             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10619             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10620   match(Set dst (SaturatingAddV src1 src2));
10621   match(Set dst (SaturatingSubV src1 src2));
10622   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10623   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10624   ins_encode %{
10625     int vlen_enc = vector_length_encoding(this);
10626     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10627     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10628                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10629                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10630   %}
10631   ins_pipe(pipe_slow);
10632 %}
10633 
10634 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10635 %{
10636   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10637             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10638             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10639   match(Set dst (SaturatingAddV src1 src2));
10640   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10641   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10642   ins_encode %{
10643     int vlen_enc = vector_length_encoding(this);
10644     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10645     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10646                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10647   %}
10648   ins_pipe(pipe_slow);
10649 %}
10650 
10651 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10652 %{
10653   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10654             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10655             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10656   match(Set dst (SaturatingAddV src1 src2));
10657   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10658   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10659   ins_encode %{
10660     int vlen_enc = vector_length_encoding(this);
10661     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10662     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10663                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10664   %}
10665   ins_pipe(pipe_slow);
10666 %}
10667 
10668 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10669 %{
10670   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10671             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10672             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10673   match(Set dst (SaturatingSubV src1 src2));
10674   effect(TEMP ktmp);
10675   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10676   ins_encode %{
10677     int vlen_enc = vector_length_encoding(this);
10678     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10679     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10680                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10681   %}
10682   ins_pipe(pipe_slow);
10683 %}
10684 
10685 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10686 %{
10687   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10688             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10689             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10690   match(Set dst (SaturatingSubV src1 src2));
10691   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10692   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10693   ins_encode %{
10694     int vlen_enc = vector_length_encoding(this);
10695     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10696     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10697                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10698   %}
10699   ins_pipe(pipe_slow);
10700 %}
10701 
10702 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10703 %{
10704   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10705             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10706   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10707   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10708   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10709   ins_encode %{
10710     int vlen_enc = vector_length_encoding(this);
10711     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10712     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10713                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10714   %}
10715   ins_pipe(pipe_slow);
10716 %}
10717 
10718 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10719 %{
10720   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10721             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10722   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10723   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10724   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10725   ins_encode %{
10726     int vlen_enc = vector_length_encoding(this);
10727     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10728     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10729                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10730   %}
10731   ins_pipe(pipe_slow);
10732 %}
10733 
10734 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10735   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10736             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10737   match(Set dst (SaturatingAddV (Binary dst src) mask));
10738   match(Set dst (SaturatingSubV (Binary dst src) mask));
10739   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10740   ins_encode %{
10741     int vlen_enc = vector_length_encoding(this);
10742     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10743     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10744                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10745   %}
10746   ins_pipe( pipe_slow );
10747 %}
10748 
10749 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10750   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10751             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10752   match(Set dst (SaturatingAddV (Binary dst src) mask));
10753   match(Set dst (SaturatingSubV (Binary dst src) mask));
10754   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10755   ins_encode %{
10756     int vlen_enc = vector_length_encoding(this);
10757     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10758     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10759                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10760   %}
10761   ins_pipe( pipe_slow );
10762 %}
10763 
10764 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10765   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10766             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10767   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10768   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10769   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10770   ins_encode %{
10771     int vlen_enc = vector_length_encoding(this);
10772     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10773     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10774                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10775   %}
10776   ins_pipe( pipe_slow );
10777 %}
10778 
10779 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10780   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10781             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10782   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10783   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10784   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10785   ins_encode %{
10786     int vlen_enc = vector_length_encoding(this);
10787     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10788     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10789                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10790   %}
10791   ins_pipe( pipe_slow );
10792 %}
10793 
10794 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10795 %{
10796   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10797   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10798   ins_encode %{
10799     int vlen_enc = vector_length_encoding(this);
10800     BasicType bt = Matcher::vector_element_basic_type(this);
10801     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10802   %}
10803   ins_pipe(pipe_slow);
10804 %}
10805 
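// Scalar half-precision (FP16) operations: bit reinterpretation between a
// general-purpose register and an XMM register (vmovw), fused float<->half
// conversions, sqrt, add/sub/mul/div, min/max and fma.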
10806 instruct reinterpretS2HF(regF dst, rRegI src)
10807 %{
10808   match(Set dst (ReinterpretS2HF src));
10809   format %{ "vmovw $dst, $src" %}
10810   ins_encode %{
10811     __ vmovw($dst$$XMMRegister, $src$$Register);
10812   %}
10813   ins_pipe(pipe_slow);
10814 %}
10815 
10816 instruct reinterpretHF2S(rRegI dst, regF src)
10817 %{
10818   match(Set dst (ReinterpretHF2S src));
10819   format %{ "vmovw $dst, $src" %}
10820   ins_encode %{
10821     __ vmovw($dst$$Register, $src$$XMMRegister);
10822   %}
10823   ins_pipe(pipe_slow);
10824 %}
10825 
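// Fused conversion patterns: when a ConvF2HF feeds a ReinterpretS2HF (or a
// ReinterpretHF2S feeds a ConvHF2F), the value stays in an XMM register and
// is converted directly with vcvtps2ph/vcvtph2ps, avoiding a round trip
// through a general-purpose register.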
10826 instruct convF2HFAndS2HF(regF dst, regF src)
10827 %{
10828   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10829   format %{ "convF2HFAndS2HF $dst, $src" %}
10830   ins_encode %{
10831     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10832   %}
10833   ins_pipe(pipe_slow);
10834 %}
10835 
10836 instruct convHF2SAndHF2F(regF dst, regF src)
10837 %{
10838   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10839   format %{ "convHF2SAndHF2F $dst, $src" %}
10840   ins_encode %{
10841     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10842   %}
10843   ins_pipe(pipe_slow);
10844 %}
10845 
10846 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10847 %{
10848   match(Set dst (SqrtHF src));
10849   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10850   ins_encode %{
10851     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10852   %}
10853   ins_pipe(pipe_slow);
10854 %}
10855 
10856 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10857 %{
10858   match(Set dst (AddHF src1 src2));
10859   match(Set dst (DivHF src1 src2));
10860   match(Set dst (MulHF src1 src2));
10861   match(Set dst (SubHF src1 src2));
10862   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10863   ins_encode %{
10864     int opcode = this->ideal_Opcode();
10865     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10866   %}
10867   ins_pipe(pipe_slow);
10868 %}
10869 
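// Half-precision min/max. On AVX10.2 targets a single eminmaxsh with an
// immediate selecting MIN or MAX is enough; the pre-AVX10.2 fallback below
// needs a mask register and two XMM temporaries to handle the NaN and
// signed-zero cases required by Java min/max semantics.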
10870 instruct scalar_minmax_HF_avx10_reg(regF dst, regF src1, regF src2)
10871 %{
10872   predicate(VM_Version::supports_avx10_2());
10873   match(Set dst (MaxHF src1 src2));
10874   match(Set dst (MinHF src1 src2));
10875   format %{ "scalar_min_max_fp16 $dst, $src1, $src2" %}
10876   ins_encode %{
10877     int function = this->ideal_Opcode() == Op_MinHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10878     __ eminmaxsh($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, function);
10879   %}
10880   ins_pipe( pipe_slow );
10881 %}
10882 
10883 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10884 %{
10885   predicate(!VM_Version::supports_avx10_2());
10886   match(Set dst (MaxHF src1 src2));
10887   match(Set dst (MinHF src1 src2));
10888   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10889   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10890   ins_encode %{
10891     int opcode = this->ideal_Opcode();
10892     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10893                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10894   %}
10895   ins_pipe( pipe_slow );
10896 %}
10897 
10898 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10899 %{
10900   match(Set dst (FmaHF  src2 (Binary dst src1)));
10901   effect(DEF dst);
10902   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2" %}
10903   ins_encode %{
10904     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10905   %}
10906   ins_pipe( pipe_slow );
10907 %}
10908 
10909 
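// Packed half-precision (FP16) operations. These mirror the scalar forms
// above; the memory variants fold a vector load (wrapped in a
// VectorReinterpret) directly into the instruction's memory operand.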
10910 instruct vector_sqrt_HF_reg(vec dst, vec src)
10911 %{
10912   match(Set dst (SqrtVHF src));
10913   format %{ "vector_sqrt_fp16 $dst, $src" %}
10914   ins_encode %{
10915     int vlen_enc = vector_length_encoding(this);
10916     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10917   %}
10918   ins_pipe(pipe_slow);
10919 %}
10920 
10921 instruct vector_sqrt_HF_mem(vec dst, memory src)
10922 %{
10923   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10924   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10925   ins_encode %{
10926     int vlen_enc = vector_length_encoding(this);
10927     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10928   %}
10929   ins_pipe(pipe_slow);
10930 %}
10931 
10932 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10933 %{
10934   match(Set dst (AddVHF src1 src2));
10935   match(Set dst (DivVHF src1 src2));
10936   match(Set dst (MulVHF src1 src2));
10937   match(Set dst (SubVHF src1 src2));
10938   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10939   ins_encode %{
10940     int vlen_enc = vector_length_encoding(this);
10941     int opcode = this->ideal_Opcode();
10942     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10943   %}
10944   ins_pipe(pipe_slow);
10945 %}
10946 
10947 
10948 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10949 %{
10950   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10951   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10952   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10953   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10954   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10955   ins_encode %{
10956     int vlen_enc = vector_length_encoding(this);
10957     int opcode = this->ideal_Opcode();
10958     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10959   %}
10960   ins_pipe(pipe_slow);
10961 %}
10962 
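// Packed FP16 fused multiply-add: dst = dst * src1 + src2. The 132 form of
// vfmadd multiplies the destination by the last operand and adds the middle
// one, which is why src2 and src1 appear in that order below.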
10963 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10964 %{
10965   match(Set dst (FmaVHF src2 (Binary dst src1)));
10966   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10967   ins_encode %{
10968     int vlen_enc = vector_length_encoding(this);
10969     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10970   %}
10971   ins_pipe( pipe_slow );
10972 %}
10973 
10974 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10975 %{
10976   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10977   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10978   ins_encode %{
10979     int vlen_enc = vector_length_encoding(this);
10980     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10981   %}
10982   ins_pipe( pipe_slow );
10983 %}
10984 
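// Packed half-precision min/max. As in the scalar case, AVX10.2 provides a
// direct instruction (evminmaxph, issued unmasked via k0); older targets
// fall back to vector_max_min_fp16 with a mask register and two temporaries.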
10985 instruct vector_minmax_HF_avx10_mem(vec dst, vec src1, memory src2)
10986 %{
10987   predicate(VM_Version::supports_avx10_2());
10988   match(Set dst (MinVHF src1 (VectorReinterpret (LoadVector src2))));
10989   match(Set dst (MaxVHF src1 (VectorReinterpret (LoadVector src2))));
10990   format %{ "vector_min_max_fp16_mem $dst, $src1, $src2" %}
10991   ins_encode %{
10992     int vlen_enc = vector_length_encoding(this);
10993     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10994     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$Address, true, function, vlen_enc);
10995   %}
10996   ins_pipe( pipe_slow );
10997 %}
10998 
10999 instruct vector_minmax_HF_avx10_reg(vec dst, vec src1, vec src2)
11000 %{
11001   predicate(VM_Version::supports_avx10_2());
11002   match(Set dst (MinVHF src1 src2));
11003   match(Set dst (MaxVHF src1 src2));
11004   format %{ "vector_min_max_fp16 $dst, $src1, $src2" %}
11005   ins_encode %{
11006     int vlen_enc = vector_length_encoding(this);
11007     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
11008     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, true, function, vlen_enc);
11009   %}
11010   ins_pipe( pipe_slow );
11011 %}
11012 
11013 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
11014 %{
11015   predicate(!VM_Version::supports_avx10_2());
11016   match(Set dst (MinVHF src1 src2));
11017   match(Set dst (MaxVHF src1 src2));
11018   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
11019   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
11020   ins_encode %{
11021     int vlen_enc = vector_length_encoding(this);
11022     int opcode = this->ideal_Opcode();
11023     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
11024                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
11025   %}
11026   ins_pipe( pipe_slow );
11027 %}