//
// Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
   35 // "reg_def"  name ( register save type, C convention save type,
   36 //                   ideal register type, encoding );
   37 // Register Save Types:
   38 //
   39 // NS  = No-Save:       The register allocator assumes that these registers
   40 //                      can be used without saving upon entry to the method, &
   41 //                      that they do not need to be saved at call sites.
   42 //
   43 // SOC = Save-On-Call:  The register allocator assumes that these registers
   44 //                      can be used without saving upon entry to the method,
   45 //                      but that they must be saved at call sites.
   46 //
   47 // SOE = Save-On-Entry: The register allocator assumes that these registers
   48 //                      must be saved before using them upon entry to the
   49 //                      method, but they do not need to be saved at call
   50 //                      sites.
   51 //
   52 // AS  = Always-Save:   The register allocator assumes that these registers
   53 //                      must be saved before using them upon entry to the
   54 //                      method, & that they must be saved at call sites.
   55 //
   56 // Ideal Register Type is used to determine how to save & restore a
   57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
   58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
   59 //
   60 // The encoding number is the actual bit-pattern placed into the opcodes.
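
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as save-on-call under both conventions, spilled as a float
// (Op_RegF), with encoding 0 and backed by the concrete register xmm0.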

// XMM registers.  512-bit registers, i.e. 16 32-bit words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls.
//              XMM0-XMM7 might hold parameters.
// Windows ABI: XMM6-XMM15 are preserved across function calls.
//              XMM0-XMM3 might hold parameters.

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
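// Note that K0 is not defined here: in EVEX encodings a mask-field value of
// zero means "no masking", so k0 cannot serve as an allocatable writemask.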
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);
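
// The single-register classes above are intended to let an instruct pattern
// bind one specific mask register when a particular k register is required.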

// Flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

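// reg_class_dynamic chooses between two statically defined register classes:
// the first (EVEX) class is selected when the trailing predicate evaluates to
// true at VM startup, otherwise the second (legacy) class is used.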
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 128-bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for EVEX 128-bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(C2_MacroAssembler *masm);
 1191   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for an unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
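// For example, vector_length_encoding(16) returns Assembler::AVX_128bit and
// vector_length_encoding(64) returns Assembler::AVX_512bit; 4- and 8-byte
// (sub-XMM) lengths also map to AVX_128bit.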
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
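// Platform-dependent (x86) node flags, allocated above the shared Node flags
// (starting at Node::_last_flag << 1). Flag_intel_jcc_erratum marks nodes that the
// Intel JCC erratum padding code below must pad; the remaining flags record which
// EFLAGS condition bits (carry, parity, zero, overflow, sign) a node sets or clears.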
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   address base = __ start_a_stub(size_exception_handler());
 1314   if (base == nullptr) {
 1315     ciEnv::current()->record_failure("CodeCache is full");
 1316     return 0;  // CodeBuffer::expand failed
 1317   }
 1318   int offset = __ offset();
 1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1321   __ end_a_stub();
 1322   return offset;
 1323 }
 1324 
 1325 // Emit deopt handler code.
 1326 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1327 
 1328   // Note that the code buffer's insts_mark is always relative to insts.
 1329   // That's why we must use the macroassembler to generate a handler.
 1330   address base = __ start_a_stub(size_deopt_handler());
 1331   if (base == nullptr) {
 1332     ciEnv::current()->record_failure("CodeCache is full");
 1333     return 0;  // CodeBuffer::expand failed
 1334   }
 1335   int offset = __ offset();
 1336 
 1337 #ifdef _LP64
 1338   address the_pc = (address) __ pc();
 1339   Label next;
  // Push "the_pc" on the stack without destroying any registers,
  // as they all may be live.
 1342 
 1343   // push address of "next"
 1344   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1345   __ bind(next);
 1346   // adjust it so it matches "the_pc"
 1347   __ subptr(Address(rsp, 0), __ offset() - offset);
 1348 #else
 1349   InternalAddress here(__ pc());
 1350   __ pushptr(here.addr(), noreg);
 1351 #endif
 1352 
 1353   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1354   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1355   __ end_a_stub();
 1356   return offset;
 1357 }
 1358 
 1359 static Assembler::Width widthForType(BasicType bt) {
 1360   if (bt == T_BYTE) {
 1361     return Assembler::B;
 1362   } else if (bt == T_SHORT) {
 1363     return Assembler::W;
 1364   } else if (bt == T_INT) {
 1365     return Assembler::D;
 1366   } else {
 1367     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1368     return Assembler::Q;
 1369   }
 1370 }
 1371 
 1372 //=============================================================================
 1373 
 1374   // Float masks come from different places depending on platform.
 1375 #ifdef _LP64
 1376   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1377   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1378   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1379   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1380 #else
 1381   static address float_signmask()  { return (address)float_signmask_pool; }
 1382   static address float_signflip()  { return (address)float_signflip_pool; }
 1383   static address double_signmask() { return (address)double_signmask_pool; }
 1384   static address double_signflip() { return (address)double_signflip_pool; }
 1385 #endif
 1386   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1387   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1388   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1389   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1390   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1391   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1392   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1393   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1394   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1395   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1396   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1397   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1398   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1399   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1400   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1401 
 1402 //=============================================================================
 1403 bool Matcher::match_rule_supported(int opcode) {
 1404   if (!has_match_rule(opcode)) {
 1405     return false; // no match rule present
 1406   }
 1407   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1408   switch (opcode) {
 1409     case Op_AbsVL:
 1410     case Op_StoreVectorScatter:
 1411       if (UseAVX < 3) {
 1412         return false;
 1413       }
 1414       break;
 1415     case Op_PopCountI:
 1416     case Op_PopCountL:
 1417       if (!UsePopCountInstruction) {
 1418         return false;
 1419       }
 1420       break;
 1421     case Op_PopCountVI:
 1422       if (UseAVX < 2) {
 1423         return false;
 1424       }
 1425       break;
 1426     case Op_CompressV:
 1427     case Op_ExpandV:
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
 1444       if (VM_Version::supports_avx512dq() == false) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
 1525       if (VM_Version::supports_on_spin_wait() == false) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
 1567       if (VM_Version::supports_avx() == false) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572     case Op_LoadVectorGatherMasked:
 1573       if (UseAVX < 2) {
 1574         return false;
 1575       }
 1576       break;
 1577     case Op_FmaF:
 1578     case Op_FmaD:
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
 1593       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
 1602          return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
 1613       if (UseAVX < 3 || !is_LP64)  {
 1614         return false;
 1615       }
 1616       if (!VM_Version::supports_avx512vl()) {
 1617         return false;
 1618       }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_SqrtF:
 1664       if (UseSSE < 1) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtD:
 1669 #ifdef _LP64
 1670       if (UseSSE < 2) {
 1671         return false;
 1672       }
 1673 #else
 1674       // x86_32.ad has a special match rule for SqrtD.
 1675       // Together with common x86 rules, this handles all UseSSE cases.
 1676 #endif
 1677       break;
 1678     case Op_ConvF2HF:
 1679     case Op_ConvHF2F:
 1680       if (!VM_Version::supports_float16()) {
 1681         return false;
 1682       }
 1683       break;
 1684     case Op_VectorCastF2HF:
 1685     case Op_VectorCastHF2F:
 1686       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1687         return false;
 1688       }
 1689       break;
 1690   }
 1691   return true;  // Match rules are supported by default.
 1692 }
 1693 
 1694 //------------------------------------------------------------------------
 1695 
 1696 static inline bool is_pop_count_instr_target(BasicType bt) {
 1697   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1698          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1699 }
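// Note: same condition as is_vector_popcount_predicate() in the source_hpp block above.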
 1700 
 1701 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1702   return match_rule_supported_vector(opcode, vlen, bt);
 1703 }
 1704 
// Identify extra cases where we might want to provide match rules for vector nodes
// and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1707 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1708   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1709   if (!match_rule_supported(opcode)) {
 1710     return false;
 1711   }
 1712   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1713   //   * SSE2 supports 128bit vectors for all types;
 1714   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1715   //   * AVX2 supports 256bit vectors for all types;
 1716   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1717   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1718   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1719   // And MaxVectorSize is taken into account as well.
 1720   if (!vector_size_supported(bt, vlen)) {
 1721     return false;
 1722   }
 1723   // Special cases which require vector length follow:
 1724   //   * implementation limitations
 1725   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1726   //   * 128bit vroundpd instruction is present only in AVX1
 1727   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
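  // For example, vlen == 8 and bt == T_INT gives size_in_bits = 8 * 4 * 8 = 256.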
 1728   switch (opcode) {
 1729     case Op_AbsVF:
 1730     case Op_NegVF:
 1731       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1732         return false; // 512bit vandps and vxorps are not available
 1733       }
 1734       break;
 1735     case Op_AbsVD:
 1736     case Op_NegVD:
 1737       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1738         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1739       }
 1740       break;
 1741     case Op_RotateRightV:
 1742     case Op_RotateLeftV:
 1743       if (bt != T_INT && bt != T_LONG) {
 1744         return false;
 1745       } // fallthrough
 1746     case Op_MacroLogicV:
 1747       if (!VM_Version::supports_evex() ||
 1748           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1749         return false;
 1750       }
 1751       break;
 1752     case Op_ClearArray:
 1753     case Op_VectorMaskGen:
 1754     case Op_VectorCmpMasked:
 1755       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1756         return false;
 1757       }
 1758       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1759         return false;
 1760       }
 1761       break;
 1762     case Op_LoadVectorMasked:
 1763     case Op_StoreVectorMasked:
 1764       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1765         return false;
 1766       }
 1767       break;
 1768     case Op_MaxV:
 1769     case Op_MinV:
 1770       if (UseSSE < 4 && is_integral_type(bt)) {
 1771         return false;
 1772       }
 1773       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1774           // Float/Double intrinsics are enabled for AVX family currently.
 1775           if (UseAVX == 0) {
 1776             return false;
 1777           }
 1778           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1779             return false;
 1780           }
 1781       }
 1782       break;
 1783     case Op_CallLeafVector:
 1784       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1785         return false;
 1786       }
 1787       break;
 1788     case Op_AddReductionVI:
 1789       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1790         return false;
 1791       }
 1792       // fallthrough
 1793     case Op_AndReductionV:
 1794     case Op_OrReductionV:
 1795     case Op_XorReductionV:
 1796       if (is_subword_type(bt) && (UseSSE < 4)) {
 1797         return false;
 1798       }
 1799 #ifndef _LP64
 1800       if (bt == T_BYTE || bt == T_LONG) {
 1801         return false;
 1802       }
 1803 #endif
 1804       break;
 1805 #ifndef _LP64
 1806     case Op_VectorInsert:
 1807       if (bt == T_LONG || bt == T_DOUBLE) {
 1808         return false;
 1809       }
 1810       break;
 1811 #endif
 1812     case Op_MinReductionV:
 1813     case Op_MaxReductionV:
 1814       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1815         return false;
 1816       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1817         return false;
 1818       }
 1819       // Float/Double intrinsics enabled for AVX family.
 1820       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1821         return false;
 1822       }
 1823       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1824         return false;
 1825       }
 1826 #ifndef _LP64
 1827       if (bt == T_BYTE || bt == T_LONG) {
 1828         return false;
 1829       }
 1830 #endif
 1831       break;
 1832     case Op_VectorTest:
 1833       if (UseSSE < 4) {
 1834         return false; // Implementation limitation
 1835       } else if (size_in_bits < 32) {
 1836         return false; // Implementation limitation
 1837       }
 1838       break;
 1839     case Op_VectorLoadShuffle:
 1840     case Op_VectorRearrange:
 1841       if(vlen == 2) {
 1842         return false; // Implementation limitation due to how shuffle is loaded
 1843       } else if (size_in_bits == 256 && UseAVX < 2) {
 1844         return false; // Implementation limitation
 1845       }
 1846       break;
 1847     case Op_VectorLoadMask:
 1848     case Op_VectorMaskCast:
 1849       if (size_in_bits == 256 && UseAVX < 2) {
 1850         return false; // Implementation limitation
 1851       }
 1852       // fallthrough
 1853     case Op_VectorStoreMask:
 1854       if (vlen == 2) {
 1855         return false; // Implementation limitation
 1856       }
 1857       break;
 1858     case Op_PopulateIndex:
 1859       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1860         return false;
 1861       }
 1862       break;
 1863     case Op_VectorCastB2X:
 1864     case Op_VectorCastS2X:
 1865     case Op_VectorCastI2X:
 1866       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1867         return false;
 1868       }
 1869       break;
 1870     case Op_VectorCastL2X:
 1871       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1872         return false;
 1873       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1874         return false;
 1875       }
 1876       break;
 1877     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1881         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1882         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1883           return false;
 1884         }
 1885       }
 1886       // fallthrough
 1887     case Op_VectorCastD2X:
 1888       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1889         return false;
 1890       }
 1891       break;
 1892     case Op_VectorCastF2HF:
 1893     case Op_VectorCastHF2F:
 1894       if (!VM_Version::supports_f16c() &&
 1895          ((!VM_Version::supports_evex() ||
 1896          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1897         return false;
 1898       }
 1899       break;
 1900     case Op_RoundVD:
 1901       if (!VM_Version::supports_avx512dq()) {
 1902         return false;
 1903       }
 1904       break;
 1905     case Op_MulReductionVI:
 1906       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1907         return false;
 1908       }
 1909       break;
 1910     case Op_LoadVectorGatherMasked:
 1911       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1912         return false;
 1913       }
 1914       if (is_subword_type(bt) &&
 1915          (!is_LP64                                                ||
 1916          (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1917          (size_in_bits < 64)                                      ||
 1918          (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1919         return false;
 1920       }
 1921       break;
 1922     case Op_StoreVectorScatterMasked:
 1923     case Op_StoreVectorScatter:
 1924       if (is_subword_type(bt)) {
 1925         return false;
 1926       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1927         return false;
 1928       }
 1929       // fallthrough
 1930     case Op_LoadVectorGather:
 1931       if (!is_subword_type(bt) && size_in_bits == 64) {
 1932         return false;
 1933       }
 1934       if (is_subword_type(bt) && size_in_bits < 64) {
 1935         return false;
 1936       }
 1937       break;
 1938     case Op_SelectFromTwoVector:
 1939        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1940          return false;
 1941        }
 1942        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1943          return false;
 1944        }
 1945        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1946          return false;
 1947        }
 1948        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1949          return false;
 1950        }
 1951        break;
 1952     case Op_MaskAll:
 1953       if (!VM_Version::supports_evex()) {
 1954         return false;
 1955       }
 1956       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1957         return false;
 1958       }
 1959       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1960         return false;
 1961       }
 1962       break;
 1963     case Op_VectorMaskCmp:
 1964       if (vlen < 2 || size_in_bits < 32) {
 1965         return false;
 1966       }
 1967       break;
 1968     case Op_CompressM:
 1969       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1970         return false;
 1971       }
 1972       break;
 1973     case Op_CompressV:
 1974     case Op_ExpandV:
 1975       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1976         return false;
 1977       }
 1978       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 1979         return false;
 1980       }
 1981       if (size_in_bits < 128 ) {
 1982         return false;
 1983       }
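      // fallthrough into the Op_VectorLongToMask checks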
 1984     case Op_VectorLongToMask:
 1985       if (UseAVX < 1 || !is_LP64) {
 1986         return false;
 1987       }
 1988       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1989         return false;
 1990       }
 1991       break;
 1992     case Op_SignumVD:
 1993     case Op_SignumVF:
 1994       if (UseAVX < 1) {
 1995         return false;
 1996       }
 1997       break;
 1998     case Op_PopCountVI:
 1999     case Op_PopCountVL: {
 2000         if (!is_pop_count_instr_target(bt) &&
 2001             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 2002           return false;
 2003         }
 2004       }
 2005       break;
 2006     case Op_ReverseV:
 2007     case Op_ReverseBytesV:
 2008       if (UseAVX < 2) {
 2009         return false;
 2010       }
 2011       break;
 2012     case Op_CountTrailingZerosV:
 2013     case Op_CountLeadingZerosV:
 2014       if (UseAVX < 2) {
 2015         return false;
 2016       }
 2017       break;
 2018   }
  return true;  // Match rules are supported by default.
 2020 }
 2021 
 2022 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
  // node of their non-masked counterpart, with the mask edge being the differentiator.
  // This routine does a strict check for the existence of masked operation patterns
  // by returning false for all opcodes other than the ones whose masked instruction
  // patterns are defined in this file.
 2029   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2030     return false;
 2031   }
 2032 
 2033   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2034   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2035   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2036     return false;
 2037   }
 2038   switch(opcode) {
 2039     // Unary masked operations
 2040     case Op_AbsVB:
 2041     case Op_AbsVS:
 2042       if(!VM_Version::supports_avx512bw()) {
 2043         return false;  // Implementation limitation
 2044       }
 2045     case Op_AbsVI:
 2046     case Op_AbsVL:
 2047       return true;
 2048 
 2049     // Ternary masked operations
 2050     case Op_FmaVF:
 2051     case Op_FmaVD:
 2052       return true;
 2053 
 2054     case Op_MacroLogicV:
 2055       if(bt != T_INT && bt != T_LONG) {
 2056         return false;
 2057       }
 2058       return true;
 2059 
 2060     // Binary masked operations
 2061     case Op_AddVB:
 2062     case Op_AddVS:
 2063     case Op_SubVB:
 2064     case Op_SubVS:
 2065     case Op_MulVS:
 2066     case Op_LShiftVS:
 2067     case Op_RShiftVS:
 2068     case Op_URShiftVS:
 2069       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2070       if (!VM_Version::supports_avx512bw()) {
 2071         return false;  // Implementation limitation
 2072       }
 2073       return true;
 2074 
 2075     case Op_MulVL:
 2076       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2077       if (!VM_Version::supports_avx512dq()) {
 2078         return false;  // Implementation limitation
 2079       }
 2080       return true;
 2081 
 2082     case Op_AndV:
 2083     case Op_OrV:
 2084     case Op_XorV:
 2085     case Op_RotateRightV:
 2086     case Op_RotateLeftV:
 2087       if (bt != T_INT && bt != T_LONG) {
 2088         return false; // Implementation limitation
 2089       }
 2090       return true;
 2091 
 2092     case Op_VectorLoadMask:
 2093       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2094       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2095         return false;
 2096       }
 2097       return true;
 2098 
 2099     case Op_AddVI:
 2100     case Op_AddVL:
 2101     case Op_AddVF:
 2102     case Op_AddVD:
 2103     case Op_SubVI:
 2104     case Op_SubVL:
 2105     case Op_SubVF:
 2106     case Op_SubVD:
 2107     case Op_MulVI:
 2108     case Op_MulVF:
 2109     case Op_MulVD:
 2110     case Op_DivVF:
 2111     case Op_DivVD:
 2112     case Op_SqrtVF:
 2113     case Op_SqrtVD:
 2114     case Op_LShiftVI:
 2115     case Op_LShiftVL:
 2116     case Op_RShiftVI:
 2117     case Op_RShiftVL:
 2118     case Op_URShiftVI:
 2119     case Op_URShiftVL:
 2120     case Op_LoadVectorMasked:
 2121     case Op_StoreVectorMasked:
 2122     case Op_LoadVectorGatherMasked:
 2123     case Op_StoreVectorScatterMasked:
 2124       return true;
 2125 
 2126     case Op_MaxV:
 2127     case Op_MinV:
 2128       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2129         return false; // Implementation limitation
 2130       }
 2131       if (is_floating_point_type(bt)) {
 2132         return false; // Implementation limitation
 2133       }
 2134       return true;
 2135 
 2136     case Op_VectorMaskCmp:
 2137       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2138         return false; // Implementation limitation
 2139       }
 2140       return true;
 2141 
 2142     case Op_VectorRearrange:
 2143       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2144         return false; // Implementation limitation
 2145       }
 2146       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2147         return false; // Implementation limitation
 2148       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2149         return false; // Implementation limitation
 2150       }
 2151       return true;
 2152 
 2153     // Binary Logical operations
 2154     case Op_AndVMask:
 2155     case Op_OrVMask:
 2156     case Op_XorVMask:
 2157       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2158         return false; // Implementation limitation
 2159       }
 2160       return true;
 2161 
 2162     case Op_PopCountVI:
 2163     case Op_PopCountVL:
 2164       if (!is_pop_count_instr_target(bt)) {
 2165         return false;
 2166       }
 2167       return true;
 2168 
 2169     case Op_MaskAll:
 2170       return true;
 2171 
 2172     case Op_CountLeadingZerosV:
 2173       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2174         return true;
 2175       }
 2176     default:
 2177       return false;
 2178   }
 2179 }
 2180 
 2181 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2182   return false;
 2183 }
 2184 
 2185 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2186   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2187   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2188   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2189       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2190     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2191     return new legVecZOper();
 2192   }
 2193   if (legacy) {
 2194     switch (ideal_reg) {
 2195       case Op_VecS: return new legVecSOper();
 2196       case Op_VecD: return new legVecDOper();
 2197       case Op_VecX: return new legVecXOper();
 2198       case Op_VecY: return new legVecYOper();
 2199       case Op_VecZ: return new legVecZOper();
 2200     }
 2201   } else {
 2202     switch (ideal_reg) {
 2203       case Op_VecS: return new vecSOper();
 2204       case Op_VecD: return new vecDOper();
 2205       case Op_VecX: return new vecXOper();
 2206       case Op_VecY: return new vecYOper();
 2207       case Op_VecZ: return new vecZOper();
 2208     }
 2209   }
 2210   ShouldNotReachHere();
 2211   return nullptr;
 2212 }
 2213 
 2214 bool Matcher::is_reg2reg_move(MachNode* m) {
 2215   switch (m->rule()) {
 2216     case MoveVec2Leg_rule:
 2217     case MoveLeg2Vec_rule:
 2218     case MoveF2VL_rule:
 2219     case MoveF2LEG_rule:
 2220     case MoveVL2F_rule:
 2221     case MoveLEG2F_rule:
 2222     case MoveD2VL_rule:
 2223     case MoveD2LEG_rule:
 2224     case MoveVL2D_rule:
 2225     case MoveLEG2D_rule:
 2226       return true;
 2227     default:
 2228       return false;
 2229   }
 2230 }
 2231 
 2232 bool Matcher::is_generic_vector(MachOper* opnd) {
 2233   switch (opnd->opcode()) {
 2234     case VEC:
 2235     case LEGVEC:
 2236       return true;
 2237     default:
 2238       return false;
 2239   }
 2240 }
 2241 
 2242 //------------------------------------------------------------------------
 2243 
 2244 const RegMask* Matcher::predicate_reg_mask(void) {
 2245   return &_VECTMASK_REG_mask;
 2246 }
 2247 
 2248 // Max vector size in bytes. 0 if not supported.
 2249 int Matcher::vector_width_in_bytes(BasicType bt) {
 2250   assert(is_java_primitive(bt), "only primitive type vectors");
 2251   if (UseSSE < 2) return 0;
 2252   // SSE2 supports 128bit vectors for all types.
 2253   // AVX2 supports 256bit vectors for all types.
 2254   // EVEX (AVX-512) supports 512bit vectors for all types.
 2255   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
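        // For example, UseAVX == 2 yields (1 << 2) * 8 = 32 bytes and
        // UseAVX == 3 yields 64 bytes; the per-type checks below then adjust
        // this and cap it at MaxVectorSize.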
 2256   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2257   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2258     size = (UseAVX > 2) ? 64 : 32;
 2259   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2260     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2261   // Use flag to limit vector size.
 2262   size = MIN2(size,(int)MaxVectorSize);
 2263   // Minimum 2 values in vector (or 4 for bytes).
 2264   switch (bt) {
 2265   case T_DOUBLE:
 2266   case T_LONG:
 2267     if (size < 16) return 0;
 2268     break;
 2269   case T_FLOAT:
 2270   case T_INT:
 2271     if (size < 8) return 0;
 2272     break;
 2273   case T_BOOLEAN:
 2274     if (size < 4) return 0;
 2275     break;
 2276   case T_CHAR:
 2277     if (size < 4) return 0;
 2278     break;
 2279   case T_BYTE:
 2280     if (size < 4) return 0;
 2281     break;
 2282   case T_SHORT:
 2283     if (size < 4) return 0;
 2284     break;
 2285   default:
 2286     ShouldNotReachHere();
 2287   }
 2288   return size;
 2289 }
 2290 
 2291 // Limits on vector size (number of elements) loaded into vector.
 2292 int Matcher::max_vector_size(const BasicType bt) {
 2293   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2294 }
 2295 int Matcher::min_vector_size(const BasicType bt) {
 2296   int max_size = max_vector_size(bt);
 2297   // Min size which can be loaded into vector is 4 bytes.
 2298   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
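        // E.g. T_BYTE -> 4 elements (4 bytes), T_SHORT -> 2 elements (4 bytes),
        // T_INT -> 2 elements (8 bytes).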
 2299   // Allow 1-element double vectors so that SVML double64 stubs can be called.
 2300   if (bt == T_DOUBLE) {
 2301     size = 1;
 2302   }
 2303   return MIN2(size,max_size);
 2304 }
 2305 
 2306 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2307   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2308   // by default on Cascade Lake
 2309   if (VM_Version::is_default_intel_cascade_lake()) {
 2310     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2311   }
 2312   return Matcher::max_vector_size(bt);
 2313 }
 2314 
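      // x86 has no scalable (length-agnostic) vector registers, so this query
      // does not apply; -1 signals that.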
 2315 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2316   return -1;
 2317 }
 2318 
 2319 // Vector ideal reg corresponding to specified size in bytes
 2320 uint Matcher::vector_ideal_reg(int size) {
 2321   assert(MaxVectorSize >= size, "");
 2322   switch(size) {
 2323     case  4: return Op_VecS;
 2324     case  8: return Op_VecD;
 2325     case 16: return Op_VecX;
 2326     case 32: return Op_VecY;
 2327     case 64: return Op_VecZ;
 2328   }
 2329   ShouldNotReachHere();
 2330   return 0;
 2331 }
 2332 
 2333 // Check for shift by small constant as well
 2334 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2335   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2336       shift->in(2)->get_int() <= 3 &&
 2337       // Are there other uses besides address expressions?
 2338       !matcher->is_visited(shift)) {
 2339     address_visited.set(shift->_idx); // Flag as address_visited
 2340     mstack.push(shift->in(2), Matcher::Visit);
 2341     Node *conv = shift->in(1);
 2342 #ifdef _LP64
 2343     // Allow the Matcher to match the rule which bypasses the
 2344     // ConvI2L operation for an array index on LP64
 2345     // if the index value is positive.
 2346     if (conv->Opcode() == Op_ConvI2L &&
 2347         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2348         // Are there other uses besides address expressions?
 2349         !matcher->is_visited(conv)) {
 2350       address_visited.set(conv->_idx); // Flag as address_visited
 2351       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2352     } else
 2353 #endif
 2354       mstack.push(conv, Matcher::Pre_Visit);
 2355     return true;
 2356   }
 2357   return false;
 2358 }
 2359 
 2360 // This function identifies sub-graphs in which a 'load' node is
 2361 // an input to two different nodes, such that the sub-graph can be
 2362 // matched to BMI instructions like blsi, blsr, etc.
 2363 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2364 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2365 // refers to the same node.
 2366 //
 2367 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2368 // This is a temporary solution until we make DAGs expressible in ADL.
 2369 template<typename ConType>
 2370 class FusedPatternMatcher {
 2371   Node* _op1_node;
 2372   Node* _mop_node;
 2373   int _con_op;
 2374 
 2375   static int match_next(Node* n, int next_op, int next_op_idx) {
 2376     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2377       return -1;
 2378     }
 2379 
 2380     if (next_op_idx == -1) { // n is commutative, try rotations
 2381       if (n->in(1)->Opcode() == next_op) {
 2382         return 1;
 2383       } else if (n->in(2)->Opcode() == next_op) {
 2384         return 2;
 2385       }
 2386     } else {
 2387       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2388       if (n->in(next_op_idx)->Opcode() == next_op) {
 2389         return next_op_idx;
 2390       }
 2391     }
 2392     return -1;
 2393   }
 2394 
 2395  public:
 2396   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2397     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2398 
 2399   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2400              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2401              typename ConType::NativeType con_value) {
 2402     if (_op1_node->Opcode() != op1) {
 2403       return false;
 2404     }
 2405     if (_mop_node->outcnt() > 2) {
 2406       return false;
 2407     }
 2408     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2409     if (op1_op2_idx == -1) {
 2410       return false;
 2411     }
 2412     // Memory operation must be the other edge
 2413     int op1_mop_idx = (op1_op2_idx & 1) + 1;
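          // ((op1_op2_idx & 1) + 1) maps 1 -> 2 and 2 -> 1, i.e. selects the
          // other input of _op1_node.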
 2414 
 2415     // Check that the mop node is really what we want
 2416     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2417       Node* op2_node = _op1_node->in(op1_op2_idx);
 2418       if (op2_node->outcnt() > 1) {
 2419         return false;
 2420       }
 2421       assert(op2_node->Opcode() == op2, "Should be");
 2422       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2423       if (op2_con_idx == -1) {
 2424         return false;
 2425       }
 2426       // Memory operation must be the other edge
 2427       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2428       // Check that the memory operation is the same node
 2429       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2430         // Now check the constant
 2431         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2432         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2433           return true;
 2434         }
 2435       }
 2436     }
 2437     return false;
 2438   }
 2439 };
 2440 
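      // The match calls below recognize the BMI1 idioms:
      //   blsi:   x & (-x)     -> (And (Sub 0 load) load)
      //   blsr:   x & (x - 1)  -> (And (Add load -1) load)
      //   blsmsk: x ^ (x - 1)  -> (Xor (Add load -1) load)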
 2441 static bool is_bmi_pattern(Node* n, Node* m) {
 2442   assert(UseBMI1Instructions, "sanity");
 2443   if (n != nullptr && m != nullptr) {
 2444     if (m->Opcode() == Op_LoadI) {
 2445       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2446       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2447              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2448              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2449     } else if (m->Opcode() == Op_LoadL) {
 2450       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2451       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2452              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2453              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2454     }
 2455   }
 2456   return false;
 2457 }
 2458 
 2459 // Should the matcher clone input 'm' of node 'n'?
 2460 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2461   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2462   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2463     mstack.push(m, Visit);
 2464     return true;
 2465   }
 2466   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2467     mstack.push(m, Visit);           // m = ShiftCntV
 2468     return true;
 2469   }
 2470   if (is_encode_and_store_pattern(n, m)) {
 2471     mstack.push(m, Visit);
 2472     return true;
 2473   }
 2474   return false;
 2475 }
 2476 
 2477 // Should the Matcher clone shifts on addressing modes, expecting them
 2478 // to be subsumed into complex addressing expressions or compute them
 2479 // into registers?
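      // For example, an address like (base + (index << 2)) + disp can be folded
      // into a single [base + index*4 + disp] x86 memory operand.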
 2480 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2481   Node *off = m->in(AddPNode::Offset);
 2482   if (off->is_Con()) {
 2483     address_visited.test_set(m->_idx); // Flag as address_visited
 2484     Node *adr = m->in(AddPNode::Address);
 2485 
 2486     // Intel can handle 2 adds in addressing mode
 2487     // AtomicAdd is not an addressing expression.
 2488     // Cheap to find it by looking for screwy base.
 2489     if (adr->is_AddP() &&
 2490         !adr->in(AddPNode::Base)->is_top() &&
 2491         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2492         // Are there other uses besides address expressions?
 2493         !is_visited(adr)) {
 2494       address_visited.set(adr->_idx); // Flag as address_visited
 2495       Node *shift = adr->in(AddPNode::Offset);
 2496       if (!clone_shift(shift, this, mstack, address_visited)) {
 2497         mstack.push(shift, Pre_Visit);
 2498       }
 2499       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2500       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2501     } else {
 2502       mstack.push(adr, Pre_Visit);
 2503     }
 2504 
 2505     // Clone X+offset as it also folds into most addressing expressions
 2506     mstack.push(off, Visit);
 2507     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2508     return true;
 2509   } else if (clone_shift(off, this, mstack, address_visited)) {
 2510     address_visited.test_set(m->_idx); // Flag as address_visited
 2511     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2512     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2513     return true;
 2514   }
 2515   return false;
 2516 }
 2517 
 2518 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2519   switch (bt) {
 2520     case BoolTest::eq:
 2521       return Assembler::eq;
 2522     case BoolTest::ne:
 2523       return Assembler::neq;
 2524     case BoolTest::le:
 2525     case BoolTest::ule:
 2526       return Assembler::le;
 2527     case BoolTest::ge:
 2528     case BoolTest::uge:
 2529       return Assembler::nlt;
 2530     case BoolTest::lt:
 2531     case BoolTest::ult:
 2532       return Assembler::lt;
 2533     case BoolTest::gt:
 2534     case BoolTest::ugt:
 2535       return Assembler::nle;
 2536     default: ShouldNotReachHere(); return Assembler::_false;
 2537   }
 2538 }
 2539 
 2540 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2541   switch (bt) {
 2542   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2543   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2544   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2545   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2546   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2547   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2548   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2549   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2550   }
 2551 }
 2552 
 2553 // Helper methods for MachSpillCopyNode::implementation().
 2554 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2555                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2556   assert(ireg == Op_VecS || // 32bit vector
 2557          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2558           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2559          "no non-adjacent vector moves" );
 2560   if (masm) {
 2561     switch (ireg) {
 2562     case Op_VecS: // copy whole register
 2563     case Op_VecD:
 2564     case Op_VecX:
 2565 #ifndef _LP64
 2566       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2567 #else
 2568       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2569         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2570       } else {
 2571         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2572       }
 2573 #endif
 2574       break;
 2575     case Op_VecY:
 2576 #ifndef _LP64
 2577       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2578 #else
 2579       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2580         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2581       } else {
 2582         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2583       }
 2584 #endif
 2585       break;
 2586     case Op_VecZ:
 2587       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2588       break;
 2589     default:
 2590       ShouldNotReachHere();
 2591     }
 2592 #ifndef PRODUCT
 2593   } else {
 2594     switch (ireg) {
 2595     case Op_VecS:
 2596     case Op_VecD:
 2597     case Op_VecX:
 2598       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2599       break;
 2600     case Op_VecY:
 2601     case Op_VecZ:
 2602       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2603       break;
 2604     default:
 2605       ShouldNotReachHere();
 2606     }
 2607 #endif
 2608   }
 2609 }
 2610 
 2611 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2612                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2613   if (masm) {
 2614     if (is_load) {
 2615       switch (ireg) {
 2616       case Op_VecS:
 2617         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2618         break;
 2619       case Op_VecD:
 2620         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2621         break;
 2622       case Op_VecX:
 2623 #ifndef _LP64
 2624         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2625 #else
 2626         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2627           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2628         } else {
 2629           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2630           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2631         }
 2632 #endif
 2633         break;
 2634       case Op_VecY:
 2635 #ifndef _LP64
 2636         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2637 #else
 2638         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2639           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2640         } else {
 2641           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2642           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2643         }
 2644 #endif
 2645         break;
 2646       case Op_VecZ:
 2647         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2648         break;
 2649       default:
 2650         ShouldNotReachHere();
 2651       }
 2652     } else { // store
 2653       switch (ireg) {
 2654       case Op_VecS:
 2655         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2656         break;
 2657       case Op_VecD:
 2658         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2659         break;
 2660       case Op_VecX:
 2661 #ifndef _LP64
 2662         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2663 #else
 2664         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2665           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2666         } else {
 2668           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2669         }
 2670 #endif
 2671         break;
 2672       case Op_VecY:
 2673 #ifndef _LP64
 2674         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2675 #else
 2676         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2677           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2678         } else {
 2680           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2681         }
 2682 #endif
 2683         break;
 2684       case Op_VecZ:
 2685         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2686         break;
 2687       default:
 2688         ShouldNotReachHere();
 2689       }
 2690     }
 2691 #ifndef PRODUCT
 2692   } else {
 2693     if (is_load) {
 2694       switch (ireg) {
 2695       case Op_VecS:
 2696         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2697         break;
 2698       case Op_VecD:
 2699         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2700         break;
 2701       case Op_VecX:
 2702         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2703         break;
 2704       case Op_VecY:
 2705       case Op_VecZ:
 2706         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2707         break;
 2708       default:
 2709         ShouldNotReachHere();
 2710       }
 2711     } else { // store
 2712       switch (ireg) {
 2713       case Op_VecS:
 2714         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2715         break;
 2716       case Op_VecD:
 2717         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2718         break;
 2719       case Op_VecX:
 2720         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2721         break;
 2722       case Op_VecY:
 2723       case Op_VecZ:
 2724         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2725         break;
 2726       default:
 2727         ShouldNotReachHere();
 2728       }
 2729     }
 2730 #endif
 2731   }
 2732 }
 2733 
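      // Builds a GrowableArray holding 'len' copies of the immediate value 'con',
      // interpreted as basic type 'bt'.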
 2734 template <class T>
 2735 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2736   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2737   jvalue ele;
 2738   switch (bt) {
 2739     case T_BYTE:   ele.b = con; break;
 2740     case T_SHORT:  ele.s = con; break;
 2741     case T_INT:    ele.i = con; break;
 2742     case T_LONG:   ele.j = con; break;
 2743     case T_FLOAT:  ele.f = con; break;
 2744     case T_DOUBLE: ele.d = con; break;
 2745     default: ShouldNotReachHere();
 2746   }
 2747   for (int i = 0; i < len; i++) {
 2748     val->append(ele);
 2749   }
 2750   return val;
 2751 }
 2752 
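      // Returns a 64-bit pattern in which the most significant (sign) bit of every
      // 'bt'-sized lane is set, e.g. 0x80 replicated in each byte for T_BYTE.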
 2753 static inline jlong high_bit_set(BasicType bt) {
 2754   switch (bt) {
 2755     case T_BYTE:  return 0x8080808080808080;
 2756     case T_SHORT: return 0x8000800080008000;
 2757     case T_INT:   return 0x8000000080000000;
 2758     case T_LONG:  return 0x8000000000000000;
 2759     default:
 2760       ShouldNotReachHere();
 2761       return 0;
 2762   }
 2763 }
 2764 
 2765 #ifndef PRODUCT
 2766   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2767     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2768   }
 2769 #endif
 2770 
 2771   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2772     __ nop(_count);
 2773   }
 2774 
 2775   uint MachNopNode::size(PhaseRegAlloc*) const {
 2776     return _count;
 2777   }
 2778 
 2779 #ifndef PRODUCT
 2780   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2781     st->print("# breakpoint");
 2782   }
 2783 #endif
 2784 
 2785   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2786     __ int3();
 2787   }
 2788 
 2789   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2790     return MachNode::size(ra_);
 2791   }
 2792 
 2793 %}
 2794 
 2795 encode %{
 2796 
 2797   enc_class call_epilog %{
 2798     if (VerifyStackAtCalls) {
 2799       // Check that stack depth is unchanged: find magic cookie on stack
 2800       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2801       Label L;
 2802       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2803       __ jccb(Assembler::equal, L);
 2804       // Die if stack mismatch
 2805       __ int3();
 2806       __ bind(L);
 2807     }
 2808   %}
 2809 
 2810 %}
 2811 
 2812 // Operands for bound floating-point register arguments
 2813 operand rxmm0() %{
 2814   constraint(ALLOC_IN_RC(xmm0_reg));
 2815   match(VecX);
 2816   format%{%}
 2817   interface(REG_INTER);
 2818 %}
 2819 
 2820 //----------OPERANDS-----------------------------------------------------------
 2821 // Operand definitions must precede instruction definitions for correct parsing
 2822 // in the ADLC because operands constitute user-defined types which are used in
 2823 // instruction definitions.
 2824 
 2825 // Vectors
 2826 
 2827 // Dummy generic vector class. Should be used for all vector operands.
 2828 // Replaced with vec[SDXYZ] during post-selection pass.
 2829 operand vec() %{
 2830   constraint(ALLOC_IN_RC(dynamic));
 2831   match(VecX);
 2832   match(VecY);
 2833   match(VecZ);
 2834   match(VecS);
 2835   match(VecD);
 2836 
 2837   format %{ %}
 2838   interface(REG_INTER);
 2839 %}
 2840 
 2841 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2842 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2843 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2844 // runtime code generation via reg_class_dynamic.
 2845 operand legVec() %{
 2846   constraint(ALLOC_IN_RC(dynamic));
 2847   match(VecX);
 2848   match(VecY);
 2849   match(VecZ);
 2850   match(VecS);
 2851   match(VecD);
 2852 
 2853   format %{ %}
 2854   interface(REG_INTER);
 2855 %}
 2856 
 2857 // Replaces vec during post-selection cleanup. See above.
 2858 operand vecS() %{
 2859   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2860   match(VecS);
 2861 
 2862   format %{ %}
 2863   interface(REG_INTER);
 2864 %}
 2865 
 2866 // Replaces legVec during post-selection cleanup. See above.
 2867 operand legVecS() %{
 2868   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2869   match(VecS);
 2870 
 2871   format %{ %}
 2872   interface(REG_INTER);
 2873 %}
 2874 
 2875 // Replaces vec during post-selection cleanup. See above.
 2876 operand vecD() %{
 2877   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2878   match(VecD);
 2879 
 2880   format %{ %}
 2881   interface(REG_INTER);
 2882 %}
 2883 
 2884 // Replaces legVec during post-selection cleanup. See above.
 2885 operand legVecD() %{
 2886   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2887   match(VecD);
 2888 
 2889   format %{ %}
 2890   interface(REG_INTER);
 2891 %}
 2892 
 2893 // Replaces vec during post-selection cleanup. See above.
 2894 operand vecX() %{
 2895   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2896   match(VecX);
 2897 
 2898   format %{ %}
 2899   interface(REG_INTER);
 2900 %}
 2901 
 2902 // Replaces legVec during post-selection cleanup. See above.
 2903 operand legVecX() %{
 2904   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2905   match(VecX);
 2906 
 2907   format %{ %}
 2908   interface(REG_INTER);
 2909 %}
 2910 
 2911 // Replaces vec during post-selection cleanup. See above.
 2912 operand vecY() %{
 2913   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2914   match(VecY);
 2915 
 2916   format %{ %}
 2917   interface(REG_INTER);
 2918 %}
 2919 
 2920 // Replaces legVec during post-selection cleanup. See above.
 2921 operand legVecY() %{
 2922   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2923   match(VecY);
 2924 
 2925   format %{ %}
 2926   interface(REG_INTER);
 2927 %}
 2928 
 2929 // Replaces vec during post-selection cleanup. See above.
 2930 operand vecZ() %{
 2931   constraint(ALLOC_IN_RC(vectorz_reg));
 2932   match(VecZ);
 2933 
 2934   format %{ %}
 2935   interface(REG_INTER);
 2936 %}
 2937 
 2938 // Replaces legVec during post-selection cleanup. See above.
 2939 operand legVecZ() %{
 2940   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2941   match(VecZ);
 2942 
 2943   format %{ %}
 2944   interface(REG_INTER);
 2945 %}
 2946 
 2947 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2948 
 2949 // ============================================================================
 2950 
 2951 instruct ShouldNotReachHere() %{
 2952   match(Halt);
 2953   format %{ "stop\t# ShouldNotReachHere" %}
 2954   ins_encode %{
 2955     if (is_reachable()) {
 2956       __ stop(_halt_reason);
 2957     }
 2958   %}
 2959   ins_pipe(pipe_slow);
 2960 %}
 2961 
 2962 // ============================================================================
 2963 
 2964 instruct addF_reg(regF dst, regF src) %{
 2965   predicate((UseSSE>=1) && (UseAVX == 0));
 2966   match(Set dst (AddF dst src));
 2967 
 2968   format %{ "addss   $dst, $src" %}
 2969   ins_cost(150);
 2970   ins_encode %{
 2971     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2972   %}
 2973   ins_pipe(pipe_slow);
 2974 %}
 2975 
 2976 instruct addF_mem(regF dst, memory src) %{
 2977   predicate((UseSSE>=1) && (UseAVX == 0));
 2978   match(Set dst (AddF dst (LoadF src)));
 2979 
 2980   format %{ "addss   $dst, $src" %}
 2981   ins_cost(150);
 2982   ins_encode %{
 2983     __ addss($dst$$XMMRegister, $src$$Address);
 2984   %}
 2985   ins_pipe(pipe_slow);
 2986 %}
 2987 
 2988 instruct addF_imm(regF dst, immF con) %{
 2989   predicate((UseSSE>=1) && (UseAVX == 0));
 2990   match(Set dst (AddF dst con));
 2991   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2992   ins_cost(150);
 2993   ins_encode %{
 2994     __ addss($dst$$XMMRegister, $constantaddress($con));
 2995   %}
 2996   ins_pipe(pipe_slow);
 2997 %}
 2998 
 2999 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3000   predicate(UseAVX > 0);
 3001   match(Set dst (AddF src1 src2));
 3002 
 3003   format %{ "vaddss  $dst, $src1, $src2" %}
 3004   ins_cost(150);
 3005   ins_encode %{
 3006     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3007   %}
 3008   ins_pipe(pipe_slow);
 3009 %}
 3010 
 3011 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3012   predicate(UseAVX > 0);
 3013   match(Set dst (AddF src1 (LoadF src2)));
 3014 
 3015   format %{ "vaddss  $dst, $src1, $src2" %}
 3016   ins_cost(150);
 3017   ins_encode %{
 3018     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3019   %}
 3020   ins_pipe(pipe_slow);
 3021 %}
 3022 
 3023 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3024   predicate(UseAVX > 0);
 3025   match(Set dst (AddF src con));
 3026 
 3027   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3028   ins_cost(150);
 3029   ins_encode %{
 3030     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3031   %}
 3032   ins_pipe(pipe_slow);
 3033 %}
 3034 
 3035 instruct addD_reg(regD dst, regD src) %{
 3036   predicate((UseSSE>=2) && (UseAVX == 0));
 3037   match(Set dst (AddD dst src));
 3038 
 3039   format %{ "addsd   $dst, $src" %}
 3040   ins_cost(150);
 3041   ins_encode %{
 3042     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3043   %}
 3044   ins_pipe(pipe_slow);
 3045 %}
 3046 
 3047 instruct addD_mem(regD dst, memory src) %{
 3048   predicate((UseSSE>=2) && (UseAVX == 0));
 3049   match(Set dst (AddD dst (LoadD src)));
 3050 
 3051   format %{ "addsd   $dst, $src" %}
 3052   ins_cost(150);
 3053   ins_encode %{
 3054     __ addsd($dst$$XMMRegister, $src$$Address);
 3055   %}
 3056   ins_pipe(pipe_slow);
 3057 %}
 3058 
 3059 instruct addD_imm(regD dst, immD con) %{
 3060   predicate((UseSSE>=2) && (UseAVX == 0));
 3061   match(Set dst (AddD dst con));
 3062   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3063   ins_cost(150);
 3064   ins_encode %{
 3065     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3066   %}
 3067   ins_pipe(pipe_slow);
 3068 %}
 3069 
 3070 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3071   predicate(UseAVX > 0);
 3072   match(Set dst (AddD src1 src2));
 3073 
 3074   format %{ "vaddsd  $dst, $src1, $src2" %}
 3075   ins_cost(150);
 3076   ins_encode %{
 3077     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3078   %}
 3079   ins_pipe(pipe_slow);
 3080 %}
 3081 
 3082 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3083   predicate(UseAVX > 0);
 3084   match(Set dst (AddD src1 (LoadD src2)));
 3085 
 3086   format %{ "vaddsd  $dst, $src1, $src2" %}
 3087   ins_cost(150);
 3088   ins_encode %{
 3089     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3090   %}
 3091   ins_pipe(pipe_slow);
 3092 %}
 3093 
 3094 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3095   predicate(UseAVX > 0);
 3096   match(Set dst (AddD src con));
 3097 
 3098   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3099   ins_cost(150);
 3100   ins_encode %{
 3101     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3102   %}
 3103   ins_pipe(pipe_slow);
 3104 %}
 3105 
 3106 instruct subF_reg(regF dst, regF src) %{
 3107   predicate((UseSSE>=1) && (UseAVX == 0));
 3108   match(Set dst (SubF dst src));
 3109 
 3110   format %{ "subss   $dst, $src" %}
 3111   ins_cost(150);
 3112   ins_encode %{
 3113     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3114   %}
 3115   ins_pipe(pipe_slow);
 3116 %}
 3117 
 3118 instruct subF_mem(regF dst, memory src) %{
 3119   predicate((UseSSE>=1) && (UseAVX == 0));
 3120   match(Set dst (SubF dst (LoadF src)));
 3121 
 3122   format %{ "subss   $dst, $src" %}
 3123   ins_cost(150);
 3124   ins_encode %{
 3125     __ subss($dst$$XMMRegister, $src$$Address);
 3126   %}
 3127   ins_pipe(pipe_slow);
 3128 %}
 3129 
 3130 instruct subF_imm(regF dst, immF con) %{
 3131   predicate((UseSSE>=1) && (UseAVX == 0));
 3132   match(Set dst (SubF dst con));
 3133   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3134   ins_cost(150);
 3135   ins_encode %{
 3136     __ subss($dst$$XMMRegister, $constantaddress($con));
 3137   %}
 3138   ins_pipe(pipe_slow);
 3139 %}
 3140 
 3141 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3142   predicate(UseAVX > 0);
 3143   match(Set dst (SubF src1 src2));
 3144 
 3145   format %{ "vsubss  $dst, $src1, $src2" %}
 3146   ins_cost(150);
 3147   ins_encode %{
 3148     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3149   %}
 3150   ins_pipe(pipe_slow);
 3151 %}
 3152 
 3153 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3154   predicate(UseAVX > 0);
 3155   match(Set dst (SubF src1 (LoadF src2)));
 3156 
 3157   format %{ "vsubss  $dst, $src1, $src2" %}
 3158   ins_cost(150);
 3159   ins_encode %{
 3160     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3161   %}
 3162   ins_pipe(pipe_slow);
 3163 %}
 3164 
 3165 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3166   predicate(UseAVX > 0);
 3167   match(Set dst (SubF src con));
 3168 
 3169   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3170   ins_cost(150);
 3171   ins_encode %{
 3172     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3173   %}
 3174   ins_pipe(pipe_slow);
 3175 %}
 3176 
 3177 instruct subD_reg(regD dst, regD src) %{
 3178   predicate((UseSSE>=2) && (UseAVX == 0));
 3179   match(Set dst (SubD dst src));
 3180 
 3181   format %{ "subsd   $dst, $src" %}
 3182   ins_cost(150);
 3183   ins_encode %{
 3184     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3185   %}
 3186   ins_pipe(pipe_slow);
 3187 %}
 3188 
 3189 instruct subD_mem(regD dst, memory src) %{
 3190   predicate((UseSSE>=2) && (UseAVX == 0));
 3191   match(Set dst (SubD dst (LoadD src)));
 3192 
 3193   format %{ "subsd   $dst, $src" %}
 3194   ins_cost(150);
 3195   ins_encode %{
 3196     __ subsd($dst$$XMMRegister, $src$$Address);
 3197   %}
 3198   ins_pipe(pipe_slow);
 3199 %}
 3200 
 3201 instruct subD_imm(regD dst, immD con) %{
 3202   predicate((UseSSE>=2) && (UseAVX == 0));
 3203   match(Set dst (SubD dst con));
 3204   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3205   ins_cost(150);
 3206   ins_encode %{
 3207     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3208   %}
 3209   ins_pipe(pipe_slow);
 3210 %}
 3211 
 3212 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3213   predicate(UseAVX > 0);
 3214   match(Set dst (SubD src1 src2));
 3215 
 3216   format %{ "vsubsd  $dst, $src1, $src2" %}
 3217   ins_cost(150);
 3218   ins_encode %{
 3219     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3220   %}
 3221   ins_pipe(pipe_slow);
 3222 %}
 3223 
 3224 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3225   predicate(UseAVX > 0);
 3226   match(Set dst (SubD src1 (LoadD src2)));
 3227 
 3228   format %{ "vsubsd  $dst, $src1, $src2" %}
 3229   ins_cost(150);
 3230   ins_encode %{
 3231     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3232   %}
 3233   ins_pipe(pipe_slow);
 3234 %}
 3235 
 3236 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3237   predicate(UseAVX > 0);
 3238   match(Set dst (SubD src con));
 3239 
 3240   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3241   ins_cost(150);
 3242   ins_encode %{
 3243     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3244   %}
 3245   ins_pipe(pipe_slow);
 3246 %}
 3247 
 3248 instruct mulF_reg(regF dst, regF src) %{
 3249   predicate((UseSSE>=1) && (UseAVX == 0));
 3250   match(Set dst (MulF dst src));
 3251 
 3252   format %{ "mulss   $dst, $src" %}
 3253   ins_cost(150);
 3254   ins_encode %{
 3255     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3256   %}
 3257   ins_pipe(pipe_slow);
 3258 %}
 3259 
 3260 instruct mulF_mem(regF dst, memory src) %{
 3261   predicate((UseSSE>=1) && (UseAVX == 0));
 3262   match(Set dst (MulF dst (LoadF src)));
 3263 
 3264   format %{ "mulss   $dst, $src" %}
 3265   ins_cost(150);
 3266   ins_encode %{
 3267     __ mulss($dst$$XMMRegister, $src$$Address);
 3268   %}
 3269   ins_pipe(pipe_slow);
 3270 %}
 3271 
 3272 instruct mulF_imm(regF dst, immF con) %{
 3273   predicate((UseSSE>=1) && (UseAVX == 0));
 3274   match(Set dst (MulF dst con));
 3275   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3276   ins_cost(150);
 3277   ins_encode %{
 3278     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3279   %}
 3280   ins_pipe(pipe_slow);
 3281 %}
 3282 
 3283 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3284   predicate(UseAVX > 0);
 3285   match(Set dst (MulF src1 src2));
 3286 
 3287   format %{ "vmulss  $dst, $src1, $src2" %}
 3288   ins_cost(150);
 3289   ins_encode %{
 3290     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3291   %}
 3292   ins_pipe(pipe_slow);
 3293 %}
 3294 
 3295 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3296   predicate(UseAVX > 0);
 3297   match(Set dst (MulF src1 (LoadF src2)));
 3298 
 3299   format %{ "vmulss  $dst, $src1, $src2" %}
 3300   ins_cost(150);
 3301   ins_encode %{
 3302     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3303   %}
 3304   ins_pipe(pipe_slow);
 3305 %}
 3306 
 3307 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3308   predicate(UseAVX > 0);
 3309   match(Set dst (MulF src con));
 3310 
 3311   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3312   ins_cost(150);
 3313   ins_encode %{
 3314     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3315   %}
 3316   ins_pipe(pipe_slow);
 3317 %}
 3318 
 3319 instruct mulD_reg(regD dst, regD src) %{
 3320   predicate((UseSSE>=2) && (UseAVX == 0));
 3321   match(Set dst (MulD dst src));
 3322 
 3323   format %{ "mulsd   $dst, $src" %}
 3324   ins_cost(150);
 3325   ins_encode %{
 3326     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3327   %}
 3328   ins_pipe(pipe_slow);
 3329 %}
 3330 
 3331 instruct mulD_mem(regD dst, memory src) %{
 3332   predicate((UseSSE>=2) && (UseAVX == 0));
 3333   match(Set dst (MulD dst (LoadD src)));
 3334 
 3335   format %{ "mulsd   $dst, $src" %}
 3336   ins_cost(150);
 3337   ins_encode %{
 3338     __ mulsd($dst$$XMMRegister, $src$$Address);
 3339   %}
 3340   ins_pipe(pipe_slow);
 3341 %}
 3342 
 3343 instruct mulD_imm(regD dst, immD con) %{
 3344   predicate((UseSSE>=2) && (UseAVX == 0));
 3345   match(Set dst (MulD dst con));
 3346   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3347   ins_cost(150);
 3348   ins_encode %{
 3349     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3350   %}
 3351   ins_pipe(pipe_slow);
 3352 %}
 3353 
 3354 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3355   predicate(UseAVX > 0);
 3356   match(Set dst (MulD src1 src2));
 3357 
 3358   format %{ "vmulsd  $dst, $src1, $src2" %}
 3359   ins_cost(150);
 3360   ins_encode %{
 3361     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3362   %}
 3363   ins_pipe(pipe_slow);
 3364 %}
 3365 
 3366 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3367   predicate(UseAVX > 0);
 3368   match(Set dst (MulD src1 (LoadD src2)));
 3369 
 3370   format %{ "vmulsd  $dst, $src1, $src2" %}
 3371   ins_cost(150);
 3372   ins_encode %{
 3373     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3374   %}
 3375   ins_pipe(pipe_slow);
 3376 %}
 3377 
 3378 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3379   predicate(UseAVX > 0);
 3380   match(Set dst (MulD src con));
 3381 
 3382   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3383   ins_cost(150);
 3384   ins_encode %{
 3385     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3386   %}
 3387   ins_pipe(pipe_slow);
 3388 %}
 3389 
 3390 instruct divF_reg(regF dst, regF src) %{
 3391   predicate((UseSSE>=1) && (UseAVX == 0));
 3392   match(Set dst (DivF dst src));
 3393 
 3394   format %{ "divss   $dst, $src" %}
 3395   ins_cost(150);
 3396   ins_encode %{
 3397     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3398   %}
 3399   ins_pipe(pipe_slow);
 3400 %}
 3401 
 3402 instruct divF_mem(regF dst, memory src) %{
 3403   predicate((UseSSE>=1) && (UseAVX == 0));
 3404   match(Set dst (DivF dst (LoadF src)));
 3405 
 3406   format %{ "divss   $dst, $src" %}
 3407   ins_cost(150);
 3408   ins_encode %{
 3409     __ divss($dst$$XMMRegister, $src$$Address);
 3410   %}
 3411   ins_pipe(pipe_slow);
 3412 %}
 3413 
 3414 instruct divF_imm(regF dst, immF con) %{
 3415   predicate((UseSSE>=1) && (UseAVX == 0));
 3416   match(Set dst (DivF dst con));
 3417   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3418   ins_cost(150);
 3419   ins_encode %{
 3420     __ divss($dst$$XMMRegister, $constantaddress($con));
 3421   %}
 3422   ins_pipe(pipe_slow);
 3423 %}
 3424 
 3425 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3426   predicate(UseAVX > 0);
 3427   match(Set dst (DivF src1 src2));
 3428 
 3429   format %{ "vdivss  $dst, $src1, $src2" %}
 3430   ins_cost(150);
 3431   ins_encode %{
 3432     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3433   %}
 3434   ins_pipe(pipe_slow);
 3435 %}
 3436 
 3437 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3438   predicate(UseAVX > 0);
 3439   match(Set dst (DivF src1 (LoadF src2)));
 3440 
 3441   format %{ "vdivss  $dst, $src1, $src2" %}
 3442   ins_cost(150);
 3443   ins_encode %{
 3444     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3445   %}
 3446   ins_pipe(pipe_slow);
 3447 %}
 3448 
 3449 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3450   predicate(UseAVX > 0);
 3451   match(Set dst (DivF src con));
 3452 
 3453   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3454   ins_cost(150);
 3455   ins_encode %{
 3456     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3457   %}
 3458   ins_pipe(pipe_slow);
 3459 %}
 3460 
 3461 instruct divD_reg(regD dst, regD src) %{
 3462   predicate((UseSSE>=2) && (UseAVX == 0));
 3463   match(Set dst (DivD dst src));
 3464 
 3465   format %{ "divsd   $dst, $src" %}
 3466   ins_cost(150);
 3467   ins_encode %{
 3468     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3469   %}
 3470   ins_pipe(pipe_slow);
 3471 %}
 3472 
 3473 instruct divD_mem(regD dst, memory src) %{
 3474   predicate((UseSSE>=2) && (UseAVX == 0));
 3475   match(Set dst (DivD dst (LoadD src)));
 3476 
 3477   format %{ "divsd   $dst, $src" %}
 3478   ins_cost(150);
 3479   ins_encode %{
 3480     __ divsd($dst$$XMMRegister, $src$$Address);
 3481   %}
 3482   ins_pipe(pipe_slow);
 3483 %}
 3484 
 3485 instruct divD_imm(regD dst, immD con) %{
 3486   predicate((UseSSE>=2) && (UseAVX == 0));
 3487   match(Set dst (DivD dst con));
 3488   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3489   ins_cost(150);
 3490   ins_encode %{
 3491     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3492   %}
 3493   ins_pipe(pipe_slow);
 3494 %}
 3495 
 3496 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3497   predicate(UseAVX > 0);
 3498   match(Set dst (DivD src1 src2));
 3499 
 3500   format %{ "vdivsd  $dst, $src1, $src2" %}
 3501   ins_cost(150);
 3502   ins_encode %{
 3503     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3504   %}
 3505   ins_pipe(pipe_slow);
 3506 %}
 3507 
 3508 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3509   predicate(UseAVX > 0);
 3510   match(Set dst (DivD src1 (LoadD src2)));
 3511 
 3512   format %{ "vdivsd  $dst, $src1, $src2" %}
 3513   ins_cost(150);
 3514   ins_encode %{
 3515     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3516   %}
 3517   ins_pipe(pipe_slow);
 3518 %}
 3519 
 3520 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3521   predicate(UseAVX > 0);
 3522   match(Set dst (DivD src con));
 3523 
 3524   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3525   ins_cost(150);
 3526   ins_encode %{
 3527     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3528   %}
 3529   ins_pipe(pipe_slow);
 3530 %}
 3531 
 3532 instruct absF_reg(regF dst) %{
 3533   predicate((UseSSE>=1) && (UseAVX == 0));
 3534   match(Set dst (AbsF dst));
 3535   ins_cost(150);
 3536   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3537   ins_encode %{
 3538     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3539   %}
 3540   ins_pipe(pipe_slow);
 3541 %}
 3542 
 3543 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3544   predicate(UseAVX > 0);
 3545   match(Set dst (AbsF src));
 3546   ins_cost(150);
 3547   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3548   ins_encode %{
 3549     int vlen_enc = Assembler::AVX_128bit;
 3550     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3551               ExternalAddress(float_signmask()), vlen_enc);
 3552   %}
 3553   ins_pipe(pipe_slow);
 3554 %}
 3555 
 3556 instruct absD_reg(regD dst) %{
 3557   predicate((UseSSE>=2) && (UseAVX == 0));
 3558   match(Set dst (AbsD dst));
 3559   ins_cost(150);
 3560   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3561             "# abs double by sign masking" %}
 3562   ins_encode %{
 3563     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3564   %}
 3565   ins_pipe(pipe_slow);
 3566 %}
 3567 
 3568 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3569   predicate(UseAVX > 0);
 3570   match(Set dst (AbsD src));
 3571   ins_cost(150);
 3572   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3573             "# abs double by sign masking" %}
 3574   ins_encode %{
 3575     int vlen_enc = Assembler::AVX_128bit;
 3576     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3577               ExternalAddress(double_signmask()), vlen_enc);
 3578   %}
 3579   ins_pipe(pipe_slow);
 3580 %}
 3581 
 3582 instruct negF_reg(regF dst) %{
 3583   predicate((UseSSE>=1) && (UseAVX == 0));
 3584   match(Set dst (NegF dst));
 3585   ins_cost(150);
 3586   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3587   ins_encode %{
 3588     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3589   %}
 3590   ins_pipe(pipe_slow);
 3591 %}
 3592 
 3593 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3594   predicate(UseAVX > 0);
 3595   match(Set dst (NegF src));
 3596   ins_cost(150);
 3597   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3598   ins_encode %{
 3599     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3600                  ExternalAddress(float_signflip()));
 3601   %}
 3602   ins_pipe(pipe_slow);
 3603 %}
 3604 
 3605 instruct negD_reg(regD dst) %{
 3606   predicate((UseSSE>=2) && (UseAVX == 0));
 3607   match(Set dst (NegD dst));
 3608   ins_cost(150);
 3609   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3610             "# neg double by sign flipping" %}
 3611   ins_encode %{
 3612     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3613   %}
 3614   ins_pipe(pipe_slow);
 3615 %}
 3616 
 3617 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3618   predicate(UseAVX > 0);
 3619   match(Set dst (NegD src));
 3620   ins_cost(150);
 3621   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3622             "# neg double by sign flipping" %}
 3623   ins_encode %{
 3624     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3625                  ExternalAddress(double_signflip()));
 3626   %}
 3627   ins_pipe(pipe_slow);
 3628 %}
 3629 
 3630 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3631 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3632 instruct sqrtF_reg(regF dst) %{
 3633   predicate(UseSSE>=1);
 3634   match(Set dst (SqrtF dst));
 3635   format %{ "sqrtss  $dst, $dst" %}
 3636   ins_encode %{
 3637     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3638   %}
 3639   ins_pipe(pipe_slow);
 3640 %}
 3641 
 3642 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3643 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3644 instruct sqrtD_reg(regD dst) %{
 3645   predicate(UseSSE>=2);
 3646   match(Set dst (SqrtD dst));
 3647   format %{ "sqrtsd  $dst, $dst" %}
 3648   ins_encode %{
 3649     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3650   %}
 3651   ins_pipe(pipe_slow);
 3652 %}
 3653 
 3654 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3655   effect(TEMP tmp);
 3656   match(Set dst (ConvF2HF src));
 3657   ins_cost(125);
 3658   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3659   ins_encode %{
 3660     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3661   %}
 3662   ins_pipe( pipe_slow );
 3663 %}
 3664 
 3665 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3666   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3667   effect(TEMP ktmp, TEMP rtmp);
 3668   match(Set mem (StoreC mem (ConvF2HF src)));
 3669   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3670   ins_encode %{
 3671     __ movl($rtmp$$Register, 0x1);
 3672     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3673     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3674   %}
 3675   ins_pipe( pipe_slow );
 3676 %}
 3677 
 3678 instruct vconvF2HF(vec dst, vec src) %{
 3679   match(Set dst (VectorCastF2HF src));
 3680   format %{ "vector_conv_F2HF $dst $src" %}
 3681   ins_encode %{
 3682     int vlen_enc = vector_length_encoding(this, $src);
 3683     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3684   %}
 3685   ins_pipe( pipe_slow );
 3686 %}
 3687 
 3688 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3689   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3690   format %{ "vcvtps2ph $mem,$src" %}
 3691   ins_encode %{
 3692     int vlen_enc = vector_length_encoding(this, $src);
 3693     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3694   %}
 3695   ins_pipe( pipe_slow );
 3696 %}
 3697 
 3698 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3699   match(Set dst (ConvHF2F src));
 3700   format %{ "vcvtph2ps $dst,$src" %}
 3701   ins_encode %{
 3702     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3703   %}
 3704   ins_pipe( pipe_slow );
 3705 %}
 3706 
 3707 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3708   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3709   format %{ "vcvtph2ps $dst,$mem" %}
 3710   ins_encode %{
 3711     int vlen_enc = vector_length_encoding(this);
 3712     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3713   %}
 3714   ins_pipe( pipe_slow );
 3715 %}
 3716 
 3717 instruct vconvHF2F(vec dst, vec src) %{
 3718   match(Set dst (VectorCastHF2F src));
 3719   ins_cost(125);
 3720   format %{ "vector_conv_HF2F $dst,$src" %}
 3721   ins_encode %{
 3722     int vlen_enc = vector_length_encoding(this);
 3723     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3724   %}
 3725   ins_pipe( pipe_slow );
 3726 %}
 3727 
 3728 // ---------------------------------------- VectorReinterpret ------------------------------------
 3729 instruct reinterpret_mask(kReg dst) %{
 3730   predicate(n->bottom_type()->isa_vectmask() &&
 3731             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3732   match(Set dst (VectorReinterpret dst));
 3733   ins_cost(125);
 3734   format %{ "vector_reinterpret $dst\t!" %}
 3735   ins_encode %{
 3736     // empty
 3737   %}
 3738   ins_pipe( pipe_slow );
 3739 %}
 3740 
 3741 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3742   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3743             n->bottom_type()->isa_vectmask() &&
 3744             n->in(1)->bottom_type()->isa_vectmask() &&
 3745             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3746             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bits
 3747   match(Set dst (VectorReinterpret src));
 3748   effect(TEMP xtmp);
 3749   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3750   ins_encode %{
 3751      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3752      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3753      assert(src_sz == dst_sz , "src and dst size mismatch");
 3754      int vlen_enc = vector_length_encoding(src_sz);
 3755      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3756      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3757   %}
 3758   ins_pipe( pipe_slow );
 3759 %}
 3760 
 3761 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3762   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3763             n->bottom_type()->isa_vectmask() &&
 3764             n->in(1)->bottom_type()->isa_vectmask() &&
 3765             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3766              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3767             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bits
 3768   match(Set dst (VectorReinterpret src));
 3769   effect(TEMP xtmp);
 3770   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3771   ins_encode %{
 3772      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3773      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3774      assert(src_sz == dst_sz , "src and dst size mismatch");
 3775      int vlen_enc = vector_length_encoding(src_sz);
 3776      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3777      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3778   %}
 3779   ins_pipe( pipe_slow );
 3780 %}
 3781 
 3782 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3783   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3784             n->bottom_type()->isa_vectmask() &&
 3785             n->in(1)->bottom_type()->isa_vectmask() &&
 3786             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3787              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3788             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bits
 3789   match(Set dst (VectorReinterpret src));
 3790   effect(TEMP xtmp);
 3791   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3792   ins_encode %{
 3793      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3794      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3795      assert(src_sz == dst_sz , "src and dst size mismatch");
 3796      int vlen_enc = vector_length_encoding(src_sz);
 3797      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3798      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3799   %}
 3800   ins_pipe( pipe_slow );
 3801 %}
 3802 
 3803 instruct reinterpret(vec dst) %{
 3804   predicate(!n->bottom_type()->isa_vectmask() &&
 3805             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3806   match(Set dst (VectorReinterpret dst));
 3807   ins_cost(125);
 3808   format %{ "vector_reinterpret $dst\t!" %}
 3809   ins_encode %{
 3810     // empty
 3811   %}
 3812   ins_pipe( pipe_slow );
 3813 %}
 3814 
 3815 instruct reinterpret_expand(vec dst, vec src) %{
 3816   predicate(UseAVX == 0 &&
 3817             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3818   match(Set dst (VectorReinterpret src));
 3819   ins_cost(125);
 3820   effect(TEMP dst);
 3821   format %{ "vector_reinterpret_expand $dst,$src" %}
 3822   ins_encode %{
 3823     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3824     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3825 
 3826     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3827     if (src_vlen_in_bytes == 4) {
 3828       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3829     } else {
 3830       assert(src_vlen_in_bytes == 8, "");
 3831       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3832     }
 3833     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3834   %}
 3835   ins_pipe( pipe_slow );
 3836 %}
 3837 
 3838 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3839   predicate(UseAVX > 0 &&
 3840             !n->bottom_type()->isa_vectmask() &&
 3841             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3842             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3843   match(Set dst (VectorReinterpret src));
 3844   ins_cost(125);
 3845   format %{ "vector_reinterpret_expand $dst,$src" %}
 3846   ins_encode %{
 3847     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3848   %}
 3849   ins_pipe( pipe_slow );
 3850 %}
 3851 
 3852 
 3853 instruct vreinterpret_expand(legVec dst, vec src) %{
 3854   predicate(UseAVX > 0 &&
 3855             !n->bottom_type()->isa_vectmask() &&
 3856             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3857             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3858   match(Set dst (VectorReinterpret src));
 3859   ins_cost(125);
 3860   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3861   ins_encode %{
 3862     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3863       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3864       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3865       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3866       default: ShouldNotReachHere();
 3867     }
 3868   %}
 3869   ins_pipe( pipe_slow );
 3870 %}
 3871 
 3872 instruct reinterpret_shrink(vec dst, legVec src) %{
 3873   predicate(!n->bottom_type()->isa_vectmask() &&
 3874             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3875   match(Set dst (VectorReinterpret src));
 3876   ins_cost(125);
 3877   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3878   ins_encode %{
 3879     switch (Matcher::vector_length_in_bytes(this)) {
 3880       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3881       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3882       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3883       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3884       default: ShouldNotReachHere();
 3885     }
 3886   %}
 3887   ins_pipe( pipe_slow );
 3888 %}
 3889 
 3890 // ----------------------------------------------------------------------------------------------------
 3891 
 3892 #ifdef _LP64
 3893 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3894   match(Set dst (RoundDoubleMode src rmode));
 3895   format %{ "roundsd $dst,$src" %}
 3896   ins_cost(150);
 3897   ins_encode %{
 3898     assert(UseSSE >= 4, "required");
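    // roundsd writes only the low 64 bits of dst and leaves the rest unchanged,
    // so when dst != src on non-AVX targets dst is zeroed first to break the
    // false dependency on its previous contents.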
 3899     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3900       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3901     }
 3902     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3903   %}
 3904   ins_pipe(pipe_slow);
 3905 %}
 3906 
 3907 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3908   match(Set dst (RoundDoubleMode con rmode));
 3909   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3910   ins_cost(150);
 3911   ins_encode %{
 3912     assert(UseSSE >= 4, "required");
 3913     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3914   %}
 3915   ins_pipe(pipe_slow);
 3916 %}
 3917 
 3918 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3919   predicate(Matcher::vector_length(n) < 8);
 3920   match(Set dst (RoundDoubleModeV src rmode));
 3921   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3922   ins_encode %{
 3923     assert(UseAVX > 0, "required");
 3924     int vlen_enc = vector_length_encoding(this);
 3925     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3926   %}
 3927   ins_pipe( pipe_slow );
 3928 %}
 3929 
 3930 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3931   predicate(Matcher::vector_length(n) == 8);
 3932   match(Set dst (RoundDoubleModeV src rmode));
 3933   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3934   ins_encode %{
 3935     assert(UseAVX > 2, "required");
 3936     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3937   %}
 3938   ins_pipe( pipe_slow );
 3939 %}
 3940 
 3941 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3942   predicate(Matcher::vector_length(n) < 8);
 3943   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3944   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3945   ins_encode %{
 3946     assert(UseAVX > 0, "required");
 3947     int vlen_enc = vector_length_encoding(this);
 3948     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3949   %}
 3950   ins_pipe( pipe_slow );
 3951 %}
 3952 
 3953 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3954   predicate(Matcher::vector_length(n) == 8);
 3955   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3956   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3957   ins_encode %{
 3958     assert(UseAVX > 2, "required");
 3959     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3960   %}
 3961   ins_pipe( pipe_slow );
 3962 %}
 3963 #endif // _LP64
 3964 
 3965 instruct onspinwait() %{
 3966   match(OnSpinWait);
 3967   ins_cost(200);
 3968 
 3969   format %{
 3970     $$template
 3971     $$emit$$"pause\t! membar_onspinwait"
 3972   %}
 3973   ins_encode %{
 3974     __ pause();
 3975   %}
 3976   ins_pipe(pipe_slow);
 3977 %}
 3978 
 3979 // a * b + c
 3980 instruct fmaD_reg(regD a, regD b, regD c) %{
 3981   match(Set c (FmaD  c (Binary a b)));
 3982   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3983   ins_cost(150);
 3984   ins_encode %{
3985     assert(UseFMA, "Needs FMA instruction support.");
 3986     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3987   %}
 3988   ins_pipe( pipe_slow );
 3989 %}
 3990 
 3991 // a * b + c
 3992 instruct fmaF_reg(regF a, regF b, regF c) %{
 3993   match(Set c (FmaF  c (Binary a b)));
 3994   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3995   ins_cost(150);
 3996   ins_encode %{
3997     assert(UseFMA, "Needs FMA instruction support.");
 3998     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3999   %}
 4000   ins_pipe( pipe_slow );
 4001 %}
 4002 
 4003 // ====================VECTOR INSTRUCTIONS=====================================
 4004 
 4005 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4006 instruct MoveVec2Leg(legVec dst, vec src) %{
 4007   match(Set dst src);
 4008   format %{ "" %}
 4009   ins_encode %{
 4010     ShouldNotReachHere();
 4011   %}
 4012   ins_pipe( fpu_reg_reg );
 4013 %}
 4014 
 4015 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4016   match(Set dst src);
 4017   format %{ "" %}
 4018   ins_encode %{
 4019     ShouldNotReachHere();
 4020   %}
 4021   ins_pipe( fpu_reg_reg );
 4022 %}
 4023 
 4024 // ============================================================================
 4025 
4026 // Generic pattern for loading a vector of any supported size from memory
 4027 instruct loadV(vec dst, memory mem) %{
 4028   match(Set dst (LoadVector mem));
 4029   ins_cost(125);
 4030   format %{ "load_vector $dst,$mem" %}
 4031   ins_encode %{
 4032     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4033   %}
 4034   ins_pipe( pipe_slow );
 4035 %}
 4036 
4037 // Generic pattern for storing a vector of any supported size to memory.
 4038 instruct storeV(memory mem, vec src) %{
 4039   match(Set mem (StoreVector mem src));
 4040   ins_cost(145);
 4041   format %{ "store_vector $mem,$src\n\t" %}
 4042   ins_encode %{
 4043     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4044       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4045       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4046       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4047       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4048       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4049       default: ShouldNotReachHere();
 4050     }
 4051   %}
 4052   ins_pipe( pipe_slow );
 4053 %}
 4054 
 4055 // ---------------------------------------- Gather ------------------------------------
 4056 
 4057 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
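// The non-subword (INT/LONG/FLOAT/DOUBLE) patterns below use the hardware gather
// instructions through the vgather/evgather macro-assembler helpers. BYTE and
// SHORT have no hardware gather, so the subword patterns emulate the operation
// element by element via the vgather8b*/vgather_subword helpers.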
 4058 
 4059 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4060   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4061             Matcher::vector_length_in_bytes(n) <= 32);
 4062   match(Set dst (LoadVectorGather mem idx));
 4063   effect(TEMP dst, TEMP tmp, TEMP mask);
 4064   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4065   ins_encode %{
 4066     int vlen_enc = vector_length_encoding(this);
 4067     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4068     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4069     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4070     __ lea($tmp$$Register, $mem$$Address);
 4071     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4072   %}
 4073   ins_pipe( pipe_slow );
 4074 %}
 4075 
 4076 
 4077 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4078   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4079             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4080   match(Set dst (LoadVectorGather mem idx));
 4081   effect(TEMP dst, TEMP tmp, TEMP ktmp);
4082   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4083   ins_encode %{
 4084     int vlen_enc = vector_length_encoding(this);
 4085     BasicType elem_bt = Matcher::vector_element_basic_type(this);
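    // kxnor of a register with itself yields an all-ones opmask, i.e. gather every lane.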
 4086     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4087     __ lea($tmp$$Register, $mem$$Address);
 4088     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4089   %}
 4090   ins_pipe( pipe_slow );
 4091 %}
 4092 
 4093 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4094   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4095             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4096   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4097   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
4098   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4099   ins_encode %{
 4100     assert(UseAVX > 2, "sanity");
 4101     int vlen_enc = vector_length_encoding(this);
 4102     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4103     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4104     // Note: Since the gather instruction partially updates the opmask register used
4105     // for predication, the mask operand is first copied into a temporary.
 4106     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4107     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4108     __ lea($tmp$$Register, $mem$$Address);
 4109     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4110   %}
 4111   ins_pipe( pipe_slow );
 4112 %}
 4113 
 4114 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4115   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4116   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4117   effect(TEMP tmp, TEMP rtmp);
 4118   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4119   ins_encode %{
 4120     int vlen_enc = vector_length_encoding(this);
 4121     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4122     __ lea($tmp$$Register, $mem$$Address);
 4123     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4124   %}
 4125   ins_pipe( pipe_slow );
 4126 %}
 4127 
 4128 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4129                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4130   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4131   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4132   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4133   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4134   ins_encode %{
 4135     int vlen_enc = vector_length_encoding(this);
 4136     int vector_len = Matcher::vector_length(this);
 4137     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4138     __ lea($tmp$$Register, $mem$$Address);
 4139     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4140     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4141                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4142   %}
 4143   ins_pipe( pipe_slow );
 4144 %}
 4145 
 4146 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4147   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4148   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4149   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4150   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4151   ins_encode %{
 4152     int vlen_enc = vector_length_encoding(this);
 4153     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4154     __ lea($tmp$$Register, $mem$$Address);
 4155     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4156   %}
 4157   ins_pipe( pipe_slow );
 4158 %}
 4159 
 4160 
 4161 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4162                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4163   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4164   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4165   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4166   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4167   ins_encode %{
 4168     int vlen_enc = vector_length_encoding(this);
 4169     int vector_len = Matcher::vector_length(this);
 4170     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4171     __ lea($tmp$$Register, $mem$$Address);
 4172     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4173     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4174                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4175   %}
 4176   ins_pipe( pipe_slow );
 4177 %}
 4178 
 4179 
 4180 #ifdef _LP64
 4181 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4182   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4183   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4184   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4185   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4186   ins_encode %{
 4187     int vlen_enc = vector_length_encoding(this);
 4188     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4189     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4190     __ lea($tmp$$Register, $mem$$Address);
 4191     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4192     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4193   %}
 4194   ins_pipe( pipe_slow );
 4195 %}
 4196 
 4197 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4198                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4199   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4200   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4201   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4202   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4203   ins_encode %{
 4204     int vlen_enc = vector_length_encoding(this);
 4205     int vector_len = Matcher::vector_length(this);
 4206     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4207     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4208     __ lea($tmp$$Register, $mem$$Address);
 4209     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4210     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4211     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4212                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4213   %}
 4214   ins_pipe( pipe_slow );
 4215 %}
 4216 
 4217 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4218   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4219   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4220   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4221   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4222   ins_encode %{
 4223     int vlen_enc = vector_length_encoding(this);
 4224     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4225     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4226     __ lea($tmp$$Register, $mem$$Address);
 4227     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4228     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4229                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4230   %}
 4231   ins_pipe( pipe_slow );
 4232 %}
 4233 
 4234 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4235                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4236   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4237   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4238   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4239   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4240   ins_encode %{
 4241     int vlen_enc = vector_length_encoding(this);
 4242     int vector_len = Matcher::vector_length(this);
 4243     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4244     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4245     __ lea($tmp$$Register, $mem$$Address);
 4246     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4247     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4248     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4249                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4250   %}
 4251   ins_pipe( pipe_slow );
 4252 %}
 4253 
 4254 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4255   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4256   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4257   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4258   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4259   ins_encode %{
 4260     int vlen_enc = vector_length_encoding(this);
 4261     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4262     __ lea($tmp$$Register, $mem$$Address);
 4263     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
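    // vpmovmskb produces one mask bit per byte. For shorts each element contributes
    // two identical bits, so pext with 0x55555555 keeps every other bit and packs
    // the result down to one bit per element.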
 4264     if (elem_bt == T_SHORT) {
 4265       __ movl($mask_idx$$Register, 0x55555555);
 4266       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4267     }
 4268     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4269     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4270   %}
 4271   ins_pipe( pipe_slow );
 4272 %}
 4273 
 4274 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4275                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4276   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4277   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4278   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4279   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4280   ins_encode %{
 4281     int vlen_enc = vector_length_encoding(this);
 4282     int vector_len = Matcher::vector_length(this);
 4283     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4284     __ lea($tmp$$Register, $mem$$Address);
 4285     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4286     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4287     if (elem_bt == T_SHORT) {
 4288       __ movl($mask_idx$$Register, 0x55555555);
 4289       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4290     }
 4291     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4292     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4293                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4294   %}
 4295   ins_pipe( pipe_slow );
 4296 %}
 4297 
 4298 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4299   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4300   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4301   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4302   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4303   ins_encode %{
 4304     int vlen_enc = vector_length_encoding(this);
 4305     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4306     __ lea($tmp$$Register, $mem$$Address);
 4307     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4308     if (elem_bt == T_SHORT) {
 4309       __ movl($mask_idx$$Register, 0x55555555);
 4310       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4311     }
 4312     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4313     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4314                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4315   %}
 4316   ins_pipe( pipe_slow );
 4317 %}
 4318 
 4319 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4320                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4321   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4322   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4323   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4324   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4325   ins_encode %{
 4326     int vlen_enc = vector_length_encoding(this);
 4327     int vector_len = Matcher::vector_length(this);
 4328     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4329     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4330     __ lea($tmp$$Register, $mem$$Address);
 4331     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4332     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4333     if (elem_bt == T_SHORT) {
 4334       __ movl($mask_idx$$Register, 0x55555555);
 4335       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4336     }
 4337     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4338     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4339                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4340   %}
 4341   ins_pipe( pipe_slow );
 4342 %}
 4343 #endif
 4344 
 4345 // ====================Scatter=======================================
 4346 
 4347 // Scatter INT, LONG, FLOAT, DOUBLE
 4348 
 4349 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4350   predicate(UseAVX > 2);
 4351   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4352   effect(TEMP tmp, TEMP ktmp);
4353   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4354   ins_encode %{
 4355     int vlen_enc = vector_length_encoding(this, $src);
 4356     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4357 
 4358     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4359     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4360 
 4361     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4362     __ lea($tmp$$Register, $mem$$Address);
 4363     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4364   %}
 4365   ins_pipe( pipe_slow );
 4366 %}
 4367 
 4368 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4369   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4370   effect(TEMP tmp, TEMP ktmp);
 4371   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4372   ins_encode %{
 4373     int vlen_enc = vector_length_encoding(this, $src);
 4374     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4375     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4376     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4377     // Note: Since the scatter instruction partially updates the opmask register used
4378     // for predication, the mask operand is first copied into a temporary.
 4379     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4380     __ lea($tmp$$Register, $mem$$Address);
 4381     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4382   %}
 4383   ins_pipe( pipe_slow );
 4384 %}
 4385 
 4386 // ====================REPLICATE=======================================
 4387 
4388 // Replicate a byte scalar across a vector
 4389 instruct vReplB_reg(vec dst, rRegI src) %{
 4390   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4391   match(Set dst (Replicate src));
 4392   format %{ "replicateB $dst,$src" %}
 4393   ins_encode %{
 4394     uint vlen = Matcher::vector_length(this);
 4395     if (UseAVX >= 2) {
 4396       int vlen_enc = vector_length_encoding(this);
 4397       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4398         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4399         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4400       } else {
 4401         __ movdl($dst$$XMMRegister, $src$$Register);
 4402         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4403       }
 4404     } else {
4405       assert(UseAVX < 2, "");
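      // Pre-AVX2 fallback: duplicate the byte into a word (punpcklbw), splat that
      // word across the low 64 bits (pshuflw), then copy the low quadword into the
      // high half for 16-byte vectors.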
 4406       __ movdl($dst$$XMMRegister, $src$$Register);
 4407       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4408       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4409       if (vlen >= 16) {
 4410         assert(vlen == 16, "");
 4411         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4412       }
 4413     }
 4414   %}
 4415   ins_pipe( pipe_slow );
 4416 %}
 4417 
 4418 instruct ReplB_mem(vec dst, memory mem) %{
 4419   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4420   match(Set dst (Replicate (LoadB mem)));
 4421   format %{ "replicateB $dst,$mem" %}
 4422   ins_encode %{
 4423     int vlen_enc = vector_length_encoding(this);
 4424     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 // ====================ReplicateS=======================================
 4430 
 4431 instruct vReplS_reg(vec dst, rRegI src) %{
 4432   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4433   match(Set dst (Replicate src));
 4434   format %{ "replicateS $dst,$src" %}
 4435   ins_encode %{
 4436     uint vlen = Matcher::vector_length(this);
 4437     int vlen_enc = vector_length_encoding(this);
 4438     if (UseAVX >= 2) {
 4439       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4440         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4441         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4442       } else {
 4443         __ movdl($dst$$XMMRegister, $src$$Register);
 4444         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4445       }
 4446     } else {
 4447       assert(UseAVX < 2, "");
 4448       __ movdl($dst$$XMMRegister, $src$$Register);
 4449       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4450       if (vlen >= 8) {
 4451         assert(vlen == 8, "");
 4452         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4453       }
 4454     }
 4455   %}
 4456   ins_pipe( pipe_slow );
 4457 %}
 4458 
 4459 instruct ReplS_mem(vec dst, memory mem) %{
 4460   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4461   match(Set dst (Replicate (LoadS mem)));
 4462   format %{ "replicateS $dst,$mem" %}
 4463   ins_encode %{
 4464     int vlen_enc = vector_length_encoding(this);
 4465     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4466   %}
 4467   ins_pipe( pipe_slow );
 4468 %}
 4469 
 4470 // ====================ReplicateI=======================================
 4471 
 4472 instruct ReplI_reg(vec dst, rRegI src) %{
 4473   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4474   match(Set dst (Replicate src));
 4475   format %{ "replicateI $dst,$src" %}
 4476   ins_encode %{
 4477     uint vlen = Matcher::vector_length(this);
 4478     int vlen_enc = vector_length_encoding(this);
 4479     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4480       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4481     } else if (VM_Version::supports_avx2()) {
 4482       __ movdl($dst$$XMMRegister, $src$$Register);
 4483       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4484     } else {
 4485       __ movdl($dst$$XMMRegister, $src$$Register);
 4486       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4487     }
 4488   %}
 4489   ins_pipe( pipe_slow );
 4490 %}
 4491 
 4492 instruct ReplI_mem(vec dst, memory mem) %{
 4493   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4494   match(Set dst (Replicate (LoadI mem)));
 4495   format %{ "replicateI $dst,$mem" %}
 4496   ins_encode %{
 4497     int vlen_enc = vector_length_encoding(this);
 4498     if (VM_Version::supports_avx2()) {
 4499       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4500     } else if (VM_Version::supports_avx()) {
 4501       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4502     } else {
 4503       __ movdl($dst$$XMMRegister, $mem$$Address);
 4504       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4505     }
 4506   %}
 4507   ins_pipe( pipe_slow );
 4508 %}
 4509 
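// Replicate a byte/short/int scalar immediate across a vector by loading it from
// the constant table.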
 4510 instruct ReplI_imm(vec dst, immI con) %{
 4511   predicate(Matcher::is_non_long_integral_vector(n));
 4512   match(Set dst (Replicate con));
 4513   format %{ "replicateI $dst,$con" %}
 4514   ins_encode %{
 4515     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4516         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4517             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4518                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4519     BasicType bt = Matcher::vector_element_basic_type(this);
 4520     int vlen = Matcher::vector_length_in_bytes(this);
 4521     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4522   %}
 4523   ins_pipe( pipe_slow );
 4524 %}
 4525 
4526 // Replicate a zero scalar across a vector
 4527 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4528   predicate(Matcher::is_non_long_integral_vector(n));
 4529   match(Set dst (Replicate zero));
 4530   format %{ "replicateI $dst,$zero" %}
 4531   ins_encode %{
 4532     int vlen_enc = vector_length_encoding(this);
 4533     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4534       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4535     } else {
 4536       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4537     }
 4538   %}
 4539   ins_pipe( fpu_reg_reg );
 4540 %}
 4541 
 4542 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4543   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4544   match(Set dst (Replicate con));
 4545   format %{ "vallones $dst" %}
 4546   ins_encode %{
 4547     int vector_len = vector_length_encoding(this);
 4548     __ vallones($dst$$XMMRegister, vector_len);
 4549   %}
 4550   ins_pipe( pipe_slow );
 4551 %}
 4552 
 4553 // ====================ReplicateL=======================================
 4554 
 4555 #ifdef _LP64
4556 // Replicate a long (8-byte) scalar across a vector
 4557 instruct ReplL_reg(vec dst, rRegL src) %{
 4558   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4559   match(Set dst (Replicate src));
 4560   format %{ "replicateL $dst,$src" %}
 4561   ins_encode %{
 4562     int vlen = Matcher::vector_length(this);
 4563     int vlen_enc = vector_length_encoding(this);
 4564     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4565       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4566     } else if (VM_Version::supports_avx2()) {
 4567       __ movdq($dst$$XMMRegister, $src$$Register);
 4568       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4569     } else {
 4570       __ movdq($dst$$XMMRegister, $src$$Register);
 4571       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4572     }
 4573   %}
 4574   ins_pipe( pipe_slow );
 4575 %}
 4576 #else // _LP64
4577 // Replicate a long (8-byte) scalar across a vector
 4578 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4579   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4580   match(Set dst (Replicate src));
 4581   effect(TEMP dst, USE src, TEMP tmp);
 4582   format %{ "replicateL $dst,$src" %}
 4583   ins_encode %{
 4584     uint vlen = Matcher::vector_length(this);
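    // On 32-bit, the long source lives in a register pair: $src holds the low half
    // and HIGH_FROM_LOW($src) names the upper half. The two halves are moved into
    // XMM separately and merged with punpckldq before being broadcast.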
 4585     if (vlen == 2) {
 4586       __ movdl($dst$$XMMRegister, $src$$Register);
 4587       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4588       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4589       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4590     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4591       int vlen_enc = Assembler::AVX_256bit;
 4592       __ movdl($dst$$XMMRegister, $src$$Register);
 4593       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4594       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4595       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4596     } else {
 4597       __ movdl($dst$$XMMRegister, $src$$Register);
 4598       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4599       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4600       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4601       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4602     }
 4603   %}
 4604   ins_pipe( pipe_slow );
 4605 %}
 4606 
 4607 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4608   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4609   match(Set dst (Replicate src));
 4610   effect(TEMP dst, USE src, TEMP tmp);
 4611   format %{ "replicateL $dst,$src" %}
 4612   ins_encode %{
 4613     if (VM_Version::supports_avx512vl()) {
 4614       __ movdl($dst$$XMMRegister, $src$$Register);
 4615       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4616       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4617       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4618       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4619       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4620     } else {
 4621       int vlen_enc = Assembler::AVX_512bit;
 4622       __ movdl($dst$$XMMRegister, $src$$Register);
 4623       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4624       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4625       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4626     }
 4627   %}
 4628   ins_pipe( pipe_slow );
 4629 %}
 4630 #endif // _LP64
 4631 
 4632 instruct ReplL_mem(vec dst, memory mem) %{
 4633   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4634   match(Set dst (Replicate (LoadL mem)));
 4635   format %{ "replicateL $dst,$mem" %}
 4636   ins_encode %{
 4637     int vlen_enc = vector_length_encoding(this);
 4638     if (VM_Version::supports_avx2()) {
 4639       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4640     } else if (VM_Version::supports_sse3()) {
 4641       __ movddup($dst$$XMMRegister, $mem$$Address);
 4642     } else {
 4643       __ movq($dst$$XMMRegister, $mem$$Address);
 4644       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4645     }
 4646   %}
 4647   ins_pipe( pipe_slow );
 4648 %}
 4649 
4650 // Replicate a long (8-byte) scalar immediate across a vector by loading it from the constant table.
 4651 instruct ReplL_imm(vec dst, immL con) %{
 4652   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4653   match(Set dst (Replicate con));
 4654   format %{ "replicateL $dst,$con" %}
 4655   ins_encode %{
 4656     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4657     int vlen = Matcher::vector_length_in_bytes(this);
 4658     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4659   %}
 4660   ins_pipe( pipe_slow );
 4661 %}
 4662 
 4663 instruct ReplL_zero(vec dst, immL0 zero) %{
 4664   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4665   match(Set dst (Replicate zero));
 4666   format %{ "replicateL $dst,$zero" %}
 4667   ins_encode %{
 4668     int vlen_enc = vector_length_encoding(this);
 4669     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4670       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4671     } else {
 4672       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4673     }
 4674   %}
 4675   ins_pipe( fpu_reg_reg );
 4676 %}
 4677 
 4678 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4679   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4680   match(Set dst (Replicate con));
 4681   format %{ "vallones $dst" %}
 4682   ins_encode %{
 4683     int vector_len = vector_length_encoding(this);
 4684     __ vallones($dst$$XMMRegister, vector_len);
 4685   %}
 4686   ins_pipe( pipe_slow );
 4687 %}
 4688 
 4689 // ====================ReplicateF=======================================
 4690 
 4691 instruct vReplF_reg(vec dst, vlRegF src) %{
 4692   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4693   match(Set dst (Replicate src));
 4694   format %{ "replicateF $dst,$src" %}
 4695   ins_encode %{
 4696     uint vlen = Matcher::vector_length(this);
 4697     int vlen_enc = vector_length_encoding(this);
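    // vpermilps with immediate 0x00 copies element 0 of src into every slot of the
    // 128-bit lane; wider vectors either use vbroadcastss (AVX2) or duplicate the
    // low lane into the high lane with vinsertf128_high.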
 4698     if (vlen <= 4) {
 4699       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4700     } else if (VM_Version::supports_avx2()) {
 4701       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4702     } else {
 4703       assert(vlen == 8, "sanity");
 4704       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4705       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4706     }
 4707   %}
 4708   ins_pipe( pipe_slow );
 4709 %}
 4710 
 4711 instruct ReplF_reg(vec dst, vlRegF src) %{
 4712   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4713   match(Set dst (Replicate src));
 4714   format %{ "replicateF $dst,$src" %}
 4715   ins_encode %{
 4716     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4717   %}
 4718   ins_pipe( pipe_slow );
 4719 %}
 4720 
 4721 instruct ReplF_mem(vec dst, memory mem) %{
 4722   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4723   match(Set dst (Replicate (LoadF mem)));
 4724   format %{ "replicateF $dst,$mem" %}
 4725   ins_encode %{
 4726     int vlen_enc = vector_length_encoding(this);
 4727     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4728   %}
 4729   ins_pipe( pipe_slow );
 4730 %}
 4731 
4732 // Replicate a float scalar immediate across a vector by loading it from the constant table.
 4733 instruct ReplF_imm(vec dst, immF con) %{
 4734   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4735   match(Set dst (Replicate con));
 4736   format %{ "replicateF $dst,$con" %}
 4737   ins_encode %{
 4738     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4739         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4740     int vlen = Matcher::vector_length_in_bytes(this);
 4741     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4742   %}
 4743   ins_pipe( pipe_slow );
 4744 %}
 4745 
 4746 instruct ReplF_zero(vec dst, immF0 zero) %{
 4747   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4748   match(Set dst (Replicate zero));
 4749   format %{ "replicateF $dst,$zero" %}
 4750   ins_encode %{
 4751     int vlen_enc = vector_length_encoding(this);
 4752     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4753       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4754     } else {
 4755       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4756     }
 4757   %}
 4758   ins_pipe( fpu_reg_reg );
 4759 %}
 4760 
 4761 // ====================ReplicateD=======================================
 4762 
4763 // Replicate a double (8-byte) scalar across a vector
 4764 instruct vReplD_reg(vec dst, vlRegD src) %{
 4765   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4766   match(Set dst (Replicate src));
 4767   format %{ "replicateD $dst,$src" %}
 4768   ins_encode %{
 4769     uint vlen = Matcher::vector_length(this);
 4770     int vlen_enc = vector_length_encoding(this);
 4771     if (vlen <= 2) {
 4772       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4773     } else if (VM_Version::supports_avx2()) {
 4774       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4775     } else {
 4776       assert(vlen == 4, "sanity");
 4777       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4778       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4779     }
 4780   %}
 4781   ins_pipe( pipe_slow );
 4782 %}
 4783 
 4784 instruct ReplD_reg(vec dst, vlRegD src) %{
 4785   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4786   match(Set dst (Replicate src));
 4787   format %{ "replicateD $dst,$src" %}
 4788   ins_encode %{
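    // pshufd 0x44 selects dwords {0,1,0,1}, duplicating the low 64-bit double into
    // the high half of the register.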
 4789     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4790   %}
 4791   ins_pipe( pipe_slow );
 4792 %}
 4793 
 4794 instruct ReplD_mem(vec dst, memory mem) %{
 4795   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4796   match(Set dst (Replicate (LoadD mem)));
 4797   format %{ "replicateD $dst,$mem" %}
 4798   ins_encode %{
 4799     if (Matcher::vector_length(this) >= 4) {
 4800       int vlen_enc = vector_length_encoding(this);
 4801       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4802     } else {
 4803       __ movddup($dst$$XMMRegister, $mem$$Address);
 4804     }
 4805   %}
 4806   ins_pipe( pipe_slow );
 4807 %}
 4808 
4809 // Replicate a double (8-byte) scalar immediate across a vector by loading it from the constant table.
 4810 instruct ReplD_imm(vec dst, immD con) %{
 4811   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4812   match(Set dst (Replicate con));
 4813   format %{ "replicateD $dst,$con" %}
 4814   ins_encode %{
 4815     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4816     int vlen = Matcher::vector_length_in_bytes(this);
 4817     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4818   %}
 4819   ins_pipe( pipe_slow );
 4820 %}
 4821 
 4822 instruct ReplD_zero(vec dst, immD0 zero) %{
 4823   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4824   match(Set dst (Replicate zero));
 4825   format %{ "replicateD $dst,$zero" %}
 4826   ins_encode %{
 4827     int vlen_enc = vector_length_encoding(this);
 4828     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4829       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4830     } else {
 4831       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4832     }
 4833   %}
 4834   ins_pipe( fpu_reg_reg );
 4835 %}
 4836 
 4837 // ====================VECTOR INSERT=======================================
 4838 
 4839 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4840   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4841   match(Set dst (VectorInsert (Binary dst val) idx));
 4842   format %{ "vector_insert $dst,$val,$idx" %}
 4843   ins_encode %{
 4844     assert(UseSSE >= 4, "required");
 4845     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4846 
 4847     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4848 
 4849     assert(is_integral_type(elem_bt), "");
 4850     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4851 
 4852     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4853   %}
 4854   ins_pipe( pipe_slow );
 4855 %}
 4856 
 4857 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4858   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4859   match(Set dst (VectorInsert (Binary src val) idx));
 4860   effect(TEMP vtmp);
 4861   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4862   ins_encode %{
 4863     int vlen_enc = Assembler::AVX_256bit;
 4864     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4865     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4866     int log2epr = log2(elem_per_lane);
 4867 
 4868     assert(is_integral_type(elem_bt), "sanity");
 4869     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4870 
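    // Decompose the element index: y_idx picks which 128-bit lane of the 256-bit
    // vector holds the element, x_idx is its position within that lane. The lane is
    // extracted, updated with the scalar, and written back. The 512-bit variant
    // below does the same across four lanes.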
 4871     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4872     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4873     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4874     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4875     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4876   %}
 4877   ins_pipe( pipe_slow );
 4878 %}
 4879 
 4880 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4881   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4882   match(Set dst (VectorInsert (Binary src val) idx));
 4883   effect(TEMP vtmp);
 4884   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4885   ins_encode %{
 4886     assert(UseAVX > 2, "sanity");
 4887 
 4888     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4889     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4890     int log2epr = log2(elem_per_lane);
 4891 
 4892     assert(is_integral_type(elem_bt), "");
 4893     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4894 
 4895     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4896     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4897     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4898     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4899     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4900   %}
 4901   ins_pipe( pipe_slow );
 4902 %}
 4903 
 4904 #ifdef _LP64
 4905 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4906   predicate(Matcher::vector_length(n) == 2);
 4907   match(Set dst (VectorInsert (Binary dst val) idx));
 4908   format %{ "vector_insert $dst,$val,$idx" %}
 4909   ins_encode %{
 4910     assert(UseSSE >= 4, "required");
 4911     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4912     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4913 
 4914     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4915   %}
 4916   ins_pipe( pipe_slow );
 4917 %}
 4918 
 4919 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4920   predicate(Matcher::vector_length(n) == 4);
 4921   match(Set dst (VectorInsert (Binary src val) idx));
 4922   effect(TEMP vtmp);
 4923   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4924   ins_encode %{
 4925     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4926     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4927 
 4928     uint x_idx = $idx$$constant & right_n_bits(1);
 4929     uint y_idx = ($idx$$constant >> 1) & 1;
 4930     int vlen_enc = Assembler::AVX_256bit;
 4931     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4932     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4933     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4934   %}
 4935   ins_pipe( pipe_slow );
 4936 %}
 4937 
 4938 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4939   predicate(Matcher::vector_length(n) == 8);
 4940   match(Set dst (VectorInsert (Binary src val) idx));
 4941   effect(TEMP vtmp);
 4942   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4943   ins_encode %{
 4944     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4945     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4946 
 4947     uint x_idx = $idx$$constant & right_n_bits(1);
 4948     uint y_idx = ($idx$$constant >> 1) & 3;
 4949     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4950     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4951     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4952   %}
 4953   ins_pipe( pipe_slow );
 4954 %}
 4955 #endif
 4956 
 4957 instruct insertF(vec dst, regF val, immU8 idx) %{
 4958   predicate(Matcher::vector_length(n) < 8);
 4959   match(Set dst (VectorInsert (Binary dst val) idx));
 4960   format %{ "vector_insert $dst,$val,$idx" %}
 4961   ins_encode %{
 4962     assert(UseSSE >= 4, "sanity");
 4963 
 4964     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4965     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4966 
 4967     uint x_idx = $idx$$constant & right_n_bits(2);
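    // insertps selects the destination element through bits 5:4 of its imm8,
    // hence the index is shifted left by four.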
 4968     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4969   %}
 4970   ins_pipe( pipe_slow );
 4971 %}
 4972 
 4973 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4974   predicate(Matcher::vector_length(n) >= 8);
 4975   match(Set dst (VectorInsert (Binary src val) idx));
 4976   effect(TEMP vtmp);
 4977   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4978   ins_encode %{
 4979     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4980     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4981 
 4982     int vlen = Matcher::vector_length(this);
 4983     uint x_idx = $idx$$constant & right_n_bits(2);
 4984     if (vlen == 8) {
 4985       uint y_idx = ($idx$$constant >> 2) & 1;
 4986       int vlen_enc = Assembler::AVX_256bit;
 4987       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4988       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4989       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4990     } else {
 4991       assert(vlen == 16, "sanity");
 4992       uint y_idx = ($idx$$constant >> 2) & 3;
 4993       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4994       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4995       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4996     }
 4997   %}
 4998   ins_pipe( pipe_slow );
 4999 %}
 5000 
 5001 #ifdef _LP64
 5002 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 5003   predicate(Matcher::vector_length(n) == 2);
 5004   match(Set dst (VectorInsert (Binary dst val) idx));
 5005   effect(TEMP tmp);
 5006   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 5007   ins_encode %{
 5008     assert(UseSSE >= 4, "sanity");
 5009     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5010     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5011 
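    // pinsrq takes its scalar source from a general-purpose register, so the
    // double's bit pattern is first moved into $tmp.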
 5012     __ movq($tmp$$Register, $val$$XMMRegister);
 5013     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 5014   %}
 5015   ins_pipe( pipe_slow );
 5016 %}
 5017 
 5018 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 5019   predicate(Matcher::vector_length(n) == 4);
 5020   match(Set dst (VectorInsert (Binary src val) idx));
 5021   effect(TEMP vtmp, TEMP tmp);
 5022   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5023   ins_encode %{
 5024     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5025     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5026 
 5027     uint x_idx = $idx$$constant & right_n_bits(1);
 5028     uint y_idx = ($idx$$constant >> 1) & 1;
 5029     int vlen_enc = Assembler::AVX_256bit;
 5030     __ movq($tmp$$Register, $val$$XMMRegister);
 5031     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5032     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5033     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5034   %}
 5035   ins_pipe( pipe_slow );
 5036 %}
 5037 
instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 5039   predicate(Matcher::vector_length(n) == 8);
 5040   match(Set dst (VectorInsert (Binary src val) idx));
 5041   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5043   ins_encode %{
 5044     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5045     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5046 
 5047     uint x_idx = $idx$$constant & right_n_bits(1);
 5048     uint y_idx = ($idx$$constant >> 1) & 3;
 5049     __ movq($tmp$$Register, $val$$XMMRegister);
 5050     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5051     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5052     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5053   %}
 5054   ins_pipe( pipe_slow );
 5055 %}
 5056 #endif
 5057 
 5058 // ====================REDUCTION ARITHMETIC=======================================
 5059 
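// A reduction folds all lanes of the vector input into a single scalar and
// combines it with the scalar input; the per-type reduce*() helpers in the
// macro assembler emit the actual instruction sequences.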
 5060 // =======================Int Reduction==========================================
 5061 
 5062 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5063   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 5064   match(Set dst (AddReductionVI src1 src2));
 5065   match(Set dst (MulReductionVI src1 src2));
 5066   match(Set dst (AndReductionV  src1 src2));
 5067   match(Set dst ( OrReductionV  src1 src2));
 5068   match(Set dst (XorReductionV  src1 src2));
 5069   match(Set dst (MinReductionV  src1 src2));
 5070   match(Set dst (MaxReductionV  src1 src2));
 5071   effect(TEMP vtmp1, TEMP vtmp2);
 5072   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5073   ins_encode %{
 5074     int opcode = this->ideal_Opcode();
 5075     int vlen = Matcher::vector_length(this, $src2);
 5076     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5077   %}
 5078   ins_pipe( pipe_slow );
 5079 %}
 5080 
 5081 // =======================Long Reduction==========================================
 5082 
 5083 #ifdef _LP64
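// The two long-reduction rules below differ only in their register classes:
// without AVX-512DQ the operands are constrained to the legacy vector
// registers (legVec).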
 5084 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5085   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 5086   match(Set dst (AddReductionVL src1 src2));
 5087   match(Set dst (MulReductionVL src1 src2));
 5088   match(Set dst (AndReductionV  src1 src2));
 5089   match(Set dst ( OrReductionV  src1 src2));
 5090   match(Set dst (XorReductionV  src1 src2));
 5091   match(Set dst (MinReductionV  src1 src2));
 5092   match(Set dst (MaxReductionV  src1 src2));
 5093   effect(TEMP vtmp1, TEMP vtmp2);
 5094   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5095   ins_encode %{
 5096     int opcode = this->ideal_Opcode();
 5097     int vlen = Matcher::vector_length(this, $src2);
 5098     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5099   %}
 5100   ins_pipe( pipe_slow );
 5101 %}
 5102 
 5103 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5104   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5105   match(Set dst (AddReductionVL src1 src2));
 5106   match(Set dst (MulReductionVL src1 src2));
 5107   match(Set dst (AndReductionV  src1 src2));
 5108   match(Set dst ( OrReductionV  src1 src2));
 5109   match(Set dst (XorReductionV  src1 src2));
 5110   match(Set dst (MinReductionV  src1 src2));
 5111   match(Set dst (MaxReductionV  src1 src2));
 5112   effect(TEMP vtmp1, TEMP vtmp2);
 5113   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5114   ins_encode %{
 5115     int opcode = this->ideal_Opcode();
 5116     int vlen = Matcher::vector_length(this, $src2);
 5117     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5118   %}
 5119   ins_pipe( pipe_slow );
 5120 %}
 5121 #endif // _LP64
 5122 
 5123 // =======================Float Reduction==========================================
 5124 
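// The strictly ordered rules accumulate into $dst (the scalar input doubles as
// the result register), preserving the element-by-element evaluation order.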
 5125 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5126   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5127   match(Set dst (AddReductionVF dst src));
 5128   match(Set dst (MulReductionVF dst src));
 5129   effect(TEMP dst, TEMP vtmp);
 5130   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5131   ins_encode %{
 5132     int opcode = this->ideal_Opcode();
 5133     int vlen = Matcher::vector_length(this, $src);
 5134     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5135   %}
 5136   ins_pipe( pipe_slow );
 5137 %}
 5138 
 5139 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5140   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5141   match(Set dst (AddReductionVF dst src));
 5142   match(Set dst (MulReductionVF dst src));
 5143   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5144   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5145   ins_encode %{
 5146     int opcode = this->ideal_Opcode();
 5147     int vlen = Matcher::vector_length(this, $src);
 5148     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5149   %}
 5150   ins_pipe( pipe_slow );
 5151 %}
 5152 
 5153 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5154   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5155   match(Set dst (AddReductionVF dst src));
 5156   match(Set dst (MulReductionVF dst src));
 5157   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5158   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5159   ins_encode %{
 5160     int opcode = this->ideal_Opcode();
 5161     int vlen = Matcher::vector_length(this, $src);
 5162     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5163   %}
 5164   ins_pipe( pipe_slow );
 5165 %}
 5166 
 5167 
 5168 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5169   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5170   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5171   // src1 contains reduction identity
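  // Since src1 is the identity it cannot affect the result, so the helper
  // reduces src2 alone and src1 is never read.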
 5172   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5173   match(Set dst (AddReductionVF src1 src2));
 5174   match(Set dst (MulReductionVF src1 src2));
 5175   effect(TEMP dst);
 5176   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5177   ins_encode %{
 5178     int opcode = this->ideal_Opcode();
 5179     int vlen = Matcher::vector_length(this, $src2);
 5180     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5181   %}
 5182   ins_pipe( pipe_slow );
 5183 %}
 5184 
 5185 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5186   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5187   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5188   // src1 contains reduction identity
 5189   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5190   match(Set dst (AddReductionVF src1 src2));
 5191   match(Set dst (MulReductionVF src1 src2));
 5192   effect(TEMP dst, TEMP vtmp);
 5193   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5194   ins_encode %{
 5195     int opcode = this->ideal_Opcode();
 5196     int vlen = Matcher::vector_length(this, $src2);
 5197     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5198   %}
 5199   ins_pipe( pipe_slow );
 5200 %}
 5201 
 5202 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5203   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5204   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5205   // src1 contains reduction identity
 5206   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5207   match(Set dst (AddReductionVF src1 src2));
 5208   match(Set dst (MulReductionVF src1 src2));
 5209   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5210   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5211   ins_encode %{
 5212     int opcode = this->ideal_Opcode();
 5213     int vlen = Matcher::vector_length(this, $src2);
 5214     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5215   %}
 5216   ins_pipe( pipe_slow );
 5217 %}
 5218 
 5219 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5220   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5221   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5222   // src1 contains reduction identity
 5223   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5224   match(Set dst (AddReductionVF src1 src2));
 5225   match(Set dst (MulReductionVF src1 src2));
 5226   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5227   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5228   ins_encode %{
 5229     int opcode = this->ideal_Opcode();
 5230     int vlen = Matcher::vector_length(this, $src2);
 5231     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5232   %}
 5233   ins_pipe( pipe_slow );
 5234 %}
 5235 
 5236 // =======================Double Reduction==========================================
 5237 
 5238 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5239   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5240   match(Set dst (AddReductionVD dst src));
 5241   match(Set dst (MulReductionVD dst src));
 5242   effect(TEMP dst, TEMP vtmp);
 5243   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5244   ins_encode %{
 5245     int opcode = this->ideal_Opcode();
 5246     int vlen = Matcher::vector_length(this, $src);
 5247     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
 5249   ins_pipe( pipe_slow );
 5250 %}
 5251 
 5252 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5253   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5254   match(Set dst (AddReductionVD dst src));
 5255   match(Set dst (MulReductionVD dst src));
 5256   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5257   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5258   ins_encode %{
 5259     int opcode = this->ideal_Opcode();
 5260     int vlen = Matcher::vector_length(this, $src);
 5261     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5262   %}
 5263   ins_pipe( pipe_slow );
 5264 %}
 5265 
 5266 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5267   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5268   match(Set dst (AddReductionVD dst src));
 5269   match(Set dst (MulReductionVD dst src));
 5270   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5271   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5272   ins_encode %{
 5273     int opcode = this->ideal_Opcode();
 5274     int vlen = Matcher::vector_length(this, $src);
 5275     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5276   %}
 5277   ins_pipe( pipe_slow );
 5278 %}
 5279 
 5280 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5281   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5282   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5283   // src1 contains reduction identity
 5284   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5285   match(Set dst (AddReductionVD src1 src2));
 5286   match(Set dst (MulReductionVD src1 src2));
 5287   effect(TEMP dst);
 5288   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5289   ins_encode %{
 5290     int opcode = this->ideal_Opcode();
 5291     int vlen = Matcher::vector_length(this, $src2);
 5292     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
  %}
 5294   ins_pipe( pipe_slow );
 5295 %}
 5296 
 5297 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5298   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5299   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5300   // src1 contains reduction identity
 5301   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5302   match(Set dst (AddReductionVD src1 src2));
 5303   match(Set dst (MulReductionVD src1 src2));
 5304   effect(TEMP dst, TEMP vtmp);
 5305   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5306   ins_encode %{
 5307     int opcode = this->ideal_Opcode();
 5308     int vlen = Matcher::vector_length(this, $src2);
 5309     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5310   %}
 5311   ins_pipe( pipe_slow );
 5312 %}
 5313 
 5314 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5315   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5316   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5317   // src1 contains reduction identity
 5318   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5319   match(Set dst (AddReductionVD src1 src2));
 5320   match(Set dst (MulReductionVD src1 src2));
 5321   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5322   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5323   ins_encode %{
 5324     int opcode = this->ideal_Opcode();
 5325     int vlen = Matcher::vector_length(this, $src2);
 5326     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5327   %}
 5328   ins_pipe( pipe_slow );
 5329 %}
 5330 
 5331 // =======================Byte Reduction==========================================
 5332 
 5333 #ifdef _LP64
 5334 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5335   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5336   match(Set dst (AddReductionVI src1 src2));
 5337   match(Set dst (AndReductionV  src1 src2));
 5338   match(Set dst ( OrReductionV  src1 src2));
 5339   match(Set dst (XorReductionV  src1 src2));
 5340   match(Set dst (MinReductionV  src1 src2));
 5341   match(Set dst (MaxReductionV  src1 src2));
 5342   effect(TEMP vtmp1, TEMP vtmp2);
 5343   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5344   ins_encode %{
 5345     int opcode = this->ideal_Opcode();
 5346     int vlen = Matcher::vector_length(this, $src2);
 5347     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5348   %}
 5349   ins_pipe( pipe_slow );
 5350 %}
 5351 
 5352 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5353   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5354   match(Set dst (AddReductionVI src1 src2));
 5355   match(Set dst (AndReductionV  src1 src2));
 5356   match(Set dst ( OrReductionV  src1 src2));
 5357   match(Set dst (XorReductionV  src1 src2));
 5358   match(Set dst (MinReductionV  src1 src2));
 5359   match(Set dst (MaxReductionV  src1 src2));
 5360   effect(TEMP vtmp1, TEMP vtmp2);
 5361   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5362   ins_encode %{
 5363     int opcode = this->ideal_Opcode();
 5364     int vlen = Matcher::vector_length(this, $src2);
 5365     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5366   %}
 5367   ins_pipe( pipe_slow );
 5368 %}
 5369 #endif
 5370 
 5371 // =======================Short Reduction==========================================
 5372 
 5373 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5374   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5375   match(Set dst (AddReductionVI src1 src2));
 5376   match(Set dst (MulReductionVI src1 src2));
 5377   match(Set dst (AndReductionV  src1 src2));
 5378   match(Set dst ( OrReductionV  src1 src2));
 5379   match(Set dst (XorReductionV  src1 src2));
 5380   match(Set dst (MinReductionV  src1 src2));
 5381   match(Set dst (MaxReductionV  src1 src2));
 5382   effect(TEMP vtmp1, TEMP vtmp2);
 5383   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5384   ins_encode %{
 5385     int opcode = this->ideal_Opcode();
 5386     int vlen = Matcher::vector_length(this, $src2);
 5387     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5388   %}
 5389   ins_pipe( pipe_slow );
 5390 %}
 5391 
 5392 // =======================Mul Reduction==========================================
 5393 
 5394 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5395   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5396             Matcher::vector_length(n->in(2)) <= 32); // src2
 5397   match(Set dst (MulReductionVI src1 src2));
 5398   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5399   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5400   ins_encode %{
 5401     int opcode = this->ideal_Opcode();
 5402     int vlen = Matcher::vector_length(this, $src2);
 5403     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5404   %}
 5405   ins_pipe( pipe_slow );
 5406 %}
 5407 
 5408 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5409   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5410             Matcher::vector_length(n->in(2)) == 64); // src2
 5411   match(Set dst (MulReductionVI src1 src2));
 5412   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5413   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5414   ins_encode %{
 5415     int opcode = this->ideal_Opcode();
 5416     int vlen = Matcher::vector_length(this, $src2);
 5417     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5418   %}
 5419   ins_pipe( pipe_slow );
 5420 %}
 5421 
 5422 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
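// Two forms are provided for each width: one matches when the scalar input is
// the identity immediate (+Inf for min, -Inf for max) and can therefore be
// ignored, while the "_av" forms match when the reduction accumulates through
// $dst instead.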
 5424 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5425                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5426   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5427             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5428              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5429             Matcher::vector_length(n->in(2)) == 2);
 5430   match(Set dst (MinReductionV src1 src2));
 5431   match(Set dst (MaxReductionV src1 src2));
 5432   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5433   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5434   ins_encode %{
 5435     assert(UseAVX > 0, "sanity");
 5436 
 5437     int opcode = this->ideal_Opcode();
 5438     int vlen = Matcher::vector_length(this, $src2);
 5439     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5440                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5441   %}
 5442   ins_pipe( pipe_slow );
 5443 %}
 5444 
 5445 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5446                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5447   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5448             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5449              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5450             Matcher::vector_length(n->in(2)) >= 4);
 5451   match(Set dst (MinReductionV src1 src2));
 5452   match(Set dst (MaxReductionV src1 src2));
 5453   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5454   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5455   ins_encode %{
 5456     assert(UseAVX > 0, "sanity");
 5457 
 5458     int opcode = this->ideal_Opcode();
 5459     int vlen = Matcher::vector_length(this, $src2);
 5460     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5461                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5462   %}
 5463   ins_pipe( pipe_slow );
 5464 %}
 5465 
 5466 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5467                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5468   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5469             Matcher::vector_length(n->in(2)) == 2);
 5470   match(Set dst (MinReductionV dst src));
 5471   match(Set dst (MaxReductionV dst src));
 5472   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5473   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5474   ins_encode %{
 5475     assert(UseAVX > 0, "sanity");
 5476 
 5477     int opcode = this->ideal_Opcode();
 5478     int vlen = Matcher::vector_length(this, $src);
 5479     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5480                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5481   %}
 5482   ins_pipe( pipe_slow );
 5483 %}
 5484 
 5485 
 5486 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5487                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5488   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5489             Matcher::vector_length(n->in(2)) >= 4);
 5490   match(Set dst (MinReductionV dst src));
 5491   match(Set dst (MaxReductionV dst src));
 5492   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5493   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5494   ins_encode %{
 5495     assert(UseAVX > 0, "sanity");
 5496 
 5497     int opcode = this->ideal_Opcode();
 5498     int vlen = Matcher::vector_length(this, $src);
 5499     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5500                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5501   %}
 5502   ins_pipe( pipe_slow );
 5503 %}
 5504 
 5505 
//--------------------Min/Max Double Reduction --------------------
 5507 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5508                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5509                             rFlagsReg cr) %{
 5510   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5511             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5512              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5513             Matcher::vector_length(n->in(2)) == 2);
 5514   match(Set dst (MinReductionV src1 src2));
 5515   match(Set dst (MaxReductionV src1 src2));
 5516   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5517   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5518   ins_encode %{
 5519     assert(UseAVX > 0, "sanity");
 5520 
 5521     int opcode = this->ideal_Opcode();
 5522     int vlen = Matcher::vector_length(this, $src2);
 5523     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5524                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5525   %}
 5526   ins_pipe( pipe_slow );
 5527 %}
 5528 
 5529 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5530                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5531                            rFlagsReg cr) %{
 5532   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5533             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5534              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5535             Matcher::vector_length(n->in(2)) >= 4);
 5536   match(Set dst (MinReductionV src1 src2));
 5537   match(Set dst (MaxReductionV src1 src2));
 5538   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5539   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5540   ins_encode %{
 5541     assert(UseAVX > 0, "sanity");
 5542 
 5543     int opcode = this->ideal_Opcode();
 5544     int vlen = Matcher::vector_length(this, $src2);
 5545     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5546                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5547   %}
 5548   ins_pipe( pipe_slow );
 5549 %}
 5550 
 5551 
 5552 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5553                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5554                                rFlagsReg cr) %{
 5555   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5556             Matcher::vector_length(n->in(2)) == 2);
 5557   match(Set dst (MinReductionV dst src));
 5558   match(Set dst (MaxReductionV dst src));
 5559   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5560   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5561   ins_encode %{
 5562     assert(UseAVX > 0, "sanity");
 5563 
 5564     int opcode = this->ideal_Opcode();
 5565     int vlen = Matcher::vector_length(this, $src);
 5566     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5567                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5568   %}
 5569   ins_pipe( pipe_slow );
 5570 %}
 5571 
 5572 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5573                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5574                               rFlagsReg cr) %{
 5575   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5576             Matcher::vector_length(n->in(2)) >= 4);
 5577   match(Set dst (MinReductionV dst src));
 5578   match(Set dst (MaxReductionV dst src));
 5579   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5580   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5581   ins_encode %{
 5582     assert(UseAVX > 0, "sanity");
 5583 
 5584     int opcode = this->ideal_Opcode();
 5585     int vlen = Matcher::vector_length(this, $src);
 5586     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5587                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5588   %}
 5589   ins_pipe( pipe_slow );
 5590 %}
 5591 
 5592 // ====================VECTOR ARITHMETIC=======================================
 5593 
 5594 // --------------------------------- ADD --------------------------------------
 5595 
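// Each element type gets three rules: a destructive two-operand SSE form
// (UseAVX == 0), a three-operand AVX register form, and an AVX form with a
// memory operand that is only used for vectors wider than 8 bytes.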
 5596 // Bytes vector add
 5597 instruct vaddB(vec dst, vec src) %{
 5598   predicate(UseAVX == 0);
 5599   match(Set dst (AddVB dst src));
 5600   format %{ "paddb   $dst,$src\t! add packedB" %}
 5601   ins_encode %{
 5602     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5603   %}
 5604   ins_pipe( pipe_slow );
 5605 %}
 5606 
 5607 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5608   predicate(UseAVX > 0);
 5609   match(Set dst (AddVB src1 src2));
 5610   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5611   ins_encode %{
 5612     int vlen_enc = vector_length_encoding(this);
 5613     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5614   %}
 5615   ins_pipe( pipe_slow );
 5616 %}
 5617 
 5618 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5619   predicate((UseAVX > 0) &&
 5620             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5621   match(Set dst (AddVB src (LoadVector mem)));
 5622   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5623   ins_encode %{
 5624     int vlen_enc = vector_length_encoding(this);
 5625     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5626   %}
 5627   ins_pipe( pipe_slow );
 5628 %}
 5629 
 5630 // Shorts/Chars vector add
 5631 instruct vaddS(vec dst, vec src) %{
 5632   predicate(UseAVX == 0);
 5633   match(Set dst (AddVS dst src));
 5634   format %{ "paddw   $dst,$src\t! add packedS" %}
 5635   ins_encode %{
 5636     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5637   %}
 5638   ins_pipe( pipe_slow );
 5639 %}
 5640 
 5641 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5642   predicate(UseAVX > 0);
 5643   match(Set dst (AddVS src1 src2));
 5644   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5645   ins_encode %{
 5646     int vlen_enc = vector_length_encoding(this);
 5647     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5648   %}
 5649   ins_pipe( pipe_slow );
 5650 %}
 5651 
 5652 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5653   predicate((UseAVX > 0) &&
 5654             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5655   match(Set dst (AddVS src (LoadVector mem)));
 5656   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5657   ins_encode %{
 5658     int vlen_enc = vector_length_encoding(this);
 5659     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5660   %}
 5661   ins_pipe( pipe_slow );
 5662 %}
 5663 
 5664 // Integers vector add
 5665 instruct vaddI(vec dst, vec src) %{
 5666   predicate(UseAVX == 0);
 5667   match(Set dst (AddVI dst src));
 5668   format %{ "paddd   $dst,$src\t! add packedI" %}
 5669   ins_encode %{
 5670     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5671   %}
 5672   ins_pipe( pipe_slow );
 5673 %}
 5674 
 5675 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5676   predicate(UseAVX > 0);
 5677   match(Set dst (AddVI src1 src2));
 5678   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5679   ins_encode %{
 5680     int vlen_enc = vector_length_encoding(this);
 5681     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5682   %}
 5683   ins_pipe( pipe_slow );
 5684 %}
 5685 
 5686 
 5687 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5688   predicate((UseAVX > 0) &&
 5689             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5690   match(Set dst (AddVI src (LoadVector mem)));
 5691   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5692   ins_encode %{
 5693     int vlen_enc = vector_length_encoding(this);
 5694     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5695   %}
 5696   ins_pipe( pipe_slow );
 5697 %}
 5698 
 5699 // Longs vector add
 5700 instruct vaddL(vec dst, vec src) %{
 5701   predicate(UseAVX == 0);
 5702   match(Set dst (AddVL dst src));
 5703   format %{ "paddq   $dst,$src\t! add packedL" %}
 5704   ins_encode %{
 5705     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5706   %}
 5707   ins_pipe( pipe_slow );
 5708 %}
 5709 
 5710 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5711   predicate(UseAVX > 0);
 5712   match(Set dst (AddVL src1 src2));
 5713   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5714   ins_encode %{
 5715     int vlen_enc = vector_length_encoding(this);
 5716     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5717   %}
 5718   ins_pipe( pipe_slow );
 5719 %}
 5720 
 5721 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5722   predicate((UseAVX > 0) &&
 5723             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5724   match(Set dst (AddVL src (LoadVector mem)));
 5725   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5726   ins_encode %{
 5727     int vlen_enc = vector_length_encoding(this);
 5728     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5729   %}
 5730   ins_pipe( pipe_slow );
 5731 %}
 5732 
 5733 // Floats vector add
 5734 instruct vaddF(vec dst, vec src) %{
 5735   predicate(UseAVX == 0);
 5736   match(Set dst (AddVF dst src));
 5737   format %{ "addps   $dst,$src\t! add packedF" %}
 5738   ins_encode %{
 5739     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5740   %}
 5741   ins_pipe( pipe_slow );
 5742 %}
 5743 
 5744 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5745   predicate(UseAVX > 0);
 5746   match(Set dst (AddVF src1 src2));
 5747   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5748   ins_encode %{
 5749     int vlen_enc = vector_length_encoding(this);
 5750     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5751   %}
 5752   ins_pipe( pipe_slow );
 5753 %}
 5754 
 5755 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5756   predicate((UseAVX > 0) &&
 5757             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5758   match(Set dst (AddVF src (LoadVector mem)));
 5759   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5760   ins_encode %{
 5761     int vlen_enc = vector_length_encoding(this);
 5762     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5763   %}
 5764   ins_pipe( pipe_slow );
 5765 %}
 5766 
 5767 // Doubles vector add
 5768 instruct vaddD(vec dst, vec src) %{
 5769   predicate(UseAVX == 0);
 5770   match(Set dst (AddVD dst src));
 5771   format %{ "addpd   $dst,$src\t! add packedD" %}
 5772   ins_encode %{
 5773     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5774   %}
 5775   ins_pipe( pipe_slow );
 5776 %}
 5777 
 5778 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5779   predicate(UseAVX > 0);
 5780   match(Set dst (AddVD src1 src2));
 5781   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5782   ins_encode %{
 5783     int vlen_enc = vector_length_encoding(this);
 5784     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5785   %}
 5786   ins_pipe( pipe_slow );
 5787 %}
 5788 
 5789 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5790   predicate((UseAVX > 0) &&
 5791             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5792   match(Set dst (AddVD src (LoadVector mem)));
 5793   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5794   ins_encode %{
 5795     int vlen_enc = vector_length_encoding(this);
 5796     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5797   %}
 5798   ins_pipe( pipe_slow );
 5799 %}
 5800 
 5801 // --------------------------------- SUB --------------------------------------
 5802 
 5803 // Bytes vector sub
 5804 instruct vsubB(vec dst, vec src) %{
 5805   predicate(UseAVX == 0);
 5806   match(Set dst (SubVB dst src));
 5807   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5808   ins_encode %{
 5809     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5810   %}
 5811   ins_pipe( pipe_slow );
 5812 %}
 5813 
 5814 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5815   predicate(UseAVX > 0);
 5816   match(Set dst (SubVB src1 src2));
 5817   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5818   ins_encode %{
 5819     int vlen_enc = vector_length_encoding(this);
 5820     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5821   %}
 5822   ins_pipe( pipe_slow );
 5823 %}
 5824 
 5825 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5826   predicate((UseAVX > 0) &&
 5827             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5828   match(Set dst (SubVB src (LoadVector mem)));
 5829   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5830   ins_encode %{
 5831     int vlen_enc = vector_length_encoding(this);
 5832     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5833   %}
 5834   ins_pipe( pipe_slow );
 5835 %}
 5836 
 5837 // Shorts/Chars vector sub
 5838 instruct vsubS(vec dst, vec src) %{
 5839   predicate(UseAVX == 0);
 5840   match(Set dst (SubVS dst src));
 5841   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5842   ins_encode %{
 5843     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5844   %}
 5845   ins_pipe( pipe_slow );
 5846 %}
 5847 
 5848 
 5849 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5850   predicate(UseAVX > 0);
 5851   match(Set dst (SubVS src1 src2));
 5852   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5853   ins_encode %{
 5854     int vlen_enc = vector_length_encoding(this);
 5855     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5856   %}
 5857   ins_pipe( pipe_slow );
 5858 %}
 5859 
 5860 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5861   predicate((UseAVX > 0) &&
 5862             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5863   match(Set dst (SubVS src (LoadVector mem)));
 5864   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5865   ins_encode %{
 5866     int vlen_enc = vector_length_encoding(this);
 5867     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5868   %}
 5869   ins_pipe( pipe_slow );
 5870 %}
 5871 
 5872 // Integers vector sub
 5873 instruct vsubI(vec dst, vec src) %{
 5874   predicate(UseAVX == 0);
 5875   match(Set dst (SubVI dst src));
 5876   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5877   ins_encode %{
 5878     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5879   %}
 5880   ins_pipe( pipe_slow );
 5881 %}
 5882 
 5883 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5884   predicate(UseAVX > 0);
 5885   match(Set dst (SubVI src1 src2));
 5886   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5887   ins_encode %{
 5888     int vlen_enc = vector_length_encoding(this);
 5889     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5890   %}
 5891   ins_pipe( pipe_slow );
 5892 %}
 5893 
 5894 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5895   predicate((UseAVX > 0) &&
 5896             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5897   match(Set dst (SubVI src (LoadVector mem)));
 5898   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5899   ins_encode %{
 5900     int vlen_enc = vector_length_encoding(this);
 5901     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5902   %}
 5903   ins_pipe( pipe_slow );
 5904 %}
 5905 
 5906 // Longs vector sub
 5907 instruct vsubL(vec dst, vec src) %{
 5908   predicate(UseAVX == 0);
 5909   match(Set dst (SubVL dst src));
 5910   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5911   ins_encode %{
 5912     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5913   %}
 5914   ins_pipe( pipe_slow );
 5915 %}
 5916 
 5917 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5918   predicate(UseAVX > 0);
 5919   match(Set dst (SubVL src1 src2));
 5920   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5921   ins_encode %{
 5922     int vlen_enc = vector_length_encoding(this);
 5923     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5924   %}
 5925   ins_pipe( pipe_slow );
 5926 %}
 5927 
 5928 
 5929 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5930   predicate((UseAVX > 0) &&
 5931             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5932   match(Set dst (SubVL src (LoadVector mem)));
 5933   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5934   ins_encode %{
 5935     int vlen_enc = vector_length_encoding(this);
 5936     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5937   %}
 5938   ins_pipe( pipe_slow );
 5939 %}
 5940 
 5941 // Floats vector sub
 5942 instruct vsubF(vec dst, vec src) %{
 5943   predicate(UseAVX == 0);
 5944   match(Set dst (SubVF dst src));
 5945   format %{ "subps   $dst,$src\t! sub packedF" %}
 5946   ins_encode %{
 5947     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5948   %}
 5949   ins_pipe( pipe_slow );
 5950 %}
 5951 
 5952 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5953   predicate(UseAVX > 0);
 5954   match(Set dst (SubVF src1 src2));
 5955   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5956   ins_encode %{
 5957     int vlen_enc = vector_length_encoding(this);
 5958     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5959   %}
 5960   ins_pipe( pipe_slow );
 5961 %}
 5962 
 5963 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5964   predicate((UseAVX > 0) &&
 5965             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5966   match(Set dst (SubVF src (LoadVector mem)));
 5967   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5968   ins_encode %{
 5969     int vlen_enc = vector_length_encoding(this);
 5970     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5971   %}
 5972   ins_pipe( pipe_slow );
 5973 %}
 5974 
 5975 // Doubles vector sub
 5976 instruct vsubD(vec dst, vec src) %{
 5977   predicate(UseAVX == 0);
 5978   match(Set dst (SubVD dst src));
 5979   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5980   ins_encode %{
 5981     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5982   %}
 5983   ins_pipe( pipe_slow );
 5984 %}
 5985 
 5986 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5987   predicate(UseAVX > 0);
 5988   match(Set dst (SubVD src1 src2));
 5989   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5990   ins_encode %{
 5991     int vlen_enc = vector_length_encoding(this);
 5992     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5993   %}
 5994   ins_pipe( pipe_slow );
 5995 %}
 5996 
 5997 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5998   predicate((UseAVX > 0) &&
 5999             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6000   match(Set dst (SubVD src (LoadVector mem)));
 6001   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6002   ins_encode %{
 6003     int vlen_enc = vector_length_encoding(this);
 6004     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6005   %}
 6006   ins_pipe( pipe_slow );
 6007 %}
 6008 
 6009 // --------------------------------- MUL --------------------------------------
 6010 
 6011 // Byte vector mul
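// x86 has no packed byte multiply, so the bytes are widened to 16-bit lanes
// (by sign extension or an odd/even split), multiplied with pmullw/vpmullw,
// and the low byte of each product is recombined.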
 6012 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6013   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6014   match(Set dst (MulVB src1 src2));
 6015   effect(TEMP dst, TEMP xtmp);
 6016   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6017   ins_encode %{
 6018     assert(UseSSE > 3, "required");
 6019     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6020     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6021     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6022     __ psllw($dst$$XMMRegister, 8);
 6023     __ psrlw($dst$$XMMRegister, 8);
 6024     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6025   %}
 6026   ins_pipe( pipe_slow );
 6027 %}
 6028 
 6029 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6030   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6031   match(Set dst (MulVB src1 src2));
 6032   effect(TEMP dst, TEMP xtmp);
 6033   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6034   ins_encode %{
 6035     assert(UseSSE > 3, "required");
 6036     // Odd-index elements
 6037     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6038     __ psrlw($dst$$XMMRegister, 8);
 6039     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6040     __ psrlw($xtmp$$XMMRegister, 8);
 6041     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6042     __ psllw($dst$$XMMRegister, 8);
 6043     // Even-index elements
 6044     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6045     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6046     __ psllw($xtmp$$XMMRegister, 8);
 6047     __ psrlw($xtmp$$XMMRegister, 8);
 6048     // Combine
 6049     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6050   %}
 6051   ins_pipe( pipe_slow );
 6052 %}
 6053 
 6054 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6055   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6056   match(Set dst (MulVB src1 src2));
 6057   effect(TEMP xtmp1, TEMP xtmp2);
 6058   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6059   ins_encode %{
 6060     int vlen_enc = vector_length_encoding(this);
 6061     // Odd-index elements
 6062     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6063     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6064     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6065     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6066     // Even-index elements
 6067     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6068     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6069     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6070     // Combine
 6071     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6072   %}
 6073   ins_pipe( pipe_slow );
 6074 %}
 6075 
 6076 // Shorts/Chars vector mul
 6077 instruct vmulS(vec dst, vec src) %{
 6078   predicate(UseAVX == 0);
 6079   match(Set dst (MulVS dst src));
 6080   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6081   ins_encode %{
 6082     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6083   %}
 6084   ins_pipe( pipe_slow );
 6085 %}
 6086 
 6087 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6088   predicate(UseAVX > 0);
 6089   match(Set dst (MulVS src1 src2));
 6090   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6091   ins_encode %{
 6092     int vlen_enc = vector_length_encoding(this);
 6093     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6094   %}
 6095   ins_pipe( pipe_slow );
 6096 %}
 6097 
 6098 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6099   predicate((UseAVX > 0) &&
 6100             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6101   match(Set dst (MulVS src (LoadVector mem)));
 6102   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6103   ins_encode %{
 6104     int vlen_enc = vector_length_encoding(this);
 6105     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6106   %}
 6107   ins_pipe( pipe_slow );
 6108 %}
 6109 
 6110 // Integers vector mul
 6111 instruct vmulI(vec dst, vec src) %{
 6112   predicate(UseAVX == 0);
 6113   match(Set dst (MulVI dst src));
 6114   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6115   ins_encode %{
 6116     assert(UseSSE > 3, "required");
 6117     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6118   %}
 6119   ins_pipe( pipe_slow );
 6120 %}
 6121 
 6122 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6123   predicate(UseAVX > 0);
 6124   match(Set dst (MulVI src1 src2));
 6125   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6126   ins_encode %{
 6127     int vlen_enc = vector_length_encoding(this);
 6128     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6129   %}
 6130   ins_pipe( pipe_slow );
 6131 %}
 6132 
 6133 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6134   predicate((UseAVX > 0) &&
 6135             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6136   match(Set dst (MulVI src (LoadVector mem)));
 6137   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6138   ins_encode %{
 6139     int vlen_enc = vector_length_encoding(this);
 6140     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6141   %}
 6142   ins_pipe( pipe_slow );
 6143 %}
 6144 
 6145 // Longs vector mul
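// A packed 64-bit multiply (evpmullq) requires AVX-512DQ (plus AVX-512VL for
// vectors shorter than 512 bits); otherwise the product is synthesized from
// 32-bit multiplies in the vmulL rules below.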
 6146 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6147   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6148              VM_Version::supports_avx512dq()) ||
 6149             VM_Version::supports_avx512vldq());
 6150   match(Set dst (MulVL src1 src2));
 6151   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6152   ins_encode %{
 6153     assert(UseAVX > 2, "required");
 6154     int vlen_enc = vector_length_encoding(this);
 6155     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6156   %}
 6157   ins_pipe( pipe_slow );
 6158 %}
 6159 
 6160 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6161   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6162              VM_Version::supports_avx512dq()) ||
 6163             (Matcher::vector_length_in_bytes(n) > 8 &&
 6164              VM_Version::supports_avx512vldq()));
 6165   match(Set dst (MulVL src (LoadVector mem)));
 6166   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6167   ins_encode %{
 6168     assert(UseAVX > 2, "required");
 6169     int vlen_enc = vector_length_encoding(this);
 6170     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6171   %}
 6172   ins_pipe( pipe_slow );
 6173 %}
 6174 
 6175 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6176   predicate(UseAVX == 0);
 6177   match(Set dst (MulVL src1 src2));
 6178   effect(TEMP dst, TEMP xtmp);
 6179   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6180   ins_encode %{
 6181     assert(VM_Version::supports_sse4_1(), "required");
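    // With a = a_hi:a_lo and b = b_hi:b_lo split into 32-bit halves,
    // a*b mod 2^64 = a_lo*b_lo + ((a_hi*b_lo + a_lo*b_hi) << 32).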
    // Get the lo-hi products; only the lower 32 bits of each are needed
 6183     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6184     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6185     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6186     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6187     __ psllq($dst$$XMMRegister, 32);
 6188     // Get the lo-lo products
 6189     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6190     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6191     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6192   %}
 6193   ins_pipe( pipe_slow );
 6194 %}
 6195 
 6196 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6197   predicate(UseAVX > 0 &&
 6198             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6199               !VM_Version::supports_avx512dq()) ||
 6200              (Matcher::vector_length_in_bytes(n) < 64 &&
 6201               !VM_Version::supports_avx512vldq())));
 6202   match(Set dst (MulVL src1 src2));
 6203   effect(TEMP xtmp1, TEMP xtmp2);
 6204   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6205   ins_encode %{
 6206     int vlen_enc = vector_length_encoding(this);
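           // Same a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32) decomposition as vmulL
           // above, written with three-operand AVX forms.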
 6207     // Get the lo-hi cross products; only their lower 32 bits are of interest
 6208     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6209     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6210     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6211     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6212     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6213     // Get the lo-lo products
 6214     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6215     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6216   %}
 6217   ins_pipe( pipe_slow );
 6218 %}
 6219 
 6220 // Floats vector mul
 6221 instruct vmulF(vec dst, vec src) %{
 6222   predicate(UseAVX == 0);
 6223   match(Set dst (MulVF dst src));
 6224   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6225   ins_encode %{
 6226     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6227   %}
 6228   ins_pipe( pipe_slow );
 6229 %}
 6230 
 6231 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6232   predicate(UseAVX > 0);
 6233   match(Set dst (MulVF src1 src2));
 6234   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6235   ins_encode %{
 6236     int vlen_enc = vector_length_encoding(this);
 6237     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6238   %}
 6239   ins_pipe( pipe_slow );
 6240 %}
 6241 
 6242 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6243   predicate((UseAVX > 0) &&
 6244             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6245   match(Set dst (MulVF src (LoadVector mem)));
 6246   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6247   ins_encode %{
 6248     int vlen_enc = vector_length_encoding(this);
 6249     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6250   %}
 6251   ins_pipe( pipe_slow );
 6252 %}
 6253 
 6254 // Doubles vector mul
 6255 instruct vmulD(vec dst, vec src) %{
 6256   predicate(UseAVX == 0);
 6257   match(Set dst (MulVD dst src));
 6258   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6259   ins_encode %{
 6260     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6261   %}
 6262   ins_pipe( pipe_slow );
 6263 %}
 6264 
 6265 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6266   predicate(UseAVX > 0);
 6267   match(Set dst (MulVD src1 src2));
 6268   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6269   ins_encode %{
 6270     int vlen_enc = vector_length_encoding(this);
 6271     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6272   %}
 6273   ins_pipe( pipe_slow );
 6274 %}
 6275 
 6276 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6277   predicate((UseAVX > 0) &&
 6278             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6279   match(Set dst (MulVD src (LoadVector mem)));
 6280   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6281   ins_encode %{
 6282     int vlen_enc = vector_length_encoding(this);
 6283     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6284   %}
 6285   ins_pipe( pipe_slow );
 6286 %}
 6287 
 6288 // --------------------------------- DIV --------------------------------------
 6289 
 6290 // Floats vector div
 6291 instruct vdivF(vec dst, vec src) %{
 6292   predicate(UseAVX == 0);
 6293   match(Set dst (DivVF dst src));
 6294   format %{ "divps   $dst,$src\t! div packedF" %}
 6295   ins_encode %{
 6296     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6297   %}
 6298   ins_pipe( pipe_slow );
 6299 %}
 6300 
 6301 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6302   predicate(UseAVX > 0);
 6303   match(Set dst (DivVF src1 src2));
 6304   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6305   ins_encode %{
 6306     int vlen_enc = vector_length_encoding(this);
 6307     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6308   %}
 6309   ins_pipe( pipe_slow );
 6310 %}
 6311 
 6312 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6313   predicate((UseAVX > 0) &&
 6314             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6315   match(Set dst (DivVF src (LoadVector mem)));
 6316   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6317   ins_encode %{
 6318     int vlen_enc = vector_length_encoding(this);
 6319     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6320   %}
 6321   ins_pipe( pipe_slow );
 6322 %}
 6323 
 6324 // Doubles vector div
 6325 instruct vdivD(vec dst, vec src) %{
 6326   predicate(UseAVX == 0);
 6327   match(Set dst (DivVD dst src));
 6328   format %{ "divpd   $dst,$src\t! div packedD" %}
 6329   ins_encode %{
 6330     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6331   %}
 6332   ins_pipe( pipe_slow );
 6333 %}
 6334 
 6335 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6336   predicate(UseAVX > 0);
 6337   match(Set dst (DivVD src1 src2));
 6338   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6339   ins_encode %{
 6340     int vlen_enc = vector_length_encoding(this);
 6341     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6342   %}
 6343   ins_pipe( pipe_slow );
 6344 %}
 6345 
 6346 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6347   predicate((UseAVX > 0) &&
 6348             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6349   match(Set dst (DivVD src (LoadVector mem)));
 6350   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6351   ins_encode %{
 6352     int vlen_enc = vector_length_encoding(this);
 6353     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6354   %}
 6355   ins_pipe( pipe_slow );
 6356 %}
 6357 
 6358 // ------------------------------ MinMax ---------------------------------------
 6359 
 6360 // Byte, Short, Int vector Min/Max
 6361 instruct minmax_reg_sse(vec dst, vec src) %{
 6362   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6363             UseAVX == 0);
 6364   match(Set dst (MinV dst src));
 6365   match(Set dst (MaxV dst src));
 6366   format %{ "vector_minmax  $dst,$src\t!  " %}
 6367   ins_encode %{
 6368     assert(UseSSE >= 4, "required");
 6369 
 6370     int opcode = this->ideal_Opcode();
 6371     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6372     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6373   %}
 6374   ins_pipe( pipe_slow );
 6375 %}
 6376 
 6377 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6378   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6379             UseAVX > 0);
 6380   match(Set dst (MinV src1 src2));
 6381   match(Set dst (MaxV src1 src2));
 6382   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6383   ins_encode %{
 6384     int opcode = this->ideal_Opcode();
 6385     int vlen_enc = vector_length_encoding(this);
 6386     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6387 
 6388     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6389   %}
 6390   ins_pipe( pipe_slow );
 6391 %}
 6392 
 6393 // Long vector Min/Max
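       // Note: there is no packed min/max instruction for 64-bit lanes before
       // AVX-512 (evpminsq/evpmaxsq), so the SSE/AVX variants below are expanded
       // by the macro assembler into a compare-and-blend sequence.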
 6394 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6395   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6396             UseAVX == 0);
 6397   match(Set dst (MinV dst src));
 6398   match(Set dst (MaxV src dst));
 6399   effect(TEMP dst, TEMP tmp);
 6400   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6401   ins_encode %{
 6402     assert(UseSSE >= 4, "required");
 6403 
 6404     int opcode = this->ideal_Opcode();
 6405     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6406     assert(elem_bt == T_LONG, "sanity");
 6407 
 6408     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6409   %}
 6410   ins_pipe( pipe_slow );
 6411 %}
 6412 
 6413 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6414   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6415             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6416   match(Set dst (MinV src1 src2));
 6417   match(Set dst (MaxV src1 src2));
 6418   effect(TEMP dst);
 6419   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6420   ins_encode %{
 6421     int vlen_enc = vector_length_encoding(this);
 6422     int opcode = this->ideal_Opcode();
 6423     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6424     assert(elem_bt == T_LONG, "sanity");
 6425 
 6426     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6427   %}
 6428   ins_pipe( pipe_slow );
 6429 %}
 6430 
 6431 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6432   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6433             Matcher::vector_element_basic_type(n) == T_LONG);
 6434   match(Set dst (MinV src1 src2));
 6435   match(Set dst (MaxV src1 src2));
 6436   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6437   ins_encode %{
 6438     assert(UseAVX > 2, "required");
 6439 
 6440     int vlen_enc = vector_length_encoding(this);
 6441     int opcode = this->ideal_Opcode();
 6442     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6443     assert(elem_bt == T_LONG, "sanity");
 6444 
 6445     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6446   %}
 6447   ins_pipe( pipe_slow );
 6448 %}
 6449 
 6450 // Float/Double vector Min/Max
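       // Note: Java Math.min/max semantics (NaN propagates, -0.0 < +0.0) do not
       // match plain minps/maxps, so the macro-assembler expansions below need
       // extra blend/fixup steps and temporary registers.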
 6451 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6452   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6453             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6454             UseAVX > 0);
 6455   match(Set dst (MinV a b));
 6456   match(Set dst (MaxV a b));
 6457   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6458   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6459   ins_encode %{
 6460     assert(UseAVX > 0, "required");
 6461 
 6462     int opcode = this->ideal_Opcode();
 6463     int vlen_enc = vector_length_encoding(this);
 6464     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6465 
 6466     __ vminmax_fp(opcode, elem_bt,
 6467                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6468                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6469   %}
 6470   ins_pipe( pipe_slow );
 6471 %}
 6472 
 6473 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6474   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6475             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6476   match(Set dst (MinV a b));
 6477   match(Set dst (MaxV a b));
 6478   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6479   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6480   ins_encode %{
 6481     assert(UseAVX > 2, "required");
 6482 
 6483     int opcode = this->ideal_Opcode();
 6484     int vlen_enc = vector_length_encoding(this);
 6485     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6486 
 6487     __ evminmax_fp(opcode, elem_bt,
 6488                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6489                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6490   %}
 6491   ins_pipe( pipe_slow );
 6492 %}
 6493 
 6494 // --------------------------------- Signum/CopySign ---------------------------
 6495 
 6496 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6497   match(Set dst (SignumF dst (Binary zero one)));
 6498   effect(KILL cr);
 6499   format %{ "signumF $dst, $dst" %}
 6500   ins_encode %{
 6501     int opcode = this->ideal_Opcode();
 6502     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6503   %}
 6504   ins_pipe( pipe_slow );
 6505 %}
 6506 
 6507 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6508   match(Set dst (SignumD dst (Binary zero one)));
 6509   effect(KILL cr);
 6510   format %{ "signumD $dst, $dst" %}
 6511   ins_encode %{
 6512     int opcode = this->ideal_Opcode();
 6513     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6514   %}
 6515   ins_pipe( pipe_slow );
 6516 %}
 6517 
 6518 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6519   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6520   match(Set dst (SignumVF src (Binary zero one)));
 6521   match(Set dst (SignumVD src (Binary zero one)));
 6522   effect(TEMP dst, TEMP xtmp1);
 6523   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6524   ins_encode %{
 6525     int opcode = this->ideal_Opcode();
 6526     int vec_enc = vector_length_encoding(this);
 6527     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6528                          $xtmp1$$XMMRegister, vec_enc);
 6529   %}
 6530   ins_pipe( pipe_slow );
 6531 %}
 6532 
 6533 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6534   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6535   match(Set dst (SignumVF src (Binary zero one)));
 6536   match(Set dst (SignumVD src (Binary zero one)));
 6537   effect(TEMP dst, TEMP ktmp1);
 6538   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6539   ins_encode %{
 6540     int opcode = this->ideal_Opcode();
 6541     int vec_enc = vector_length_encoding(this);
 6542     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6543                           $ktmp1$$KRegister, vec_enc);
 6544   %}
 6545   ins_pipe( pipe_slow );
 6546 %}
 6547 
 6548 // ---------------------------------------
 6549 // For copySign use 0xE4 as the imm8 (truth-table) selector for vpternlog
 6550 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6551 // C (xmm2) is set to 0x7FFFFFFF
 6552 // Wherever xmm2 is 0, we want to pick from B (the sign source)
 6553 // Wherever xmm2 is 1, we want to pick from A (the magnitude source)
 6554 //
 6555 // A B C Result
 6556 // 0 0 0 0
 6557 // 0 0 1 0
 6558 // 0 1 0 1
 6559 // 0 1 1 0
 6560 // 1 0 0 0
 6561 // 1 0 1 1
 6562 // 1 1 0 1
 6563 // 1 1 1 1
 6564 //
 6565 // Result going from high bit to low bit is 0b11100100 = 0xE4
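       // (vpternlog forms the 3-bit index (A<<2 | B<<1 | C) for each bit position and
       // looks up the result bit at that index in the imm8.)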
 6566 // ---------------------------------------
 6567 
 6568 #ifdef _LP64
 6569 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6570   match(Set dst (CopySignF dst src));
 6571   effect(TEMP tmp1, TEMP tmp2);
 6572   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6573   ins_encode %{
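           // Materialize the 0x7FFFFFFF mask in $tmp1, then ternary-blend: magnitude
           // bits (mask = 1) come from $dst, the sign bit (mask = 0) from $src.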
 6574     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6575     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6576     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6577   %}
 6578   ins_pipe( pipe_slow );
 6579 %}
 6580 
 6581 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6582   match(Set dst (CopySignD dst (Binary src zero)));
 6583   ins_cost(100);
 6584   effect(TEMP tmp1, TEMP tmp2);
 6585   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6586   ins_encode %{
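           // Same blend as copySignF_reg, using the 64-bit 0x7FFFFFFFFFFFFFFF mask.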
 6587     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6588     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6589     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6590   %}
 6591   ins_pipe( pipe_slow );
 6592 %}
 6593 
 6594 #endif // _LP64
 6595 
 6596 //----------------------------- CompressBits/ExpandBits ------------------------
 6597 
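       // BMI2 pext gathers the bits of src selected by mask into the low-order bits
       // of dst; pdep scatters the low-order bits of src to the bit positions selected
       // by mask. They map directly onto CompressBits/ExpandBits.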
 6598 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6599   predicate(n->bottom_type()->isa_int());
 6600   match(Set dst (CompressBits src mask));
 6601   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6602   ins_encode %{
 6603     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6604   %}
 6605   ins_pipe( pipe_slow );
 6606 %}
 6607 
 6608 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6609   predicate(n->bottom_type()->isa_int());
 6610   match(Set dst (ExpandBits src mask));
 6611   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6612   ins_encode %{
 6613     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6614   %}
 6615   ins_pipe( pipe_slow );
 6616 %}
 6617 
 6618 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6619   predicate(n->bottom_type()->isa_int());
 6620   match(Set dst (CompressBits src (LoadI mask)));
 6621   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6622   ins_encode %{
 6623     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6624   %}
 6625   ins_pipe( pipe_slow );
 6626 %}
 6627 
 6628 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6629   predicate(n->bottom_type()->isa_int());
 6630   match(Set dst (ExpandBits src (LoadI mask)));
 6631   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6632   ins_encode %{
 6633     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6634   %}
 6635   ins_pipe( pipe_slow );
 6636 %}
 6637 
 6638 // --------------------------------- Sqrt --------------------------------------
 6639 
 6640 instruct vsqrtF_reg(vec dst, vec src) %{
 6641   match(Set dst (SqrtVF src));
 6642   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6643   ins_encode %{
 6644     assert(UseAVX > 0, "required");
 6645     int vlen_enc = vector_length_encoding(this);
 6646     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6647   %}
 6648   ins_pipe( pipe_slow );
 6649 %}
 6650 
 6651 instruct vsqrtF_mem(vec dst, memory mem) %{
 6652   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6653   match(Set dst (SqrtVF (LoadVector mem)));
 6654   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6655   ins_encode %{
 6656     assert(UseAVX > 0, "required");
 6657     int vlen_enc = vector_length_encoding(this);
 6658     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6659   %}
 6660   ins_pipe( pipe_slow );
 6661 %}
 6662 
 6663 // Floating point vector sqrt
 6664 instruct vsqrtD_reg(vec dst, vec src) %{
 6665   match(Set dst (SqrtVD src));
 6666   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6667   ins_encode %{
 6668     assert(UseAVX > 0, "required");
 6669     int vlen_enc = vector_length_encoding(this);
 6670     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6671   %}
 6672   ins_pipe( pipe_slow );
 6673 %}
 6674 
 6675 instruct vsqrtD_mem(vec dst, memory mem) %{
 6676   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6677   match(Set dst (SqrtVD (LoadVector mem)));
 6678   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6679   ins_encode %{
 6680     assert(UseAVX > 0, "required");
 6681     int vlen_enc = vector_length_encoding(this);
 6682     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6683   %}
 6684   ins_pipe( pipe_slow );
 6685 %}
 6686 
 6687 // ------------------------------ Shift ---------------------------------------
 6688 
 6689 // Left and right shift count vectors are the same on x86
 6690 // (only lowest bits of xmm reg are used for count).
 6691 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6692   match(Set dst (LShiftCntV cnt));
 6693   match(Set dst (RShiftCntV cnt));
 6694   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6695   ins_encode %{
 6696     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6697   %}
 6698   ins_pipe( pipe_slow );
 6699 %}
 6700 
 6701 // Byte vector shift
 6702 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6703   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6704   match(Set dst ( LShiftVB src shift));
 6705   match(Set dst ( RShiftVB src shift));
 6706   match(Set dst (URShiftVB src shift));
 6707   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6708   format %{"vector_byte_shift $dst,$src,$shift" %}
 6709   ins_encode %{
 6710     assert(UseSSE > 3, "required");
 6711     int opcode = this->ideal_Opcode();
 6712     bool sign = (opcode != Op_URShiftVB);
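           // x86 has no byte-granular shift, so widen bytes to words, shift, mask the
           // results back into byte range and re-pack.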
 6713     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6714     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6715     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6716     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6717     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6718   %}
 6719   ins_pipe( pipe_slow );
 6720 %}
 6721 
 6722 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6723   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6724             UseAVX <= 1);
 6725   match(Set dst ( LShiftVB src shift));
 6726   match(Set dst ( RShiftVB src shift));
 6727   match(Set dst (URShiftVB src shift));
 6728   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6729   format %{"vector_byte_shift $dst,$src,$shift" %}
 6730   ins_encode %{
 6731     assert(UseSSE > 3, "required");
 6732     int opcode = this->ideal_Opcode();
 6733     bool sign = (opcode != Op_URShiftVB);
 6734     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6735     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6736     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6737     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6738     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6739     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6740     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6741     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6742     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6743   %}
 6744   ins_pipe( pipe_slow );
 6745 %}
 6746 
 6747 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6748   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6749             UseAVX > 1);
 6750   match(Set dst ( LShiftVB src shift));
 6751   match(Set dst ( RShiftVB src shift));
 6752   match(Set dst (URShiftVB src shift));
 6753   effect(TEMP dst, TEMP tmp);
 6754   format %{"vector_byte_shift $dst,$src,$shift" %}
 6755   ins_encode %{
 6756     int opcode = this->ideal_Opcode();
 6757     bool sign = (opcode != Op_URShiftVB);
 6758     int vlen_enc = Assembler::AVX_256bit;
 6759     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6760     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6761     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6762     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6763     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6764   %}
 6765   ins_pipe( pipe_slow );
 6766 %}
 6767 
 6768 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6769   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6770   match(Set dst ( LShiftVB src shift));
 6771   match(Set dst ( RShiftVB src shift));
 6772   match(Set dst (URShiftVB src shift));
 6773   effect(TEMP dst, TEMP tmp);
 6774   format %{"vector_byte_shift $dst,$src,$shift" %}
 6775   ins_encode %{
 6776     assert(UseAVX > 1, "required");
 6777     int opcode = this->ideal_Opcode();
 6778     bool sign = (opcode != Op_URShiftVB);
 6779     int vlen_enc = Assembler::AVX_256bit;
 6780     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6781     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6782     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6783     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6784     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6785     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6786     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6787     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6788     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6789   %}
 6790   ins_pipe( pipe_slow );
 6791 %}
 6792 
 6793 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6794   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6795   match(Set dst ( LShiftVB src shift));
 6796   match(Set dst ( RShiftVB src shift));
 6797   match(Set dst (URShiftVB src shift));
 6798   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6799   format %{"vector_byte_shift $dst,$src,$shift" %}
 6800   ins_encode %{
 6801     assert(UseAVX > 2, "required");
 6802     int opcode = this->ideal_Opcode();
 6803     bool sign = (opcode != Op_URShiftVB);
 6804     int vlen_enc = Assembler::AVX_512bit;
 6805     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6806     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6807     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6808     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6809     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6810     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6811     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6812     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6813     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6814     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6815     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6816     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6817   %}
 6818   ins_pipe( pipe_slow );
 6819 %}
 6820 
 6821 // A logical right shift of a shorts vector produces an incorrect Java result
 6822 // for negative data, because Java code converts short values to int with sign
 6823 // extension before shifting. Char vectors are fine, though, since chars are
 6824 // unsigned values.
 6825 // Shorts/Chars vector shift
 6826 instruct vshiftS(vec dst, vec src, vec shift) %{
 6827   predicate(!n->as_ShiftV()->is_var_shift());
 6828   match(Set dst ( LShiftVS src shift));
 6829   match(Set dst ( RShiftVS src shift));
 6830   match(Set dst (URShiftVS src shift));
 6831   effect(TEMP dst, USE src, USE shift);
 6832   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6833   ins_encode %{
 6834     int opcode = this->ideal_Opcode();
 6835     if (UseAVX > 0) {
 6836       int vlen_enc = vector_length_encoding(this);
 6837       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6838     } else {
 6839       int vlen = Matcher::vector_length(this);
 6840       if (vlen == 2) {
 6841         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6842         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6843       } else if (vlen == 4) {
 6844         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6845         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6846       } else {
 6847         assert (vlen == 8, "sanity");
 6848         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6849         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6850       }
 6851     }
 6852   %}
 6853   ins_pipe( pipe_slow );
 6854 %}
 6855 
 6856 // Integers vector shift
 6857 instruct vshiftI(vec dst, vec src, vec shift) %{
 6858   predicate(!n->as_ShiftV()->is_var_shift());
 6859   match(Set dst ( LShiftVI src shift));
 6860   match(Set dst ( RShiftVI src shift));
 6861   match(Set dst (URShiftVI src shift));
 6862   effect(TEMP dst, USE src, USE shift);
 6863   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6864   ins_encode %{
 6865     int opcode = this->ideal_Opcode();
 6866     if (UseAVX > 0) {
 6867       int vlen_enc = vector_length_encoding(this);
 6868       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6869     } else {
 6870       int vlen = Matcher::vector_length(this);
 6871       if (vlen == 2) {
 6872         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6873         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6874       } else {
 6875         assert(vlen == 4, "sanity");
 6876         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6877         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6878       }
 6879     }
 6880   %}
 6881   ins_pipe( pipe_slow );
 6882 %}
 6883 
 6884 // Integers vector constant shift
 6885 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6886   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6887   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6888   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6889   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6890   ins_encode %{
 6891     int opcode = this->ideal_Opcode();
 6892     if (UseAVX > 0) {
 6893       int vector_len = vector_length_encoding(this);
 6894       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6895     } else {
 6896       int vlen = Matcher::vector_length(this);
 6897       if (vlen == 2) {
 6898         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6899         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6900       } else {
 6901         assert(vlen == 4, "sanity");
 6902         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6903         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6904       }
 6905     }
 6906   %}
 6907   ins_pipe( pipe_slow );
 6908 %}
 6909 
 6910 // Longs vector shift
 6911 instruct vshiftL(vec dst, vec src, vec shift) %{
 6912   predicate(!n->as_ShiftV()->is_var_shift());
 6913   match(Set dst ( LShiftVL src shift));
 6914   match(Set dst (URShiftVL src shift));
 6915   effect(TEMP dst, USE src, USE shift);
 6916   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6917   ins_encode %{
 6918     int opcode = this->ideal_Opcode();
 6919     if (UseAVX > 0) {
 6920       int vlen_enc = vector_length_encoding(this);
 6921       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6922     } else {
 6923       assert(Matcher::vector_length(this) == 2, "");
 6924       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6925       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6926     }
 6927   %}
 6928   ins_pipe( pipe_slow );
 6929 %}
 6930 
 6931 // Longs vector constant shift
 6932 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6933   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6934   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6935   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6936   ins_encode %{
 6937     int opcode = this->ideal_Opcode();
 6938     if (UseAVX > 0) {
 6939       int vector_len = vector_length_encoding(this);
 6940       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6941     } else {
 6942       assert(Matcher::vector_length(this) == 2, "");
 6943       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6944       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6945     }
 6946   %}
 6947   ins_pipe( pipe_slow );
 6948 %}
 6949 
 6950 // ------------------- Arithmetic Right Shift ---------------------------------
 6951 // Long vector arithmetic right shift
 6952 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6953   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6954   match(Set dst (RShiftVL src shift));
 6955   effect(TEMP dst, TEMP tmp);
 6956   format %{ "vshiftq $dst,$src,$shift" %}
 6957   ins_encode %{
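           // There is no psraq before AVX-512; emulate the arithmetic shift as
           //   t = sign_mask >>> s;  result = ((x >>> s) ^ t) - t
           // which sign-extends the bits shifted in.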
 6958     uint vlen = Matcher::vector_length(this);
 6959     if (vlen == 2) {
 6960       assert(UseSSE >= 2, "required");
 6961       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6962       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6963       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6964       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6965       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6966       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6967     } else {
 6968       assert(vlen == 4, "sanity");
 6969       assert(UseAVX > 1, "required");
 6970       int vlen_enc = Assembler::AVX_256bit;
 6971       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6972       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6973       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6974       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6975       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6976     }
 6977   %}
 6978   ins_pipe( pipe_slow );
 6979 %}
 6980 
 6981 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6982   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6983   match(Set dst (RShiftVL src shift));
 6984   format %{ "vshiftq $dst,$src,$shift" %}
 6985   ins_encode %{
 6986     int vlen_enc = vector_length_encoding(this);
 6987     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6988   %}
 6989   ins_pipe( pipe_slow );
 6990 %}
 6991 
 6992 // ------------------- Variable Shift -----------------------------
 6993 // Byte variable shift
 6994 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6995   predicate(Matcher::vector_length(n) <= 8 &&
 6996             n->as_ShiftV()->is_var_shift() &&
 6997             !VM_Version::supports_avx512bw());
 6998   match(Set dst ( LShiftVB src shift));
 6999   match(Set dst ( RShiftVB src shift));
 7000   match(Set dst (URShiftVB src shift));
 7001   effect(TEMP dst, TEMP vtmp);
 7002   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7003   ins_encode %{
 7004     assert(UseAVX >= 2, "required");
 7005 
 7006     int opcode = this->ideal_Opcode();
 7007     int vlen_enc = Assembler::AVX_128bit;
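           // Widen bytes to words, apply the per-element shift, then narrow the low
           // byte of each word result back with vpackuswb.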
 7008     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7009     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7010   %}
 7011   ins_pipe( pipe_slow );
 7012 %}
 7013 
 7014 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7015   predicate(Matcher::vector_length(n) == 16 &&
 7016             n->as_ShiftV()->is_var_shift() &&
 7017             !VM_Version::supports_avx512bw());
 7018   match(Set dst ( LShiftVB src shift));
 7019   match(Set dst ( RShiftVB src shift));
 7020   match(Set dst (URShiftVB src shift));
 7021   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7022   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7023   ins_encode %{
 7024     assert(UseAVX >= 2, "required");
 7025 
 7026     int opcode = this->ideal_Opcode();
 7027     int vlen_enc = Assembler::AVX_128bit;
 7028     // Shift lower half and get word result in dst
 7029     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7030 
 7031     // Shift upper half and get word result in vtmp1
 7032     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7033     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7034     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7035 
 7036     // Merge and down convert the two word results to byte in dst
 7037     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7038   %}
 7039   ins_pipe( pipe_slow );
 7040 %}
 7041 
 7042 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7043   predicate(Matcher::vector_length(n) == 32 &&
 7044             n->as_ShiftV()->is_var_shift() &&
 7045             !VM_Version::supports_avx512bw());
 7046   match(Set dst ( LShiftVB src shift));
 7047   match(Set dst ( RShiftVB src shift));
 7048   match(Set dst (URShiftVB src shift));
 7049   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7050   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7051   ins_encode %{
 7052     assert(UseAVX >= 2, "required");
 7053 
 7054     int opcode = this->ideal_Opcode();
 7055     int vlen_enc = Assembler::AVX_128bit;
 7056     // Process lower 128 bits and get result in dst
 7057     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7058     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7059     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7060     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7061     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7062 
 7063     // Process higher 128 bits and get result in vtmp3
 7064     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7065     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7066     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7067     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7068     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7069     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7070     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7071 
 7072     // Merge the two results in dst
 7073     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7074   %}
 7075   ins_pipe( pipe_slow );
 7076 %}
 7077 
 7078 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7079   predicate(Matcher::vector_length(n) <= 32 &&
 7080             n->as_ShiftV()->is_var_shift() &&
 7081             VM_Version::supports_avx512bw());
 7082   match(Set dst ( LShiftVB src shift));
 7083   match(Set dst ( RShiftVB src shift));
 7084   match(Set dst (URShiftVB src shift));
 7085   effect(TEMP dst, TEMP vtmp);
 7086   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7087   ins_encode %{
 7088     assert(UseAVX > 2, "required");
 7089 
 7090     int opcode = this->ideal_Opcode();
 7091     int vlen_enc = vector_length_encoding(this);
 7092     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7093   %}
 7094   ins_pipe( pipe_slow );
 7095 %}
 7096 
 7097 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7098   predicate(Matcher::vector_length(n) == 64 &&
 7099             n->as_ShiftV()->is_var_shift() &&
 7100             VM_Version::supports_avx512bw());
 7101   match(Set dst ( LShiftVB src shift));
 7102   match(Set dst ( RShiftVB src shift));
 7103   match(Set dst (URShiftVB src shift));
 7104   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7105   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7106   ins_encode %{
 7107     assert(UseAVX > 2, "required");
 7108 
 7109     int opcode = this->ideal_Opcode();
 7110     int vlen_enc = Assembler::AVX_256bit;
 7111     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7112     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7113     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7114     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7115     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7116   %}
 7117   ins_pipe( pipe_slow );
 7118 %}
 7119 
 7120 // Short variable shift
 7121 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7122   predicate(Matcher::vector_length(n) <= 8 &&
 7123             n->as_ShiftV()->is_var_shift() &&
 7124             !VM_Version::supports_avx512bw());
 7125   match(Set dst ( LShiftVS src shift));
 7126   match(Set dst ( RShiftVS src shift));
 7127   match(Set dst (URShiftVS src shift));
 7128   effect(TEMP dst, TEMP vtmp);
 7129   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7130   ins_encode %{
 7131     assert(UseAVX >= 2, "required");
 7132 
 7133     int opcode = this->ideal_Opcode();
 7134     bool sign = (opcode != Op_URShiftVS);
 7135     int vlen_enc = Assembler::AVX_256bit;
 7136     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7137     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7138     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7139     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7140     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7141     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7142   %}
 7143   ins_pipe( pipe_slow );
 7144 %}
 7145 
 7146 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7147   predicate(Matcher::vector_length(n) == 16 &&
 7148             n->as_ShiftV()->is_var_shift() &&
 7149             !VM_Version::supports_avx512bw());
 7150   match(Set dst ( LShiftVS src shift));
 7151   match(Set dst ( RShiftVS src shift));
 7152   match(Set dst (URShiftVS src shift));
 7153   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7154   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7155   ins_encode %{
 7156     assert(UseAVX >= 2, "required");
 7157 
 7158     int opcode = this->ideal_Opcode();
 7159     bool sign = (opcode != Op_URShiftVS);
 7160     int vlen_enc = Assembler::AVX_256bit;
 7161     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7162     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7163     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7164     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7165     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7166 
 7167     // Shift upper half, with result in dst using vtmp1 as TEMP
 7168     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7169     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7170     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7171     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7172     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7173     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7174 
 7175     // Merge lower and upper half result into dst
 7176     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7177     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7178   %}
 7179   ins_pipe( pipe_slow );
 7180 %}
 7181 
 7182 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7183   predicate(n->as_ShiftV()->is_var_shift() &&
 7184             VM_Version::supports_avx512bw());
 7185   match(Set dst ( LShiftVS src shift));
 7186   match(Set dst ( RShiftVS src shift));
 7187   match(Set dst (URShiftVS src shift));
 7188   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7189   ins_encode %{
 7190     assert(UseAVX > 2, "required");
 7191 
 7192     int opcode = this->ideal_Opcode();
 7193     int vlen_enc = vector_length_encoding(this);
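           // Without AVX512VL the variable word-shift instructions are only encodable
           // at 512-bit width, so promote the encoding (the extra upper lanes are
           // don't-care).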
 7194     if (!VM_Version::supports_avx512vl()) {
 7195       vlen_enc = Assembler::AVX_512bit;
 7196     }
 7197     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7198   %}
 7199   ins_pipe( pipe_slow );
 7200 %}
 7201 
 7202 // Integer variable shift
 7203 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7204   predicate(n->as_ShiftV()->is_var_shift());
 7205   match(Set dst ( LShiftVI src shift));
 7206   match(Set dst ( RShiftVI src shift));
 7207   match(Set dst (URShiftVI src shift));
 7208   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7209   ins_encode %{
 7210     assert(UseAVX >= 2, "required");
 7211 
 7212     int opcode = this->ideal_Opcode();
 7213     int vlen_enc = vector_length_encoding(this);
 7214     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7215   %}
 7216   ins_pipe( pipe_slow );
 7217 %}
 7218 
 7219 // Long variable shift
 7220 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7221   predicate(n->as_ShiftV()->is_var_shift());
 7222   match(Set dst ( LShiftVL src shift));
 7223   match(Set dst (URShiftVL src shift));
 7224   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7225   ins_encode %{
 7226     assert(UseAVX >= 2, "required");
 7227 
 7228     int opcode = this->ideal_Opcode();
 7229     int vlen_enc = vector_length_encoding(this);
 7230     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7231   %}
 7232   ins_pipe( pipe_slow );
 7233 %}
 7234 
 7235 // Long variable arithmetic right shift
 7236 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7237   predicate(Matcher::vector_length(n) <= 4 &&
 7238             n->as_ShiftV()->is_var_shift() &&
 7239             UseAVX == 2);
 7240   match(Set dst (RShiftVL src shift));
 7241   effect(TEMP dst, TEMP vtmp);
 7242   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7243   ins_encode %{
 7244     int opcode = this->ideal_Opcode();
 7245     int vlen_enc = vector_length_encoding(this);
 7246     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7247                  $vtmp$$XMMRegister);
 7248   %}
 7249   ins_pipe( pipe_slow );
 7250 %}
 7251 
 7252 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7253   predicate(n->as_ShiftV()->is_var_shift() &&
 7254             UseAVX > 2);
 7255   match(Set dst (RShiftVL src shift));
 7256   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7257   ins_encode %{
 7258     int opcode = this->ideal_Opcode();
 7259     int vlen_enc = vector_length_encoding(this);
 7260     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7261   %}
 7262   ins_pipe( pipe_slow );
 7263 %}
 7264 
 7265 // --------------------------------- AND --------------------------------------
 7266 
 7267 instruct vand(vec dst, vec src) %{
 7268   predicate(UseAVX == 0);
 7269   match(Set dst (AndV dst src));
 7270   format %{ "pand    $dst,$src\t! and vectors" %}
 7271   ins_encode %{
 7272     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7273   %}
 7274   ins_pipe( pipe_slow );
 7275 %}
 7276 
 7277 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7278   predicate(UseAVX > 0);
 7279   match(Set dst (AndV src1 src2));
 7280   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7281   ins_encode %{
 7282     int vlen_enc = vector_length_encoding(this);
 7283     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7284   %}
 7285   ins_pipe( pipe_slow );
 7286 %}
 7287 
 7288 instruct vand_mem(vec dst, vec src, memory mem) %{
 7289   predicate((UseAVX > 0) &&
 7290             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7291   match(Set dst (AndV src (LoadVector mem)));
 7292   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7293   ins_encode %{
 7294     int vlen_enc = vector_length_encoding(this);
 7295     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7296   %}
 7297   ins_pipe( pipe_slow );
 7298 %}
 7299 
 7300 // --------------------------------- OR ---------------------------------------
 7301 
 7302 instruct vor(vec dst, vec src) %{
 7303   predicate(UseAVX == 0);
 7304   match(Set dst (OrV dst src));
 7305   format %{ "por     $dst,$src\t! or vectors" %}
 7306   ins_encode %{
 7307     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7308   %}
 7309   ins_pipe( pipe_slow );
 7310 %}
 7311 
 7312 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7313   predicate(UseAVX > 0);
 7314   match(Set dst (OrV src1 src2));
 7315   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7316   ins_encode %{
 7317     int vlen_enc = vector_length_encoding(this);
 7318     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7319   %}
 7320   ins_pipe( pipe_slow );
 7321 %}
 7322 
 7323 instruct vor_mem(vec dst, vec src, memory mem) %{
 7324   predicate((UseAVX > 0) &&
 7325             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7326   match(Set dst (OrV src (LoadVector mem)));
 7327   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7328   ins_encode %{
 7329     int vlen_enc = vector_length_encoding(this);
 7330     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7331   %}
 7332   ins_pipe( pipe_slow );
 7333 %}
 7334 
 7335 // --------------------------------- XOR --------------------------------------
 7336 
 7337 instruct vxor(vec dst, vec src) %{
 7338   predicate(UseAVX == 0);
 7339   match(Set dst (XorV dst src));
 7340   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7341   ins_encode %{
 7342     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7343   %}
 7344   ins_pipe( pipe_slow );
 7345 %}
 7346 
 7347 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7348   predicate(UseAVX > 0);
 7349   match(Set dst (XorV src1 src2));
 7350   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7351   ins_encode %{
 7352     int vlen_enc = vector_length_encoding(this);
 7353     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7354   %}
 7355   ins_pipe( pipe_slow );
 7356 %}
 7357 
 7358 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7359   predicate((UseAVX > 0) &&
 7360             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7361   match(Set dst (XorV src (LoadVector mem)));
 7362   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7363   ins_encode %{
 7364     int vlen_enc = vector_length_encoding(this);
 7365     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7366   %}
 7367   ins_pipe( pipe_slow );
 7368 %}
 7369 
 7370 // --------------------------------- VectorCast --------------------------------------
 7371 
 7372 instruct vcastBtoX(vec dst, vec src) %{
 7373   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7374   match(Set dst (VectorCastB2X src));
 7375   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7376   ins_encode %{
 7377     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7378     int vlen_enc = vector_length_encoding(this);
 7379     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7380   %}
 7381   ins_pipe( pipe_slow );
 7382 %}
 7383 
 7384 instruct vcastBtoD(legVec dst, legVec src) %{
 7385   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7386   match(Set dst (VectorCastB2X src));
 7387   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7388   ins_encode %{
 7389     int vlen_enc = vector_length_encoding(this);
 7390     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7391   %}
 7392   ins_pipe( pipe_slow );
 7393 %}
 7394 
 7395 instruct castStoX(vec dst, vec src) %{
 7396   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7397             Matcher::vector_length(n->in(1)) <= 8 && // src
 7398             Matcher::vector_element_basic_type(n) == T_BYTE);
 7399   match(Set dst (VectorCastS2X src));
 7400   format %{ "vector_cast_s2x $dst,$src" %}
 7401   ins_encode %{
 7402     assert(UseAVX > 0, "required");
 7403 
 7404     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7405     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7406   %}
 7407   ins_pipe( pipe_slow );
 7408 %}
 7409 
 7410 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7411   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7412             Matcher::vector_length(n->in(1)) == 16 && // src
 7413             Matcher::vector_element_basic_type(n) == T_BYTE);
 7414   effect(TEMP dst, TEMP vtmp);
 7415   match(Set dst (VectorCastS2X src));
 7416   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7417   ins_encode %{
 7418     assert(UseAVX > 0, "required");
 7419 
 7420     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7421     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7422     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7423     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7424   %}
 7425   ins_pipe( pipe_slow );
 7426 %}
 7427 
 7428 instruct vcastStoX_evex(vec dst, vec src) %{
 7429   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7430             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7431   match(Set dst (VectorCastS2X src));
 7432   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7433   ins_encode %{
 7434     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7435     int src_vlen_enc = vector_length_encoding(this, $src);
 7436     int vlen_enc = vector_length_encoding(this);
 7437     switch (to_elem_bt) {
 7438       case T_BYTE:
 7439         if (!VM_Version::supports_avx512vl()) {
 7440           vlen_enc = Assembler::AVX_512bit;
 7441         }
 7442         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7443         break;
 7444       case T_INT:
 7445         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7446         break;
 7447       case T_FLOAT:
 7448         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7449         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7450         break;
 7451       case T_LONG:
 7452         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7453         break;
 7454       case T_DOUBLE: {
 7455         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7456         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7457         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7458         break;
 7459       }
 7460       default:
 7461         ShouldNotReachHere();
 7462     }
 7463   %}
 7464   ins_pipe( pipe_slow );
 7465 %}
 7466 
 7467 instruct castItoX(vec dst, vec src) %{
 7468   predicate(UseAVX <= 2 &&
 7469             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7470             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7471   match(Set dst (VectorCastI2X src));
 7472   format %{ "vector_cast_i2x $dst,$src" %}
 7473   ins_encode %{
 7474     assert(UseAVX > 0, "required");
 7475 
 7476     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7477     int vlen_enc = vector_length_encoding(this, $src);
 7478 
 7479     if (to_elem_bt == T_BYTE) {
 7480       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7481       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7482       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7483     } else {
 7484       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7485       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7486       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7487     }
 7488   %}
 7489   ins_pipe( pipe_slow );
 7490 %}
 7491 
 7492 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7493   predicate(UseAVX <= 2 &&
 7494             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7495             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7496   match(Set dst (VectorCastI2X src));
 7497   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7498   effect(TEMP dst, TEMP vtmp);
 7499   ins_encode %{
 7500     assert(UseAVX > 0, "required");
 7501 
 7502     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7503     int vlen_enc = vector_length_encoding(this, $src);
 7504 
 7505     if (to_elem_bt == T_BYTE) {
 7506       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7507       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7508       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7509       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7510     } else {
 7511       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7512       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7513       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7514       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7515     }
 7516   %}
 7517   ins_pipe( pipe_slow );
 7518 %}
 7519 
 7520 instruct vcastItoX_evex(vec dst, vec src) %{
 7521   predicate(UseAVX > 2 ||
 7522             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7523   match(Set dst (VectorCastI2X src));
 7524   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7525   ins_encode %{
 7526     assert(UseAVX > 0, "required");
 7527 
 7528     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7529     int src_vlen_enc = vector_length_encoding(this, $src);
 7530     int dst_vlen_enc = vector_length_encoding(this);
 7531     switch (dst_elem_bt) {
 7532       case T_BYTE:
 7533         if (!VM_Version::supports_avx512vl()) {
 7534           src_vlen_enc = Assembler::AVX_512bit;
 7535         }
 7536         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7537         break;
 7538       case T_SHORT:
 7539         if (!VM_Version::supports_avx512vl()) {
 7540           src_vlen_enc = Assembler::AVX_512bit;
 7541         }
 7542         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7543         break;
 7544       case T_FLOAT:
 7545         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7546         break;
 7547       case T_LONG:
 7548         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7549         break;
 7550       case T_DOUBLE:
 7551         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7552         break;
 7553       default:
 7554         ShouldNotReachHere();
 7555     }
 7556   %}
 7557   ins_pipe( pipe_slow );
 7558 %}
 7559 
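// Long->byte/short casts without AVX-512 (sketch): the low dword of each long is
// gathered into the low half of the register (vpshufd for 128-bit sources,
// vpermilps + vpermpd for 256-bit ones), and the same masking/packing pattern as
// the int casts above is then applied.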
 7560 instruct vcastLtoBS(vec dst, vec src) %{
 7561   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7562             UseAVX <= 2);
 7563   match(Set dst (VectorCastL2X src));
 7564   format %{ "vector_cast_l2x  $dst,$src" %}
 7565   ins_encode %{
 7566     assert(UseAVX > 0, "required");
 7567 
 7568     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7569     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7570     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7571                                                       : ExternalAddress(vector_int_to_short_mask());
 7572     if (vlen <= 16) {
 7573       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7574       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7575       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7576     } else {
 7577       assert(vlen <= 32, "required");
 7578       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7579       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7580       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7581       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7582     }
 7583     if (to_elem_bt == T_BYTE) {
 7584       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7585     }
 7586   %}
 7587   ins_pipe( pipe_slow );
 7588 %}
 7589 
 7590 instruct vcastLtoX_evex(vec dst, vec src) %{
 7591   predicate(UseAVX > 2 ||
 7592             (Matcher::vector_element_basic_type(n) == T_INT ||
 7593              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7594              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7595   match(Set dst (VectorCastL2X src));
 7596   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7597   ins_encode %{
 7598     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7599     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7600     int vlen_enc = vector_length_encoding(this, $src);
 7601     switch (to_elem_bt) {
 7602       case T_BYTE:
 7603         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7604           vlen_enc = Assembler::AVX_512bit;
 7605         }
 7606         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7607         break;
 7608       case T_SHORT:
 7609         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7610           vlen_enc = Assembler::AVX_512bit;
 7611         }
 7612         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7613         break;
 7614       case T_INT:
 7615         if (vlen == 8) {
 7616           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7617             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7618           }
 7619         } else if (vlen == 16) {
 7620           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7621         } else if (vlen == 32) {
 7622           if (UseAVX > 2) {
 7623             if (!VM_Version::supports_avx512vl()) {
 7624               vlen_enc = Assembler::AVX_512bit;
 7625             }
 7626             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7627           } else {
 7628             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7629             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7630           }
 7631         } else { // vlen == 64
 7632           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7633         }
 7634         break;
 7635       case T_FLOAT:
 7636         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7637         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7638         break;
 7639       case T_DOUBLE:
 7640         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7641         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7642         break;
 7643 
 7644       default: assert(false, "%s", type2name(to_elem_bt));
 7645     }
 7646   %}
 7647   ins_pipe( pipe_slow );
 7648 %}
 7649 
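// Float->X casts. F->D is a plain VCVTPS2PD. F->integral casts go through the
// vector_castF2X/vector_castF2L macroassembler helpers because Java semantics
// require NaN to map to 0 and out-of-range values to saturate at the target type's
// MIN/MAX; the signflip constants passed below are (roughly) what those helpers use
// to detect the lanes that need fixing up.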
 7650 instruct vcastFtoD_reg(vec dst, vec src) %{
 7651   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7652   match(Set dst (VectorCastF2X src));
 7653   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7654   ins_encode %{
 7655     int vlen_enc = vector_length_encoding(this);
 7656     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7657   %}
 7658   ins_pipe( pipe_slow );
 7659 %}
 7660 
 7661 
 7662 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7663   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7664             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7665   match(Set dst (VectorCastF2X src));
 7666   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7667   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7668   ins_encode %{
 7669     int vlen_enc = vector_length_encoding(this, $src);
 7670     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7671     // JDK-8292878 removed the need for an explicit scratch register to load addresses
 7672     // that do not fit in 32 bits for register-indirect addressing: stub constants live
 7673     // in the code cache, and ReservedCodeCacheSize is currently capped at 2G.
 7674     // Targets are free to raise that limit, but a code cache larger than 2G is
 7675     // unreasonable in practice; on the flip side, the cap saves us a temporary register
 7676     // allocation, which in the limiting case can prevent spilling in blocks with high
 7677     // register pressure.
 7678     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7679                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7680                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7681   %}
 7682   ins_pipe( pipe_slow );
 7683 %}
 7684 
 7685 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7686   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7687             is_integral_type(Matcher::vector_element_basic_type(n)));
 7688   match(Set dst (VectorCastF2X src));
 7689   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7690   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7691   ins_encode %{
 7692     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7693     if (to_elem_bt == T_LONG) {
 7694       int vlen_enc = vector_length_encoding(this);
 7695       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7696                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7697                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7698     } else {
 7699       int vlen_enc = vector_length_encoding(this, $src);
 7700       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7701                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7702                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7703     }
 7704   %}
 7705   ins_pipe( pipe_slow );
 7706 %}
 7707 
 7708 instruct vcastDtoF_reg(vec dst, vec src) %{
 7709   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7710   match(Set dst (VectorCastD2X src));
 7711   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7712   ins_encode %{
 7713     int vlen_enc = vector_length_encoding(this, $src);
 7714     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7715   %}
 7716   ins_pipe( pipe_slow );
 7717 %}
 7718 
 7719 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7720   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7721             is_integral_type(Matcher::vector_element_basic_type(n)));
 7722   match(Set dst (VectorCastD2X src));
 7723   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7724   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7725   ins_encode %{
 7726     int vlen_enc = vector_length_encoding(this, $src);
 7727     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7728     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7729                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7730                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7731   %}
 7732   ins_pipe( pipe_slow );
 7733 %}
 7734 
 7735 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7736   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7737             is_integral_type(Matcher::vector_element_basic_type(n)));
 7738   match(Set dst (VectorCastD2X src));
 7739   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7740   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7741   ins_encode %{
 7742     int vlen_enc = vector_length_encoding(this, $src);
 7743     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7744     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7745                               ExternalAddress(vector_float_signflip());
 7746     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7747                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7748   %}
 7749   ins_pipe( pipe_slow );
 7750 %}
 7751 
 7752 instruct vucast(vec dst, vec src) %{
 7753   match(Set dst (VectorUCastB2X src));
 7754   match(Set dst (VectorUCastS2X src));
 7755   match(Set dst (VectorUCastI2X src));
 7756   format %{ "vector_ucast $dst,$src\t!" %}
 7757   ins_encode %{
 7758     assert(UseAVX > 0, "required");
 7759 
 7760     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7761     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7762     int vlen_enc = vector_length_encoding(this);
 7763     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7764   %}
 7765   ins_pipe( pipe_slow );
 7766 %}
 7767 
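// RoundVF/RoundVD implement Math.round() semantics, i.e. round(x) == floor(x + 0.5).
// The constant passed as new_mxcsr below (0x3F80, or 0x3FBF with EnableX86ECoreOpts)
// has the MXCSR rounding-control bits set to round-toward-negative-infinity, so the
// helpers can (roughly) realize the floor by adding 0.5 and converting in that mode.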
 7768 #ifdef _LP64
 7769 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7770   predicate(!VM_Version::supports_avx512vl() &&
 7771             Matcher::vector_length_in_bytes(n) < 64 &&
 7772             Matcher::vector_element_basic_type(n) == T_INT);
 7773   match(Set dst (RoundVF src));
 7774   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7775   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7776   ins_encode %{
 7777     int vlen_enc = vector_length_encoding(this);
 7778     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7779     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7780                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7781                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7782   %}
 7783   ins_pipe( pipe_slow );
 7784 %}
 7785 
 7786 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7787   predicate((VM_Version::supports_avx512vl() ||
 7788              Matcher::vector_length_in_bytes(n) == 64) &&
 7789              Matcher::vector_element_basic_type(n) == T_INT);
 7790   match(Set dst (RoundVF src));
 7791   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7792   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7793   ins_encode %{
 7794     int vlen_enc = vector_length_encoding(this);
 7795     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7796     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7797                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7798                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7799   %}
 7800   ins_pipe( pipe_slow );
 7801 %}
 7802 
 7803 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7804   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7805   match(Set dst (RoundVD src));
 7806   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7807   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7808   ins_encode %{
 7809     int vlen_enc = vector_length_encoding(this);
 7810     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7811     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7812                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7813                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7814   %}
 7815   ins_pipe( pipe_slow );
 7816 %}
 7817 
 7818 #endif // _LP64
 7819 
 7820 // --------------------------------- VectorMaskCmp --------------------------------------
 7821 
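// Vector compares come in two flavors: when the mask is materialized as a normal
// vector (isa_vectmask() == nullptr) every lane of $dst is set to all-ones or
// all-zeroes, while the AVX-512 vectmask forms write a kReg directly. Unsigned
// integer compares on pre-AVX-512 hardware are done by flipping the sign bit of
// both operands and then using the signed compare (see vcmpu below).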
 7822 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7823   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7824             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7825             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7826             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7827   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7828   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7829   ins_encode %{
 7830     int vlen_enc = vector_length_encoding(this, $src1);
 7831     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7832     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7833       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7834     } else {
 7835       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7836     }
 7837   %}
 7838   ins_pipe( pipe_slow );
 7839 %}
 7840 
 7841 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7842   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7843             n->bottom_type()->isa_vectmask() == nullptr &&
 7844             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7845   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7846   effect(TEMP ktmp);
 7847   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7848   ins_encode %{
 7849     int vlen_enc = Assembler::AVX_512bit;
 7850     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7851     KRegister mask = k0; // The comparison itself is not being masked.
 7852     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7853       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7854       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7855     } else {
 7856       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7857       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7858     }
 7859   %}
 7860   ins_pipe( pipe_slow );
 7861 %}
 7862 
 7863 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7864   predicate(n->bottom_type()->isa_vectmask() &&
 7865             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7866   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7867   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7868   ins_encode %{
 7869     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7870     int vlen_enc = vector_length_encoding(this, $src1);
 7871     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7872     KRegister mask = k0; // The comparison itself is not being masked.
 7873     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7874       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7875     } else {
 7876       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7877     }
 7878   %}
 7879   ins_pipe( pipe_slow );
 7880 %}
 7881 
 7882 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7883   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7884             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7885             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7886             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7887             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7888             (n->in(2)->get_int() == BoolTest::eq ||
 7889              n->in(2)->get_int() == BoolTest::lt ||
 7890              n->in(2)->get_int() == BoolTest::gt)); // cond
 7891   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7892   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7893   ins_encode %{
 7894     int vlen_enc = vector_length_encoding(this, $src1);
 7895     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7896     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7897     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7898   %}
 7899   ins_pipe( pipe_slow );
 7900 %}
 7901 
 7902 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7903   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7904             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7905             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7906             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7907             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7908             (n->in(2)->get_int() == BoolTest::ne ||
 7909              n->in(2)->get_int() == BoolTest::le ||
 7910              n->in(2)->get_int() == BoolTest::ge)); // cond
 7911   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7912   effect(TEMP dst, TEMP xtmp);
 7913   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7914   ins_encode %{
 7915     int vlen_enc = vector_length_encoding(this, $src1);
 7916     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7917     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7918     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7919   %}
 7920   ins_pipe( pipe_slow );
 7921 %}
 7922 
 7923 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7924   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7925             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7926             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7927             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7928             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7929   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7930   effect(TEMP dst, TEMP xtmp);
 7931   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7932   ins_encode %{
 7933     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7934     int vlen_enc = vector_length_encoding(this, $src1);
 7935     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7936     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7937 
 7938     if (vlen_enc == Assembler::AVX_128bit) {
 7939       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7940     } else {
 7941       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7942     }
 7943     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7944     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7945     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7946   %}
 7947   ins_pipe( pipe_slow );
 7948 %}
 7949 
 7950 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7951   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7952              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7953              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7954   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7955   effect(TEMP ktmp);
 7956   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7957   ins_encode %{
 7958     assert(UseAVX > 2, "required");
 7959 
 7960     int vlen_enc = vector_length_encoding(this, $src1);
 7961     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7962     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7963     KRegister mask = k0; // The comparison itself is not being masked.
 7964     bool merge = false;
 7965     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7966 
 7967     switch (src1_elem_bt) {
 7968       case T_INT: {
 7969         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7970         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7971         break;
 7972       }
 7973       case T_LONG: {
 7974         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7975         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7976         break;
 7977       }
 7978       default: assert(false, "%s", type2name(src1_elem_bt));
 7979     }
 7980   %}
 7981   ins_pipe( pipe_slow );
 7982 %}
 7983 
 7984 
 7985 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7986   predicate(n->bottom_type()->isa_vectmask() &&
 7987             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7988   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7989   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7990   ins_encode %{
 7991     assert(UseAVX > 2, "required");
 7992     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7993 
 7994     int vlen_enc = vector_length_encoding(this, $src1);
 7995     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7996     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7997     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7998 
 7999     // Compare src1 and src2 lane-wise and set one result bit per lane in the mask register.
 8000     switch (src1_elem_bt) {
 8001       case T_BYTE: {
 8002         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8003         break;
 8004       }
 8005       case T_SHORT: {
 8006         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8007         break;
 8008       }
 8009       case T_INT: {
 8010         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8011         break;
 8012       }
 8013       case T_LONG: {
 8014         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8015         break;
 8016       }
 8017       default: assert(false, "%s", type2name(src1_elem_bt));
 8018     }
 8019   %}
 8020   ins_pipe( pipe_slow );
 8021 %}
 8022 
 8023 // Extract
 8024 
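// Element extraction: for sources of up to 128 bits the element is read directly
// with get_elem(); for 256/512-bit sources the 128-bit lane holding the element is
// first copied into a temporary with get_lane(), and the element is then read from
// that lane.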
 8025 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8026   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8027   match(Set dst (ExtractI src idx));
 8028   match(Set dst (ExtractS src idx));
 8029 #ifdef _LP64
 8030   match(Set dst (ExtractB src idx));
 8031 #endif
 8032   format %{ "extractI $dst,$src,$idx\t!" %}
 8033   ins_encode %{
 8034     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8035 
 8036     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8037     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8038   %}
 8039   ins_pipe( pipe_slow );
 8040 %}
 8041 
 8042 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8043   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8044             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8045   match(Set dst (ExtractI src idx));
 8046   match(Set dst (ExtractS src idx));
 8047 #ifdef _LP64
 8048   match(Set dst (ExtractB src idx));
 8049 #endif
 8050   effect(TEMP vtmp);
 8051   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8052   ins_encode %{
 8053     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8054 
 8055     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8056     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8057     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8058   %}
 8059   ins_pipe( pipe_slow );
 8060 %}
 8061 
 8062 #ifdef _LP64
 8063 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8064   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8065   match(Set dst (ExtractL src idx));
 8066   format %{ "extractL $dst,$src,$idx\t!" %}
 8067   ins_encode %{
 8068     assert(UseSSE >= 4, "required");
 8069     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8070 
 8071     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8072   %}
 8073   ins_pipe( pipe_slow );
 8074 %}
 8075 
 8076 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8077   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8078             Matcher::vector_length(n->in(1)) == 8);  // src
 8079   match(Set dst (ExtractL src idx));
 8080   effect(TEMP vtmp);
 8081   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8082   ins_encode %{
 8083     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8084 
 8085     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8086     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8087   %}
 8088   ins_pipe( pipe_slow );
 8089 %}
 8090 #endif
 8091 
 8092 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8093   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8094   match(Set dst (ExtractF src idx));
 8095   effect(TEMP dst, TEMP vtmp);
 8096   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8097   ins_encode %{
 8098     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8099 
 8100     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8101   %}
 8102   ins_pipe( pipe_slow );
 8103 %}
 8104 
 8105 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8106   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8107             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8108   match(Set dst (ExtractF src idx));
 8109   effect(TEMP vtmp);
 8110   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8111   ins_encode %{
 8112     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8113 
 8114     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8115     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8116   %}
 8117   ins_pipe( pipe_slow );
 8118 %}
 8119 
 8120 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8121   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8122   match(Set dst (ExtractD src idx));
 8123   format %{ "extractD $dst,$src,$idx\t!" %}
 8124   ins_encode %{
 8125     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8126 
 8127     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8128   %}
 8129   ins_pipe( pipe_slow );
 8130 %}
 8131 
 8132 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8133   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8134             Matcher::vector_length(n->in(1)) == 8);  // src
 8135   match(Set dst (ExtractD src idx));
 8136   effect(TEMP vtmp);
 8137   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8138   ins_encode %{
 8139     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8140 
 8141     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8142     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8143   %}
 8144   ins_pipe( pipe_slow );
 8145 %}
 8146 
 8147 // --------------------------------- Vector Blend --------------------------------------
 8148 
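// VectorBlend selects src1 or src2 per lane under a mask. The SSE4.1 form relies on
// PBLENDVB's implicit xmm0 mask operand (hence the rxmm0 temp), the AVX forms use
// vpblendvb/vblendvps, the EnableX86ECoreOpts form avoids the blend instruction with
// an andn/and/or sequence, and the 512-bit forms turn the vector mask into a kReg
// (or use one directly) for evpblend.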
 8149 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8150   predicate(UseAVX == 0);
 8151   match(Set dst (VectorBlend (Binary dst src) mask));
 8152   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8153   effect(TEMP tmp);
 8154   ins_encode %{
 8155     assert(UseSSE >= 4, "required");
 8156 
 8157     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8158       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8159     }
 8160     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8161   %}
 8162   ins_pipe( pipe_slow );
 8163 %}
 8164 
 8165 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8166   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8167             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8168             Matcher::vector_length_in_bytes(n) <= 32 &&
 8169             is_integral_type(Matcher::vector_element_basic_type(n)));
 8170   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8171   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8172   ins_encode %{
 8173     int vlen_enc = vector_length_encoding(this);
 8174     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8175   %}
 8176   ins_pipe( pipe_slow );
 8177 %}
 8178 
 8179 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8180   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8181             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8182             Matcher::vector_length_in_bytes(n) <= 32 &&
 8183             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8184   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8185   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8186   ins_encode %{
 8187     int vlen_enc = vector_length_encoding(this);
 8188     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8189   %}
 8190   ins_pipe( pipe_slow );
 8191 %}
 8192 
 8193 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8194   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8195             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8196             Matcher::vector_length_in_bytes(n) <= 32);
 8197   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8198   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8199   effect(TEMP vtmp, TEMP dst);
 8200   ins_encode %{
 8201     int vlen_enc = vector_length_encoding(this);
 8202     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8203     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8204     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8205   %}
 8206   ins_pipe( pipe_slow );
 8207 %}
 8208 
 8209 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8210   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8211             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8212   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8213   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8214   effect(TEMP ktmp);
 8215   ins_encode %{
 8216     int vlen_enc = Assembler::AVX_512bit;
 8217     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8218     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8219     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8220   %}
 8221   ins_pipe( pipe_slow );
 8222 %}
 8223 
 8224 
 8225 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8226   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8227             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8228              VM_Version::supports_avx512bw()));
 8229   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8230   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8231   ins_encode %{
 8232     int vlen_enc = vector_length_encoding(this);
 8233     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8234     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8235   %}
 8236   ins_pipe( pipe_slow );
 8237 %}
 8238 
 8239 // --------------------------------- ABS --------------------------------------
 8240 // a = |a|
 8241 instruct vabsB_reg(vec dst, vec src) %{
 8242   match(Set dst (AbsVB  src));
 8243   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8244   ins_encode %{
 8245     uint vlen = Matcher::vector_length(this);
 8246     if (vlen <= 16) {
 8247       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8248     } else {
 8249       int vlen_enc = vector_length_encoding(this);
 8250       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8251     }
 8252   %}
 8253   ins_pipe( pipe_slow );
 8254 %}
 8255 
 8256 instruct vabsS_reg(vec dst, vec src) %{
 8257   match(Set dst (AbsVS  src));
 8258   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8259   ins_encode %{
 8260     uint vlen = Matcher::vector_length(this);
 8261     if (vlen <= 8) {
 8262       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8263     } else {
 8264       int vlen_enc = vector_length_encoding(this);
 8265       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8266     }
 8267   %}
 8268   ins_pipe( pipe_slow );
 8269 %}
 8270 
 8271 instruct vabsI_reg(vec dst, vec src) %{
 8272   match(Set dst (AbsVI  src));
 8273   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8274   ins_encode %{
 8275     uint vlen = Matcher::vector_length(this);
 8276     if (vlen <= 4) {
 8277       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8278     } else {
 8279       int vlen_enc = vector_length_encoding(this);
 8280       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8281     }
 8282   %}
 8283   ins_pipe( pipe_slow );
 8284 %}
 8285 
 8286 instruct vabsL_reg(vec dst, vec src) %{
 8287   match(Set dst (AbsVL  src));
 8288   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8289   ins_encode %{
 8290     assert(UseAVX > 2, "required");
 8291     int vlen_enc = vector_length_encoding(this);
 8292     if (!VM_Version::supports_avx512vl()) {
 8293       vlen_enc = Assembler::AVX_512bit;
 8294     }
 8295     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8296   %}
 8297   ins_pipe( pipe_slow );
 8298 %}
 8299 
 8300 // --------------------------------- ABSNEG --------------------------------------
 8301 
 8302 instruct vabsnegF(vec dst, vec src) %{
 8303   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8304   match(Set dst (AbsVF src));
 8305   match(Set dst (NegVF src));
 8306   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8307   ins_cost(150);
 8308   ins_encode %{
 8309     int opcode = this->ideal_Opcode();
 8310     int vlen = Matcher::vector_length(this);
 8311     if (vlen == 2) {
 8312       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8313     } else {
 8314       assert(vlen == 8 || vlen == 16, "required");
 8315       int vlen_enc = vector_length_encoding(this);
 8316       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8317     }
 8318   %}
 8319   ins_pipe( pipe_slow );
 8320 %}
 8321 
 8322 instruct vabsneg4F(vec dst) %{
 8323   predicate(Matcher::vector_length(n) == 4);
 8324   match(Set dst (AbsVF dst));
 8325   match(Set dst (NegVF dst));
 8326   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8327   ins_cost(150);
 8328   ins_encode %{
 8329     int opcode = this->ideal_Opcode();
 8330     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8331   %}
 8332   ins_pipe( pipe_slow );
 8333 %}
 8334 
 8335 instruct vabsnegD(vec dst, vec src) %{
 8336   match(Set dst (AbsVD  src));
 8337   match(Set dst (NegVD  src));
 8338   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8339   ins_encode %{
 8340     int opcode = this->ideal_Opcode();
 8341     uint vlen = Matcher::vector_length(this);
 8342     if (vlen == 2) {
 8343       assert(UseSSE >= 2, "required");
 8344       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8345     } else {
 8346       int vlen_enc = vector_length_encoding(this);
 8347       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8348     }
 8349   %}
 8350   ins_pipe( pipe_slow );
 8351 %}
 8352 
 8353 //------------------------------------- VectorTest --------------------------------------------
 8354 
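// VectorTest sets the condition flags for alltrue/anytrue checks on a vector or
// kReg mask. For kReg masks shorter than 8 bits (or exactly 8 bits without
// AVX512DQ) the mask is moved to a GPR and tested against (1 << masklen) - 1,
// e.g. for masklen == 4:
//   kmovwl tmp, src1
//   andl   tmp, 0xF      // keep the 4 defined mask bits
//   cmpl   tmp, 0xF      // ZF set <=> all lanes true
// Wider masks use kortest directly.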
 8355 #ifdef _LP64
 8356 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8357   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8358   match(Set cr (VectorTest src1 src2));
 8359   effect(TEMP vtmp);
 8360   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8361   ins_encode %{
 8362     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8363     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8364     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8365   %}
 8366   ins_pipe( pipe_slow );
 8367 %}
 8368 
 8369 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8370   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8371   match(Set cr (VectorTest src1 src2));
 8372   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8373   ins_encode %{
 8374     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8375     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8376     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8377   %}
 8378   ins_pipe( pipe_slow );
 8379 %}
 8380 
 8381 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8382   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8383              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8384             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8385   match(Set cr (VectorTest src1 src2));
 8386   effect(TEMP tmp);
 8387   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8388   ins_encode %{
 8389     uint masklen = Matcher::vector_length(this, $src1);
 8390     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8391     __ andl($tmp$$Register, (1 << masklen) - 1);
 8392     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8393   %}
 8394   ins_pipe( pipe_slow );
 8395 %}
 8396 
 8397 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8398   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8399              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8400             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8401   match(Set cr (VectorTest src1 src2));
 8402   effect(TEMP tmp);
 8403   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8404   ins_encode %{
 8405     uint masklen = Matcher::vector_length(this, $src1);
 8406     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8407     __ andl($tmp$$Register, (1 << masklen) - 1);
 8408   %}
 8409   ins_pipe( pipe_slow );
 8410 %}
 8411 
 8412 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8413   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8414             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8415   match(Set cr (VectorTest src1 src2));
 8416   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8417   ins_encode %{
 8418     uint masklen = Matcher::vector_length(this, $src1);
 8419     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8420   %}
 8421   ins_pipe( pipe_slow );
 8422 %}
 8423 #endif
 8424 
 8425 //------------------------------------- LoadMask --------------------------------------------
 8426 
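// VectorLoadMask turns a boolean vector (one 0/1 byte per lane) into a lane-wide
// mask: either a vector whose lanes are all-zeroes or all-ones, or, for vectmask
// types on AVX-512, a kReg with one bit per lane.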
 8427 instruct loadMask(legVec dst, legVec src) %{
 8428   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8429   match(Set dst (VectorLoadMask src));
 8430   effect(TEMP dst);
 8431   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8432   ins_encode %{
 8433     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8434     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8435     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8436   %}
 8437   ins_pipe( pipe_slow );
 8438 %}
 8439 
 8440 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8441   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8442   match(Set dst (VectorLoadMask src));
 8443   effect(TEMP xtmp);
 8444   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8445   ins_encode %{
 8446     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8447                         true, Assembler::AVX_512bit);
 8448   %}
 8449   ins_pipe( pipe_slow );
 8450 %}
 8451 
 8452 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8453   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8454   match(Set dst (VectorLoadMask src));
 8455   effect(TEMP xtmp);
 8456   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8457   ins_encode %{
 8458     int vlen_enc = vector_length_encoding(in(1));
 8459     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8460                         false, vlen_enc);
 8461   %}
 8462   ins_pipe( pipe_slow );
 8463 %}
 8464 
 8465 //------------------------------------- StoreMask --------------------------------------------
 8466 
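// VectorStoreMask is the inverse of VectorLoadMask: a lane-wide 0/all-ones mask is
// narrowed back to one 0/1 byte per lane. The emitted sequences shrink the lanes to
// byte width and use pabsb/vpabsb to map all-ones to 1, e.g. an int lane of
// 0xFFFFFFFF ends up as the byte value 1.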
 8467 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8468   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8469   match(Set dst (VectorStoreMask src size));
 8470   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8471   ins_encode %{
 8472     int vlen = Matcher::vector_length(this);
 8473     if (vlen <= 16 && UseAVX <= 2) {
 8474       assert(UseSSE >= 3, "required");
 8475       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8476     } else {
 8477       assert(UseAVX > 0, "required");
 8478       int src_vlen_enc = vector_length_encoding(this, $src);
 8479       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8480     }
 8481   %}
 8482   ins_pipe( pipe_slow );
 8483 %}
 8484 
 8485 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8486   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8487   match(Set dst (VectorStoreMask src size));
 8488   effect(TEMP_DEF dst, TEMP xtmp);
 8489   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8490   ins_encode %{
 8491     int vlen_enc = Assembler::AVX_128bit;
 8492     int vlen = Matcher::vector_length(this);
 8493     if (vlen <= 8) {
 8494       assert(UseSSE >= 3, "required");
 8495       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8496       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8497       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8498     } else {
 8499       assert(UseAVX > 0, "required");
 8500       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8501       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8502       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8503     }
 8504   %}
 8505   ins_pipe( pipe_slow );
 8506 %}
 8507 
 8508 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8509   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8510   match(Set dst (VectorStoreMask src size));
 8511   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8512   effect(TEMP_DEF dst, TEMP xtmp);
 8513   ins_encode %{
 8514     int vlen_enc = Assembler::AVX_128bit;
 8515     int vlen = Matcher::vector_length(this);
 8516     if (vlen <= 4) {
 8517       assert(UseSSE >= 3, "required");
 8518       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8519       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8520       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8521       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8522     } else {
 8523       assert(UseAVX > 0, "required");
 8524       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8525       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8526       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8527       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8528       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8529     }
 8530   %}
 8531   ins_pipe( pipe_slow );
 8532 %}
 8533 
 8534 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8535   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8536   match(Set dst (VectorStoreMask src size));
 8537   effect(TEMP_DEF dst, TEMP xtmp);
 8538   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8539   ins_encode %{
 8540     assert(UseSSE >= 3, "required");
 8541     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8542     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8543     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8544     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8545     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8546   %}
 8547   ins_pipe( pipe_slow );
 8548 %}
 8549 
 8550 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8551   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8552   match(Set dst (VectorStoreMask src size));
 8553   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8554   effect(TEMP_DEF dst, TEMP vtmp);
 8555   ins_encode %{
 8556     int vlen_enc = Assembler::AVX_128bit;
 8557     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8558     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8559     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8560     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8561     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8562     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8563     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8564   %}
 8565   ins_pipe( pipe_slow );
 8566 %}
 8567 
 8568 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8569   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8570   match(Set dst (VectorStoreMask src size));
 8571   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8572   ins_encode %{
 8573     int src_vlen_enc = vector_length_encoding(this, $src);
 8574     int dst_vlen_enc = vector_length_encoding(this);
 8575     if (!VM_Version::supports_avx512vl()) {
 8576       src_vlen_enc = Assembler::AVX_512bit;
 8577     }
 8578     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8579     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8580   %}
 8581   ins_pipe( pipe_slow );
 8582 %}
 8583 
 8584 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8585   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8586   match(Set dst (VectorStoreMask src size));
 8587   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8588   ins_encode %{
 8589     int src_vlen_enc = vector_length_encoding(this, $src);
 8590     int dst_vlen_enc = vector_length_encoding(this);
 8591     if (!VM_Version::supports_avx512vl()) {
 8592       src_vlen_enc = Assembler::AVX_512bit;
 8593     }
 8594     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8595     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8596   %}
 8597   ins_pipe( pipe_slow );
 8598 %}
 8599 
 8600 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8601   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8602   match(Set dst (VectorStoreMask mask size));
 8603   effect(TEMP_DEF dst);
 8604   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8605   ins_encode %{
 8606     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8607     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8608                  false, Assembler::AVX_512bit, noreg);
 8609     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8610   %}
 8611   ins_pipe( pipe_slow );
 8612 %}
 8613 
 8614 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8615   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8616   match(Set dst (VectorStoreMask mask size));
 8617   effect(TEMP_DEF dst);
 8618   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8619   ins_encode %{
 8620     int dst_vlen_enc = vector_length_encoding(this);
 8621     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8622     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8623   %}
 8624   ins_pipe( pipe_slow );
 8625 %}
 8626 
 8627 instruct vmaskcast_evex(kReg dst) %{
 8628   match(Set dst (VectorMaskCast dst));
 8629   ins_cost(0);
 8630   format %{ "vector_mask_cast $dst" %}
 8631   ins_encode %{
 8632     // empty
 8633   %}
 8634   ins_pipe(empty);
 8635 %}
 8636 
 8637 instruct vmaskcast(vec dst) %{
 8638   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8639   match(Set dst (VectorMaskCast dst));
 8640   ins_cost(0);
 8641   format %{ "vector_mask_cast $dst" %}
 8642   ins_encode %{
 8643     // empty
 8644   %}
 8645   ins_pipe(empty);
 8646 %}
 8647 
 8648 instruct vmaskcast_avx(vec dst, vec src) %{
 8649   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8650   match(Set dst (VectorMaskCast src));
 8651   format %{ "vector_mask_cast $dst, $src" %}
 8652   ins_encode %{
 8653     int vlen = Matcher::vector_length(this);
 8654     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8655     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8656     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8657   %}
 8658   ins_pipe(pipe_slow);
 8659 %}
 8660 
 8661 //-------------------------------- Load Iota Indices ----------------------------------
 8662 
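// The iota constant is simply the sequence 0, 1, 2, ..., n-1, one entry per
// lane, materialized from the constant table (CONSTANT_MEMORY) by
// load_iota_indices. It serves as the identity permutation for the
// shuffle/rearrange rules below and as the base vector for PopulateIndex.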
 8663 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8664   match(Set dst (VectorLoadConst src));
 8665   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8666   ins_encode %{
 8667      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8668      BasicType bt = Matcher::vector_element_basic_type(this);
 8669      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8670   %}
 8671   ins_pipe( pipe_slow );
 8672 %}
 8673 
 8674 #ifdef _LP64
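// PopulateIndex produces dst[i] = src1 + i * src2; the rules below only accept
// a stride (src2) of one, so the emitted sequence is simply: broadcast src1,
// load the iota sequence, and add. For example, src1 == 7 yields lanes
// 7, 8, 9, ...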
 8675 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8676   match(Set dst (PopulateIndex src1 src2));
 8677   effect(TEMP dst, TEMP vtmp);
 8678   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8679   ins_encode %{
 8680      assert($src2$$constant == 1, "required");
 8681      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8682      int vlen_enc = vector_length_encoding(this);
 8683      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8684      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8685      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8686      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8687   %}
 8688   ins_pipe( pipe_slow );
 8689 %}
 8690 
 8691 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8692   match(Set dst (PopulateIndex src1 src2));
 8693   effect(TEMP dst, TEMP vtmp);
 8694   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8695   ins_encode %{
 8696      assert($src2$$constant == 1, "required");
 8697      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8698      int vlen_enc = vector_length_encoding(this);
 8699      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8700      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8701      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8702      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8703   %}
 8704   ins_pipe( pipe_slow );
 8705 %}
 8706 #endif
 8707 //-------------------------------- Rearrange ----------------------------------
 8708 
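// A rough sketch of the semantics implemented in this section (per lane i):
//
//   shuffle = VectorLoadShuffle(indices);   // adapt indices to the element size
//   dst[i]  = src[shuffle[i]];              // VectorRearrange
//
// VectorLoadShuffle rewrites the incoming byte-sized indices into whatever
// index layout the underlying shuffle/permute instruction expects for the
// element type, and VectorRearrange performs the actual permutation.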
 8709 // LoadShuffle/Rearrange for Byte
 8710 
 8711 instruct loadShuffleB(vec dst) %{
 8712   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8713   match(Set dst (VectorLoadShuffle dst));
 8714   format %{ "vector_load_shuffle $dst, $dst" %}
 8715   ins_encode %{
 8716     // empty
 8717   %}
 8718   ins_pipe( pipe_slow );
 8719 %}
 8720 
 8721 instruct rearrangeB(vec dst, vec shuffle) %{
 8722   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8723             Matcher::vector_length(n) < 32);
 8724   match(Set dst (VectorRearrange dst shuffle));
 8725   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8726   ins_encode %{
 8727     assert(UseSSE >= 4, "required");
 8728     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8729   %}
 8730   ins_pipe( pipe_slow );
 8731 %}
 8732 
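// vpshufb with a 256-bit operand shuffles each 128-bit lane independently, so a
// full 32-byte rearrange without AVX512-VBMI needs the lane-swap + two-shuffle
// + blend sequence below: entries whose index points into the other lane are
// selected from the swapped copy by vpblendvb.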
 8733 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8734   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8735             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8736   match(Set dst (VectorRearrange src shuffle));
 8737   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8738   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8739   ins_encode %{
 8740     assert(UseAVX >= 2, "required");
 8741     // Swap src into vtmp1
 8742     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8743     // Shuffle the swapped src to get entries from the other 128-bit lane
 8744     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8745     // Shuffle the original src to get entries from its own 128-bit lane
 8746     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8747     // Create a blend mask by setting the high bit for entries that come from the other lane in the shuffle
 8748     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8749     // Perform the blend
 8750     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8751   %}
 8752   ins_pipe( pipe_slow );
 8753 %}
 8754 
 8755 
 8756 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8757   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8758             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8759   match(Set dst (VectorRearrange src shuffle));
 8760   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8761   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8762   ins_encode %{
 8763     int vlen_enc = vector_length_encoding(this);
 8764     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8765                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8766                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8767   %}
 8768   ins_pipe( pipe_slow );
 8769 %}
 8770 
 8771 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8772   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8773             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8774   match(Set dst (VectorRearrange src shuffle));
 8775   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8776   ins_encode %{
 8777     int vlen_enc = vector_length_encoding(this);
 8778     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8779   %}
 8780   ins_pipe( pipe_slow );
 8781 %}
 8782 
 8783 // LoadShuffle/Rearrange for Short
 8784 
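// Without AVX512BW there is no variable 16-bit shuffle, so a short shuffle is
// rewritten as a byte shuffle: each short index k is expanded to the byte pair
// (2k, 2k+1); for example, short index 3 becomes byte indices 6 and 7. The
// final paddb/vpaddb presumably adds the alternating 0,1 pattern held in
// vector_short_shufflemask to produce the "+1" half of each pair.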
 8785 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8786   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8787             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8788   match(Set dst (VectorLoadShuffle src));
 8789   effect(TEMP dst, TEMP vtmp);
 8790   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8791   ins_encode %{
 8792     // Create a byte shuffle mask from the short shuffle mask;
 8793     // only a byte shuffle instruction is available on these platforms
 8794     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8795     if (UseAVX == 0) {
 8796       assert(vlen_in_bytes <= 16, "required");
 8797       // Multiply each shuffle by two to get byte index
 8798       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8799       __ psllw($vtmp$$XMMRegister, 1);
 8800 
 8801       // Duplicate to create 2 copies of byte index
 8802       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8803       __ psllw($dst$$XMMRegister, 8);
 8804       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8805 
 8806       // Add one to get alternate byte index
 8807       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8808       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8809     } else {
 8810       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8811       int vlen_enc = vector_length_encoding(this);
 8812       // Multiply each shuffle by two to get byte index
 8813       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8814       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8815 
 8816       // Duplicate to create 2 copies of byte index
 8817       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8818       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8819 
 8820       // Add one to get alternate byte index
 8821       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8822     }
 8823   %}
 8824   ins_pipe( pipe_slow );
 8825 %}
 8826 
 8827 instruct rearrangeS(vec dst, vec shuffle) %{
 8828   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8829             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8830   match(Set dst (VectorRearrange dst shuffle));
 8831   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8832   ins_encode %{
 8833     assert(UseSSE >= 4, "required");
 8834     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8835   %}
 8836   ins_pipe( pipe_slow );
 8837 %}
 8838 
 8839 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8840   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8841             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8842   match(Set dst (VectorRearrange src shuffle));
 8843   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8844   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8845   ins_encode %{
 8846     assert(UseAVX >= 2, "required");
 8847     // Swap src into vtmp1
 8848     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8849     // Shuffle the swapped src to get entries from the other 128-bit lane
 8850     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8851     // Shuffle the original src to get entries from its own 128-bit lane
 8852     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8853     // Create a blend mask by setting the high bit for entries that come from the other lane in the shuffle
 8854     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8855     // Perform the blend
 8856     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8857   %}
 8858   ins_pipe( pipe_slow );
 8859 %}
 8860 
 8861 instruct loadShuffleS_evex(vec dst, vec src) %{
 8862   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8863             VM_Version::supports_avx512bw());
 8864   match(Set dst (VectorLoadShuffle src));
 8865   format %{ "vector_load_shuffle $dst, $src" %}
 8866   ins_encode %{
 8867     int vlen_enc = vector_length_encoding(this);
 8868     if (!VM_Version::supports_avx512vl()) {
 8869       vlen_enc = Assembler::AVX_512bit;
 8870     }
 8871     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8872   %}
 8873   ins_pipe( pipe_slow );
 8874 %}
 8875 
 8876 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8877   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8878             VM_Version::supports_avx512bw());
 8879   match(Set dst (VectorRearrange src shuffle));
 8880   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8881   ins_encode %{
 8882     int vlen_enc = vector_length_encoding(this);
 8883     if (!VM_Version::supports_avx512vl()) {
 8884       vlen_enc = Assembler::AVX_512bit;
 8885     }
 8886     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8887   %}
 8888   ins_pipe( pipe_slow );
 8889 %}
 8890 
 8891 // LoadShuffle/Rearrange for Integer and Float
 8892 
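// As with shorts, pre-AVX targets only have a byte shuffle, so an int/float
// shuffle index k is expanded to the four byte indices 4k .. 4k+3 (e.g. index 2
// becomes 8, 9, 10, 11); vector_int_shufflemask presumably supplies the
// repeating 0,1,2,3 offsets added in the last step. With AVX, vpmovzxbd plus a
// dword permute is used instead.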
 8893 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8894   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8895             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8896   match(Set dst (VectorLoadShuffle src));
 8897   effect(TEMP dst, TEMP vtmp);
 8898   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8899   ins_encode %{
 8900     assert(UseSSE >= 4, "required");
 8901 
 8902     // Create a byte shuffle mask from the int shuffle mask;
 8903     // only a byte shuffle instruction is available on these platforms
 8904 
 8905     // Duplicate and multiply each shuffle by 4
 8906     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8907     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8908     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8909     __ psllw($vtmp$$XMMRegister, 2);
 8910 
 8911     // Duplicate again to create 4 copies of byte index
 8912     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8913     __ psllw($dst$$XMMRegister, 8);
 8914     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8915 
 8916     // Add 3,2,1,0 to get alternate byte index
 8917     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8918     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8919   %}
 8920   ins_pipe( pipe_slow );
 8921 %}
 8922 
 8923 instruct rearrangeI(vec dst, vec shuffle) %{
 8924   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8925             UseAVX == 0);
 8926   match(Set dst (VectorRearrange dst shuffle));
 8927   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8928   ins_encode %{
 8929     assert(UseSSE >= 4, "required");
 8930     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8931   %}
 8932   ins_pipe( pipe_slow );
 8933 %}
 8934 
 8935 instruct loadShuffleI_avx(vec dst, vec src) %{
 8936   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8937             UseAVX > 0);
 8938   match(Set dst (VectorLoadShuffle src));
 8939   format %{ "vector_load_shuffle $dst, $src" %}
 8940   ins_encode %{
 8941     int vlen_enc = vector_length_encoding(this);
 8942     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8943   %}
 8944   ins_pipe( pipe_slow );
 8945 %}
 8946 
 8947 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8948   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8949             UseAVX > 0);
 8950   match(Set dst (VectorRearrange src shuffle));
 8951   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8952   ins_encode %{
 8953     int vlen_enc = vector_length_encoding(this);
 8954     BasicType bt = Matcher::vector_element_basic_type(this);
 8955     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8956   %}
 8957   ins_pipe( pipe_slow );
 8958 %}
 8959 
 8960 // LoadShuffle/Rearrange for Long and Double
 8961 
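// Without AVX512VL there is no sub-512-bit variable qword permute, so a
// long/double shuffle index k is expanded to the dword pair (2k, 2k+1) and
// vpermd is used; the EVEX forms below use vpermq directly.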
 8962 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8963   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8964             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8965   match(Set dst (VectorLoadShuffle src));
 8966   effect(TEMP dst, TEMP vtmp);
 8967   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8968   ins_encode %{
 8969     assert(UseAVX >= 2, "required");
 8970 
 8971     int vlen_enc = vector_length_encoding(this);
 8972     // Create a double word shuffle mask from the long shuffle mask;
 8973     // only a double word shuffle instruction is available on these platforms
 8974 
 8975     // Multiply each shuffle by two to get double word index
 8976     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8977     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8978 
 8979     // Duplicate each double word shuffle
 8980     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8981     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8982 
 8983     // Add one to get alternate double word index
 8984     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8985   %}
 8986   ins_pipe( pipe_slow );
 8987 %}
 8988 
 8989 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8990   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8991             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8992   match(Set dst (VectorRearrange src shuffle));
 8993   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8994   ins_encode %{
 8995     assert(UseAVX >= 2, "required");
 8996 
 8997     int vlen_enc = vector_length_encoding(this);
 8998     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8999   %}
 9000   ins_pipe( pipe_slow );
 9001 %}
 9002 
 9003 instruct loadShuffleL_evex(vec dst, vec src) %{
 9004   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9005             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9006   match(Set dst (VectorLoadShuffle src));
 9007   format %{ "vector_load_shuffle $dst, $src" %}
 9008   ins_encode %{
 9009     assert(UseAVX > 2, "required");
 9010 
 9011     int vlen_enc = vector_length_encoding(this);
 9012     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9013   %}
 9014   ins_pipe( pipe_slow );
 9015 %}
 9016 
 9017 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9018   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9019             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9020   match(Set dst (VectorRearrange src shuffle));
 9021   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9022   ins_encode %{
 9023     assert(UseAVX > 2, "required");
 9024 
 9025     int vlen_enc = vector_length_encoding(this);
 9026     if (vlen_enc == Assembler::AVX_128bit) {
 9027       vlen_enc = Assembler::AVX_256bit;
 9028     }
 9029     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9030   %}
 9031   ins_pipe( pipe_slow );
 9032 %}
 9033 
 9034 // --------------------------------- FMA --------------------------------------
 9035 // a * b + c
 9036 
 9037 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9038   match(Set c (FmaVF  c (Binary a b)));
 9039   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9040   ins_cost(150);
 9041   ins_encode %{
 9042     assert(UseFMA, "not enabled");
 9043     int vlen_enc = vector_length_encoding(this);
 9044     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9045   %}
 9046   ins_pipe( pipe_slow );
 9047 %}
 9048 
 9049 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9050   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9051   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9052   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9053   ins_cost(150);
 9054   ins_encode %{
 9055     assert(UseFMA, "not enabled");
 9056     int vlen_enc = vector_length_encoding(this);
 9057     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9058   %}
 9059   ins_pipe( pipe_slow );
 9060 %}
 9061 
 9062 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9063   match(Set c (FmaVD  c (Binary a b)));
 9064   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9065   ins_cost(150);
 9066   ins_encode %{
 9067     assert(UseFMA, "not enabled");
 9068     int vlen_enc = vector_length_encoding(this);
 9069     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9070   %}
 9071   ins_pipe( pipe_slow );
 9072 %}
 9073 
 9074 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9075   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9076   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9077   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9078   ins_cost(150);
 9079   ins_encode %{
 9080     assert(UseFMA, "not enabled");
 9081     int vlen_enc = vector_length_encoding(this);
 9082     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9083   %}
 9084   ins_pipe( pipe_slow );
 9085 %}
 9086 
 9087 // --------------------------------- Vector Multiply Add --------------------------------------
 9088 
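// MulAddVS2VI maps to pmaddwd, which multiplies adjacent signed 16-bit pairs
// and sums each pair into a 32-bit lane:
//
//   dst_int[i] = src1_short[2*i]   * src2_short[2*i]
//              + src1_short[2*i+1] * src2_short[2*i+1];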
 9089 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9090   predicate(UseAVX == 0);
 9091   match(Set dst (MulAddVS2VI dst src1));
 9092   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9093   ins_encode %{
 9094     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9095   %}
 9096   ins_pipe( pipe_slow );
 9097 %}
 9098 
 9099 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9100   predicate(UseAVX > 0);
 9101   match(Set dst (MulAddVS2VI src1 src2));
 9102   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9103   ins_encode %{
 9104     int vlen_enc = vector_length_encoding(this);
 9105     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9106   %}
 9107   ins_pipe( pipe_slow );
 9108 %}
 9109 
 9110 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9111 
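// On AVX512-VNNI hardware the pmaddwd + paddd pair is fused into a single
// evpdpwssd, which additionally accumulates into dst:
//
//   dst_int[i] += src1_short[2*i]   * src2_short[2*i]
//               + src1_short[2*i+1] * src2_short[2*i+1];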
 9112 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9113   predicate(VM_Version::supports_avx512_vnni());
 9114   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9115   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9116   ins_encode %{
 9117     assert(UseAVX > 2, "required");
 9118     int vlen_enc = vector_length_encoding(this);
 9119     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9120   %}
 9121   ins_pipe( pipe_slow );
 9122   ins_cost(10);
 9123 %}
 9124 
 9125 // --------------------------------- PopCount --------------------------------------
 9126 
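// The EVEX rules presumably rely on the AVX512 VPOPCNT{D,Q} instructions
// (gated by is_vector_popcount_predicate); the AVX fallback computes the count
// in the macro assembler (presumably a table-based approach), which is why it
// needs the extra XMM and GPR temporaries.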
 9127 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9128   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9129   match(Set dst (PopCountVI src));
 9130   match(Set dst (PopCountVL src));
 9131   format %{ "vector_popcount_integral $dst, $src" %}
 9132   ins_encode %{
 9133     int opcode = this->ideal_Opcode();
 9134     int vlen_enc = vector_length_encoding(this, $src);
 9135     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9136     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9137   %}
 9138   ins_pipe( pipe_slow );
 9139 %}
 9140 
 9141 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9142   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9143   match(Set dst (PopCountVI src mask));
 9144   match(Set dst (PopCountVL src mask));
 9145   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9146   ins_encode %{
 9147     int vlen_enc = vector_length_encoding(this, $src);
 9148     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9149     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9150     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9151   %}
 9152   ins_pipe( pipe_slow );
 9153 %}
 9154 
 9155 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9156   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9157   match(Set dst (PopCountVI src));
 9158   match(Set dst (PopCountVL src));
 9159   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9160   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9161   ins_encode %{
 9162     int opcode = this->ideal_Opcode();
 9163     int vlen_enc = vector_length_encoding(this, $src);
 9164     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9165     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9166                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9167   %}
 9168   ins_pipe( pipe_slow );
 9169 %}
 9170 
 9171 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9172 
 9173 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9174   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9175                                               Matcher::vector_length_in_bytes(n->in(1))));
 9176   match(Set dst (CountTrailingZerosV src));
 9177   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9178   ins_cost(400);
 9179   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9180   ins_encode %{
 9181     int vlen_enc = vector_length_encoding(this, $src);
 9182     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9183     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9184                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9185   %}
 9186   ins_pipe( pipe_slow );
 9187 %}
 9188 
 9189 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9190   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9191             VM_Version::supports_avx512cd() &&
 9192             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9193   match(Set dst (CountTrailingZerosV src));
 9194   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9195   ins_cost(400);
 9196   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9197   ins_encode %{
 9198     int vlen_enc = vector_length_encoding(this, $src);
 9199     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9200     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9201                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9202   %}
 9203   ins_pipe( pipe_slow );
 9204 %}
 9205 
 9206 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9207   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9208   match(Set dst (CountTrailingZerosV src));
 9209   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9210   ins_cost(400);
 9211   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9212   ins_encode %{
 9213     int vlen_enc = vector_length_encoding(this, $src);
 9214     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9215     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9216                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9217                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9218   %}
 9219   ins_pipe( pipe_slow );
 9220 %}
 9221 
 9222 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9223   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9224   match(Set dst (CountTrailingZerosV src));
 9225   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9226   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9227   ins_encode %{
 9228     int vlen_enc = vector_length_encoding(this, $src);
 9229     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9230     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9231                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9232   %}
 9233   ins_pipe( pipe_slow );
 9234 %}
 9235 
 9236 
 9237 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9238 
 9239 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9240   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9241   effect(TEMP dst);
 9242   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9243   ins_encode %{
 9244     int vector_len = vector_length_encoding(this);
 9245     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9246   %}
 9247   ins_pipe( pipe_slow );
 9248 %}
 9249 
 9250 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9251   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9252   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9253   effect(TEMP dst);
 9254   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9255   ins_encode %{
 9256     int vector_len = vector_length_encoding(this);
 9257     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9258   %}
 9259   ins_pipe( pipe_slow );
 9260 %}
 9261 
 9262 // --------------------------------- Rotation Operations ----------------------------------
 9263 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9264   match(Set dst (RotateLeftV src shift));
 9265   match(Set dst (RotateRightV src shift));
 9266   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9267   ins_encode %{
 9268     int opcode      = this->ideal_Opcode();
 9269     int vector_len  = vector_length_encoding(this);
 9270     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9271     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9272   %}
 9273   ins_pipe( pipe_slow );
 9274 %}
 9275 
 9276 instruct vprorate(vec dst, vec src, vec shift) %{
 9277   match(Set dst (RotateLeftV src shift));
 9278   match(Set dst (RotateRightV src shift));
 9279   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9280   ins_encode %{
 9281     int opcode      = this->ideal_Opcode();
 9282     int vector_len  = vector_length_encoding(this);
 9283     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9284     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9285   %}
 9286   ins_pipe( pipe_slow );
 9287 %}
 9288 
 9289 // ---------------------------------- Masked Operations ------------------------------------
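// Two flavours: on AVX/AVX2 the mask lives in an XMM/YMM register and the
// loads/stores go through vmovmask (VMASKMOV-style, non-subword elements only);
// with EVEX the mask is a k-register and evmovdqu performs the masked move
// directly.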
 9290 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9291   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9292   match(Set dst (LoadVectorMasked mem mask));
 9293   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9294   ins_encode %{
 9295     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9296     int vlen_enc = vector_length_encoding(this);
 9297     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9298   %}
 9299   ins_pipe( pipe_slow );
 9300 %}
 9301 
 9302 
 9303 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9304   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9305   match(Set dst (LoadVectorMasked mem mask));
 9306   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9307   ins_encode %{
 9308     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9309     int vector_len = vector_length_encoding(this);
 9310     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9311   %}
 9312   ins_pipe( pipe_slow );
 9313 %}
 9314 
 9315 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9316   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9317   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9318   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9319   ins_encode %{
 9320     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9321     int vlen_enc = vector_length_encoding(src_node);
 9322     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9323     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9324   %}
 9325   ins_pipe( pipe_slow );
 9326 %}
 9327 
 9328 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9329   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9330   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9331   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9332   ins_encode %{
 9333     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9334     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9335     int vlen_enc = vector_length_encoding(src_node);
 9336     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9337   %}
 9338   ins_pipe( pipe_slow );
 9339 %}
 9340 
 9341 #ifdef _LP64
 9342 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9343   match(Set addr (VerifyVectorAlignment addr mask));
 9344   effect(KILL cr);
 9345   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9346   ins_encode %{
 9347     Label Lskip;
 9348     // check if masked bits of addr are zero
 9349     __ testq($addr$$Register, $mask$$constant);
 9350     __ jccb(Assembler::equal, Lskip);
 9351     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9352     __ bind(Lskip);
 9353   %}
 9354   ins_pipe(pipe_slow);
 9355 %}
 9356 
 9357 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9358   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9359   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9360   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9361   ins_encode %{
 9362     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9363     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9364 
 9365     Label DONE;
 9366     int vlen_enc = vector_length_encoding(this, $src1);
 9367     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9368 
 9369     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9370     __ mov64($dst$$Register, -1L);
 9371     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9372     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9373     __ jccb(Assembler::carrySet, DONE);
 9374     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9375     __ notq($dst$$Register);
 9376     __ tzcntq($dst$$Register, $dst$$Register);
 9377     __ bind(DONE);
 9378   %}
 9379   ins_pipe( pipe_slow );
 9380 %}
 9381 
 9382 
 9383 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9384   match(Set dst (VectorMaskGen len));
 9385   effect(TEMP temp, KILL cr);
 9386   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9387   ins_encode %{
 9388     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9389   %}
 9390   ins_pipe( pipe_slow );
 9391 %}
 9392 
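// For an immediate length the mask is formed with a shift trick:
// 0xFFFFFFFFFFFFFFFF >> (64 - len) leaves exactly the low len bits set, e.g.
// len == 4 gives 0xF, so kmovql activates the first four lanes.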
 9393 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9394   match(Set dst (VectorMaskGen len));
 9395   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9396   effect(TEMP temp);
 9397   ins_encode %{
 9398     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9399     __ kmovql($dst$$KRegister, $temp$$Register);
 9400   %}
 9401   ins_pipe( pipe_slow );
 9402 %}
 9403 
 9404 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9405   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9406   match(Set dst (VectorMaskToLong mask));
 9407   effect(TEMP dst, KILL cr);
 9408   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9409   ins_encode %{
 9410     int opcode = this->ideal_Opcode();
 9411     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9412     int mask_len = Matcher::vector_length(this, $mask);
 9413     int mask_size = mask_len * type2aelembytes(mbt);
 9414     int vlen_enc = vector_length_encoding(this, $mask);
 9415     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9416                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9417   %}
 9418   ins_pipe( pipe_slow );
 9419 %}
 9420 
 9421 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9422   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9423   match(Set dst (VectorMaskToLong mask));
 9424   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9425   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9426   ins_encode %{
 9427     int opcode = this->ideal_Opcode();
 9428     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9429     int mask_len = Matcher::vector_length(this, $mask);
 9430     int vlen_enc = vector_length_encoding(this, $mask);
 9431     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9432                              $dst$$Register, mask_len, mbt, vlen_enc);
 9433   %}
 9434   ins_pipe( pipe_slow );
 9435 %}
 9436 
 9437 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9438   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9439   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9440   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9441   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9442   ins_encode %{
 9443     int opcode = this->ideal_Opcode();
 9444     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9445     int mask_len = Matcher::vector_length(this, $mask);
 9446     int vlen_enc = vector_length_encoding(this, $mask);
 9447     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9448                              $dst$$Register, mask_len, mbt, vlen_enc);
 9449   %}
 9450   ins_pipe( pipe_slow );
 9451 %}
 9452 
 9453 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9454   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9455   match(Set dst (VectorMaskTrueCount mask));
 9456   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9457   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9458   ins_encode %{
 9459     int opcode = this->ideal_Opcode();
 9460     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9461     int mask_len = Matcher::vector_length(this, $mask);
 9462     int mask_size = mask_len * type2aelembytes(mbt);
 9463     int vlen_enc = vector_length_encoding(this, $mask);
 9464     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9465                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9466   %}
 9467   ins_pipe( pipe_slow );
 9468 %}
 9469 
 9470 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9471   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9472   match(Set dst (VectorMaskTrueCount mask));
 9473   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9474   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9475   ins_encode %{
 9476     int opcode = this->ideal_Opcode();
 9477     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9478     int mask_len = Matcher::vector_length(this, $mask);
 9479     int vlen_enc = vector_length_encoding(this, $mask);
 9480     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9481                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9482   %}
 9483   ins_pipe( pipe_slow );
 9484 %}
 9485 
 9486 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9487   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9488   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9489   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9490   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9491   ins_encode %{
 9492     int opcode = this->ideal_Opcode();
 9493     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9494     int mask_len = Matcher::vector_length(this, $mask);
 9495     int vlen_enc = vector_length_encoding(this, $mask);
 9496     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9497                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9498   %}
 9499   ins_pipe( pipe_slow );
 9500 %}
 9501 
 9502 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9503   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9504   match(Set dst (VectorMaskFirstTrue mask));
 9505   match(Set dst (VectorMaskLastTrue mask));
 9506   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9507   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9508   ins_encode %{
 9509     int opcode = this->ideal_Opcode();
 9510     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9511     int mask_len = Matcher::vector_length(this, $mask);
 9512     int mask_size = mask_len * type2aelembytes(mbt);
 9513     int vlen_enc = vector_length_encoding(this, $mask);
 9514     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9515                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9516   %}
 9517   ins_pipe( pipe_slow );
 9518 %}
 9519 
 9520 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9521   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9522   match(Set dst (VectorMaskFirstTrue mask));
 9523   match(Set dst (VectorMaskLastTrue mask));
 9524   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9525   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9526   ins_encode %{
 9527     int opcode = this->ideal_Opcode();
 9528     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9529     int mask_len = Matcher::vector_length(this, $mask);
 9530     int vlen_enc = vector_length_encoding(this, $mask);
 9531     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9532                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9533   %}
 9534   ins_pipe( pipe_slow );
 9535 %}
 9536 
 9537 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9538   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9539   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9540   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9541   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9542   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9543   ins_encode %{
 9544     int opcode = this->ideal_Opcode();
 9545     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9546     int mask_len = Matcher::vector_length(this, $mask);
 9547     int vlen_enc = vector_length_encoding(this, $mask);
 9548     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9549                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9550   %}
 9551   ins_pipe( pipe_slow );
 9552 %}
 9553 
 9554 // --------------------------------- Compress/Expand Operations ---------------------------
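// CompressV packs the lanes whose mask bit is set contiguously into the low
// end of dst; ExpandV is the inverse, scattering the low lanes of src back out
// to the masked positions. The AVX2 fallback below emulates this by computing a
// permutation vector ($perm) at runtime, while the EVEX form maps onto the
// native compress/expand instructions.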
 9555 #ifdef _LP64
 9556 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9557   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9558   match(Set dst (CompressV src mask));
 9559   match(Set dst (ExpandV src mask));
 9560   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9561   format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9562   ins_encode %{
 9563     int opcode = this->ideal_Opcode();
 9564     int vlen_enc = vector_length_encoding(this);
 9565     BasicType bt  = Matcher::vector_element_basic_type(this);
 9566     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9567                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9568   %}
 9569   ins_pipe( pipe_slow );
 9570 %}
 9571 #endif
 9572 
 9573 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9574   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9575   match(Set dst (CompressV src mask));
 9576   match(Set dst (ExpandV src mask));
 9577   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9578   ins_encode %{
 9579     int opcode = this->ideal_Opcode();
 9580     int vector_len = vector_length_encoding(this);
 9581     BasicType bt  = Matcher::vector_element_basic_type(this);
 9582     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9583   %}
 9584   ins_pipe( pipe_slow );
 9585 %}
 9586 
 9587 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9588   match(Set dst (CompressM mask));
 9589   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9590   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9591   ins_encode %{
 9592     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9593     int mask_len = Matcher::vector_length(this);
 9594     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9595   %}
 9596   ins_pipe( pipe_slow );
 9597 %}
 9598 
 9599 #endif // _LP64
 9600 
 9601 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9602 
 9603 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9604   predicate(!VM_Version::supports_gfni());
 9605   match(Set dst (ReverseV src));
 9606   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9607   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9608   ins_encode %{
 9609     int vec_enc = vector_length_encoding(this);
 9610     BasicType bt = Matcher::vector_element_basic_type(this);
 9611     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9612                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9613   %}
 9614   ins_pipe( pipe_slow );
 9615 %}
 9616 
 9617 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9618   predicate(VM_Version::supports_gfni());
 9619   match(Set dst (ReverseV src));
 9620   effect(TEMP dst, TEMP xtmp);
 9621   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9622   ins_encode %{
 9623     int vec_enc = vector_length_encoding(this);
 9624     BasicType bt  = Matcher::vector_element_basic_type(this);
 9625     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9626     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9627                                $xtmp$$XMMRegister);
 9628   %}
 9629   ins_pipe( pipe_slow );
 9630 %}
 9631 
 9632 instruct vreverse_byte_reg(vec dst, vec src) %{
 9633   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9634   match(Set dst (ReverseBytesV src));
 9635   effect(TEMP dst);
 9636   format %{ "vector_reverse_byte $dst, $src" %}
 9637   ins_encode %{
 9638     int vec_enc = vector_length_encoding(this);
 9639     BasicType bt = Matcher::vector_element_basic_type(this);
 9640     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9641   %}
 9642   ins_pipe( pipe_slow );
 9643 %}
 9644 
 9645 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9646   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9647   match(Set dst (ReverseBytesV src));
 9648   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9649   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9650   ins_encode %{
 9651     int vec_enc = vector_length_encoding(this);
 9652     BasicType bt = Matcher::vector_element_basic_type(this);
 9653     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9654                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9655   %}
 9656   ins_pipe( pipe_slow );
 9657 %}
 9658 
 9659 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9660 
 9661 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9662   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9663                                               Matcher::vector_length_in_bytes(n->in(1))));
 9664   match(Set dst (CountLeadingZerosV src));
 9665   format %{ "vector_count_leading_zeros $dst, $src" %}
 9666   ins_encode %{
 9667      int vlen_enc = vector_length_encoding(this, $src);
 9668      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9669      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9670                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9671   %}
 9672   ins_pipe( pipe_slow );
 9673 %}
 9674 
 9675 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9676   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9677                                               Matcher::vector_length_in_bytes(n->in(1))));
 9678   match(Set dst (CountLeadingZerosV src mask));
 9679   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9680   ins_encode %{
 9681     int vlen_enc = vector_length_encoding(this, $src);
 9682     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9683     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9684     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9685                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9686   %}
 9687   ins_pipe( pipe_slow );
 9688 %}
 9689 
 9690 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9691   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9692             VM_Version::supports_avx512cd() &&
 9693             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9694   match(Set dst (CountLeadingZerosV src));
 9695   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9696   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9697   ins_encode %{
 9698     int vlen_enc = vector_length_encoding(this, $src);
 9699     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9700     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9701                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9702   %}
 9703   ins_pipe( pipe_slow );
 9704 %}
 9705 
 9706 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9707   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9708   match(Set dst (CountLeadingZerosV src));
 9709   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9710   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9711   ins_encode %{
 9712     int vlen_enc = vector_length_encoding(this, $src);
 9713     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9714     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9715                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9716                                        $rtmp$$Register, true, vlen_enc);
 9717   %}
 9718   ins_pipe( pipe_slow );
 9719 %}
 9720 
 9721 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9722   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9723             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9724   match(Set dst (CountLeadingZerosV src));
 9725   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9726   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9727   ins_encode %{
 9728     int vlen_enc = vector_length_encoding(this, $src);
 9729     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9730     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9731                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9732   %}
 9733   ins_pipe( pipe_slow );
 9734 %}
 9735 
 9736 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9737   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9738             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9739   match(Set dst (CountLeadingZerosV src));
 9740   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9741   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9742   ins_encode %{
 9743     int vlen_enc = vector_length_encoding(this, $src);
 9744     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9745     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9746                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9747   %}
 9748   ins_pipe( pipe_slow );
 9749 %}
 9750 
 9751 // ---------------------------------- Vector Masked Operations ------------------------------------
 9752 
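// The masked rules below fold an opmask register ($mask) into the operation using
// AVX-512 merge masking: lanes whose mask bit is clear keep the value already in
// $dst. Each rule simply forwards its ideal opcode, element type and operands to
// MacroAssembler::evmasked_op, which picks the matching EVEX-encoded instruction;
// the boolean passed before vlen_enc selects merge (rather than zero) masking.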
 9753 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9754   match(Set dst (AddVB (Binary dst src2) mask));
 9755   match(Set dst (AddVS (Binary dst src2) mask));
 9756   match(Set dst (AddVI (Binary dst src2) mask));
 9757   match(Set dst (AddVL (Binary dst src2) mask));
 9758   match(Set dst (AddVF (Binary dst src2) mask));
 9759   match(Set dst (AddVD (Binary dst src2) mask));
 9760   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9761   ins_encode %{
 9762     int vlen_enc = vector_length_encoding(this);
 9763     BasicType bt = Matcher::vector_element_basic_type(this);
 9764     int opc = this->ideal_Opcode();
 9765     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9766                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9767   %}
 9768   ins_pipe( pipe_slow );
 9769 %}
 9770 
 9771 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9772   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9773   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9774   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9775   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9776   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9777   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9778   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9779   ins_encode %{
 9780     int vlen_enc = vector_length_encoding(this);
 9781     BasicType bt = Matcher::vector_element_basic_type(this);
 9782     int opc = this->ideal_Opcode();
 9783     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9784                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9785   %}
 9786   ins_pipe( pipe_slow );
 9787 %}
 9788 
 9789 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9790   match(Set dst (XorV (Binary dst src2) mask));
 9791   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9792   ins_encode %{
 9793     int vlen_enc = vector_length_encoding(this);
 9794     BasicType bt = Matcher::vector_element_basic_type(this);
 9795     int opc = this->ideal_Opcode();
 9796     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9797                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9798   %}
 9799   ins_pipe( pipe_slow );
 9800 %}
 9801 
 9802 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9803   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9804   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9805   ins_encode %{
 9806     int vlen_enc = vector_length_encoding(this);
 9807     BasicType bt = Matcher::vector_element_basic_type(this);
 9808     int opc = this->ideal_Opcode();
 9809     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9810                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9811   %}
 9812   ins_pipe( pipe_slow );
 9813 %}
 9814 
 9815 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9816   match(Set dst (OrV (Binary dst src2) mask));
 9817   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9818   ins_encode %{
 9819     int vlen_enc = vector_length_encoding(this);
 9820     BasicType bt = Matcher::vector_element_basic_type(this);
 9821     int opc = this->ideal_Opcode();
 9822     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9823                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9824   %}
 9825   ins_pipe( pipe_slow );
 9826 %}
 9827 
 9828 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9829   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9830   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9831   ins_encode %{
 9832     int vlen_enc = vector_length_encoding(this);
 9833     BasicType bt = Matcher::vector_element_basic_type(this);
 9834     int opc = this->ideal_Opcode();
 9835     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9836                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9837   %}
 9838   ins_pipe( pipe_slow );
 9839 %}
 9840 
 9841 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9842   match(Set dst (AndV (Binary dst src2) mask));
 9843   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9844   ins_encode %{
 9845     int vlen_enc = vector_length_encoding(this);
 9846     BasicType bt = Matcher::vector_element_basic_type(this);
 9847     int opc = this->ideal_Opcode();
 9848     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9849                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9850   %}
 9851   ins_pipe( pipe_slow );
 9852 %}
 9853 
 9854 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9855   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9856   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9857   ins_encode %{
 9858     int vlen_enc = vector_length_encoding(this);
 9859     BasicType bt = Matcher::vector_element_basic_type(this);
 9860     int opc = this->ideal_Opcode();
 9861     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9862                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9863   %}
 9864   ins_pipe( pipe_slow );
 9865 %}
 9866 
 9867 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9868   match(Set dst (SubVB (Binary dst src2) mask));
 9869   match(Set dst (SubVS (Binary dst src2) mask));
 9870   match(Set dst (SubVI (Binary dst src2) mask));
 9871   match(Set dst (SubVL (Binary dst src2) mask));
 9872   match(Set dst (SubVF (Binary dst src2) mask));
 9873   match(Set dst (SubVD (Binary dst src2) mask));
 9874   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9875   ins_encode %{
 9876     int vlen_enc = vector_length_encoding(this);
 9877     BasicType bt = Matcher::vector_element_basic_type(this);
 9878     int opc = this->ideal_Opcode();
 9879     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9880                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9881   %}
 9882   ins_pipe( pipe_slow );
 9883 %}
 9884 
 9885 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9886   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9887   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9888   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9889   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9890   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9891   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9892   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9893   ins_encode %{
 9894     int vlen_enc = vector_length_encoding(this);
 9895     BasicType bt = Matcher::vector_element_basic_type(this);
 9896     int opc = this->ideal_Opcode();
 9897     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9898                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9899   %}
 9900   ins_pipe( pipe_slow );
 9901 %}
 9902 
 9903 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9904   match(Set dst (MulVS (Binary dst src2) mask));
 9905   match(Set dst (MulVI (Binary dst src2) mask));
 9906   match(Set dst (MulVL (Binary dst src2) mask));
 9907   match(Set dst (MulVF (Binary dst src2) mask));
 9908   match(Set dst (MulVD (Binary dst src2) mask));
 9909   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9910   ins_encode %{
 9911     int vlen_enc = vector_length_encoding(this);
 9912     BasicType bt = Matcher::vector_element_basic_type(this);
 9913     int opc = this->ideal_Opcode();
 9914     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9915                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9916   %}
 9917   ins_pipe( pipe_slow );
 9918 %}
 9919 
 9920 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9921   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9922   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9923   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9924   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9925   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9926   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9927   ins_encode %{
 9928     int vlen_enc = vector_length_encoding(this);
 9929     BasicType bt = Matcher::vector_element_basic_type(this);
 9930     int opc = this->ideal_Opcode();
 9931     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9932                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9933   %}
 9934   ins_pipe( pipe_slow );
 9935 %}
 9936 
 9937 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9938   match(Set dst (SqrtVF dst mask));
 9939   match(Set dst (SqrtVD dst mask));
 9940   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9941   ins_encode %{
 9942     int vlen_enc = vector_length_encoding(this);
 9943     BasicType bt = Matcher::vector_element_basic_type(this);
 9944     int opc = this->ideal_Opcode();
 9945     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9946                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9947   %}
 9948   ins_pipe( pipe_slow );
 9949 %}
 9950 
 9951 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9952   match(Set dst (DivVF (Binary dst src2) mask));
 9953   match(Set dst (DivVD (Binary dst src2) mask));
 9954   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9955   ins_encode %{
 9956     int vlen_enc = vector_length_encoding(this);
 9957     BasicType bt = Matcher::vector_element_basic_type(this);
 9958     int opc = this->ideal_Opcode();
 9959     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9960                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9961   %}
 9962   ins_pipe( pipe_slow );
 9963 %}
 9964 
 9965 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9966   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9967   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9968   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9969   ins_encode %{
 9970     int vlen_enc = vector_length_encoding(this);
 9971     BasicType bt = Matcher::vector_element_basic_type(this);
 9972     int opc = this->ideal_Opcode();
 9973     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9974                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9975   %}
 9976   ins_pipe( pipe_slow );
 9977 %}
 9978 
 9979 
 9980 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9981   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9982   match(Set dst (RotateRightV (Binary dst shift) mask));
 9983   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9984   ins_encode %{
 9985     int vlen_enc = vector_length_encoding(this);
 9986     BasicType bt = Matcher::vector_element_basic_type(this);
 9987     int opc = this->ideal_Opcode();
 9988     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9989                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9990   %}
 9991   ins_pipe( pipe_slow );
 9992 %}
 9993 
 9994 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9995   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9996   match(Set dst (RotateRightV (Binary dst src2) mask));
 9997   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9998   ins_encode %{
 9999     int vlen_enc = vector_length_encoding(this);
10000     BasicType bt = Matcher::vector_element_basic_type(this);
10001     int opc = this->ideal_Opcode();
10002     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10003                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10004   %}
10005   ins_pipe( pipe_slow );
10006 %}
10007 
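// Masked shifts. Immediate counts arrive through LShiftCntV/RShiftCntV and are
// passed to evmasked_op as a constant; register counts come in two flavours,
// distinguished by ShiftV::is_var_shift(): a single broadcast count versus a
// per-lane (variable) count, signalled by the trailing boolean argument.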
10008 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10009   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10010   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10011   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10012   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10013   ins_encode %{
10014     int vlen_enc = vector_length_encoding(this);
10015     BasicType bt = Matcher::vector_element_basic_type(this);
10016     int opc = this->ideal_Opcode();
10017     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10018                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10019   %}
10020   ins_pipe( pipe_slow );
10021 %}
10022 
10023 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10024   predicate(!n->as_ShiftV()->is_var_shift());
10025   match(Set dst (LShiftVS (Binary dst src2) mask));
10026   match(Set dst (LShiftVI (Binary dst src2) mask));
10027   match(Set dst (LShiftVL (Binary dst src2) mask));
10028   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10029   ins_encode %{
10030     int vlen_enc = vector_length_encoding(this);
10031     BasicType bt = Matcher::vector_element_basic_type(this);
10032     int opc = this->ideal_Opcode();
10033     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10034                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10035   %}
10036   ins_pipe( pipe_slow );
10037 %}
10038 
10039 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10040   predicate(n->as_ShiftV()->is_var_shift());
10041   match(Set dst (LShiftVS (Binary dst src2) mask));
10042   match(Set dst (LShiftVI (Binary dst src2) mask));
10043   match(Set dst (LShiftVL (Binary dst src2) mask));
10044   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10045   ins_encode %{
10046     int vlen_enc = vector_length_encoding(this);
10047     BasicType bt = Matcher::vector_element_basic_type(this);
10048     int opc = this->ideal_Opcode();
10049     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10050                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10051   %}
10052   ins_pipe( pipe_slow );
10053 %}
10054 
10055 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10056   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10057   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10058   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10059   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10060   ins_encode %{
10061     int vlen_enc = vector_length_encoding(this);
10062     BasicType bt = Matcher::vector_element_basic_type(this);
10063     int opc = this->ideal_Opcode();
10064     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10065                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10066   %}
10067   ins_pipe( pipe_slow );
10068 %}
10069 
10070 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10071   predicate(!n->as_ShiftV()->is_var_shift());
10072   match(Set dst (RShiftVS (Binary dst src2) mask));
10073   match(Set dst (RShiftVI (Binary dst src2) mask));
10074   match(Set dst (RShiftVL (Binary dst src2) mask));
10075   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10076   ins_encode %{
10077     int vlen_enc = vector_length_encoding(this);
10078     BasicType bt = Matcher::vector_element_basic_type(this);
10079     int opc = this->ideal_Opcode();
10080     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10081                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10082   %}
10083   ins_pipe( pipe_slow );
10084 %}
10085 
10086 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10087   predicate(n->as_ShiftV()->is_var_shift());
10088   match(Set dst (RShiftVS (Binary dst src2) mask));
10089   match(Set dst (RShiftVI (Binary dst src2) mask));
10090   match(Set dst (RShiftVL (Binary dst src2) mask));
10091   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10092   ins_encode %{
10093     int vlen_enc = vector_length_encoding(this);
10094     BasicType bt = Matcher::vector_element_basic_type(this);
10095     int opc = this->ideal_Opcode();
10096     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10097                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10098   %}
10099   ins_pipe( pipe_slow );
10100 %}
10101 
10102 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10103   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10104   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10105   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10106   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10107   ins_encode %{
10108     int vlen_enc = vector_length_encoding(this);
10109     BasicType bt = Matcher::vector_element_basic_type(this);
10110     int opc = this->ideal_Opcode();
10111     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10112                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10113   %}
10114   ins_pipe( pipe_slow );
10115 %}
10116 
10117 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10118   predicate(!n->as_ShiftV()->is_var_shift());
10119   match(Set dst (URShiftVS (Binary dst src2) mask));
10120   match(Set dst (URShiftVI (Binary dst src2) mask));
10121   match(Set dst (URShiftVL (Binary dst src2) mask));
10122   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10123   ins_encode %{
10124     int vlen_enc = vector_length_encoding(this);
10125     BasicType bt = Matcher::vector_element_basic_type(this);
10126     int opc = this->ideal_Opcode();
10127     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10128                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10129   %}
10130   ins_pipe( pipe_slow );
10131 %}
10132 
10133 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10134   predicate(n->as_ShiftV()->is_var_shift());
10135   match(Set dst (URShiftVS (Binary dst src2) mask));
10136   match(Set dst (URShiftVI (Binary dst src2) mask));
10137   match(Set dst (URShiftVL (Binary dst src2) mask));
10138   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10139   ins_encode %{
10140     int vlen_enc = vector_length_encoding(this);
10141     BasicType bt = Matcher::vector_element_basic_type(this);
10142     int opc = this->ideal_Opcode();
10143     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10144                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10145   %}
10146   ins_pipe( pipe_slow );
10147 %}
10148 
10149 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10150   match(Set dst (MaxV (Binary dst src2) mask));
10151   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10152   ins_encode %{
10153     int vlen_enc = vector_length_encoding(this);
10154     BasicType bt = Matcher::vector_element_basic_type(this);
10155     int opc = this->ideal_Opcode();
10156     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10157                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10158   %}
10159   ins_pipe( pipe_slow );
10160 %}
10161 
10162 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10163   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10164   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10165   ins_encode %{
10166     int vlen_enc = vector_length_encoding(this);
10167     BasicType bt = Matcher::vector_element_basic_type(this);
10168     int opc = this->ideal_Opcode();
10169     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10170                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10171   %}
10172   ins_pipe( pipe_slow );
10173 %}
10174 
10175 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10176   match(Set dst (MinV (Binary dst src2) mask));
10177   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10178   ins_encode %{
10179     int vlen_enc = vector_length_encoding(this);
10180     BasicType bt = Matcher::vector_element_basic_type(this);
10181     int opc = this->ideal_Opcode();
10182     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10183                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10184   %}
10185   ins_pipe( pipe_slow );
10186 %}
10187 
10188 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10189   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10190   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10191   ins_encode %{
10192     int vlen_enc = vector_length_encoding(this);
10193     BasicType bt = Matcher::vector_element_basic_type(this);
10194     int opc = this->ideal_Opcode();
10195     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10196                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10197   %}
10198   ins_pipe( pipe_slow );
10199 %}
10200 
10201 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10202   match(Set dst (VectorRearrange (Binary dst src2) mask));
10203   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10204   ins_encode %{
10205     int vlen_enc = vector_length_encoding(this);
10206     BasicType bt = Matcher::vector_element_basic_type(this);
10207     int opc = this->ideal_Opcode();
10208     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10209                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10210   %}
10211   ins_pipe( pipe_slow );
10212 %}
10213 
10214 instruct vabs_masked(vec dst, kReg mask) %{
10215   match(Set dst (AbsVB dst mask));
10216   match(Set dst (AbsVS dst mask));
10217   match(Set dst (AbsVI dst mask));
10218   match(Set dst (AbsVL dst mask));
10219   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10220   ins_encode %{
10221     int vlen_enc = vector_length_encoding(this);
10222     BasicType bt = Matcher::vector_element_basic_type(this);
10223     int opc = this->ideal_Opcode();
10224     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10225                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10226   %}
10227   ins_pipe( pipe_slow );
10228 %}
10229 
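// Masked FMA. Operand roles follow the unmasked FmaVF/FmaVD rules; lanes
// deselected by $mask keep the prior contents of $dst.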
10230 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10231   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10232   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10233   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10234   ins_encode %{
10235     assert(UseFMA, "Needs FMA instructions support.");
10236     int vlen_enc = vector_length_encoding(this);
10237     BasicType bt = Matcher::vector_element_basic_type(this);
10238     int opc = this->ideal_Opcode();
10239     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10240                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10241   %}
10242   ins_pipe( pipe_slow );
10243 %}
10244 
10245 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10246   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10247   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10248   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10249   ins_encode %{
10250     assert(UseFMA, "Needs FMA instructions support.");
10251     int vlen_enc = vector_length_encoding(this);
10252     BasicType bt = Matcher::vector_element_basic_type(this);
10253     int opc = this->ideal_Opcode();
10254     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10255                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10256   %}
10257   ins_pipe( pipe_slow );
10258 %}
10259 
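// Masked vector compare into an opmask register: only lanes enabled in $mask
// participate in the comparison. The BoolTest condition is translated to an
// EVEX comparison predicate and dispatched on the element type of src1.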
10260 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10261   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10262   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10263   ins_encode %{
10264     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10265     int vlen_enc = vector_length_encoding(this, $src1);
10266     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10267 
10268     // Dispatch the masked comparison on the element type of src1.
10269     switch (src1_elem_bt) {
10270       case T_BYTE: {
10271         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10272         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10273         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10274         break;
10275       }
10276       case T_SHORT: {
10277         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10278         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10279         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10280         break;
10281       }
10282       case T_INT: {
10283         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10284         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10285         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10286         break;
10287       }
10288       case T_LONG: {
10289         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10290         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10291         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10292         break;
10293       }
10294       case T_FLOAT: {
10295         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10296         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10297         break;
10298       }
10299       case T_DOUBLE: {
10300         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10301         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10302         break;
10303       }
10304       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10305     }
10306   %}
10307   ins_pipe( pipe_slow );
10308 %}
10309 
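// MaskAll replicates a scalar boolean value into every lane of an opmask register.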
10310 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10311   predicate(Matcher::vector_length(n) <= 32);
10312   match(Set dst (MaskAll src));
10313   format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
10314   ins_encode %{
10315     int mask_len = Matcher::vector_length(this);
10316     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10317   %}
10318   ins_pipe( pipe_slow );
10319 %}
10320 
10321 #ifdef _LP64
10322 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10323   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10324   match(Set dst (XorVMask src (MaskAll cnt)));
10325   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10326   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10327   ins_encode %{
10328     uint masklen = Matcher::vector_length(this);
10329     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10330   %}
10331   ins_pipe( pipe_slow );
10332 %}
10333 
10334 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10335   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10336             (Matcher::vector_length(n) == 16) ||
10337             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10338   match(Set dst (XorVMask src (MaskAll cnt)));
10339   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10340   ins_encode %{
10341     uint masklen = Matcher::vector_length(this);
10342     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10343   %}
10344   ins_pipe( pipe_slow );
10345 %}
10346 
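// VectorLongToMask turns a long bit pattern into a vector mask. Without predicate
// registers (AVX) the bits are expanded into a boolean vector; on EVEX targets with
// a kRegister mask type the value is moved directly into the opmask register.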
10347 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10348   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10349   match(Set dst (VectorLongToMask src));
10350   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10351   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10352   ins_encode %{
10353     int mask_len = Matcher::vector_length(this);
10354     int vec_enc  = vector_length_encoding(mask_len);
10355     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10356                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10357   %}
10358   ins_pipe( pipe_slow );
10359 %}
10360 
10361 
10362 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10363   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10364   match(Set dst (VectorLongToMask src));
10365   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10366   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10367   ins_encode %{
10368     int mask_len = Matcher::vector_length(this);
10369     assert(mask_len <= 32, "invalid mask length");
10370     int vec_enc  = vector_length_encoding(mask_len);
10371     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10372                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10373   %}
10374   ins_pipe( pipe_slow );
10375 %}
10376 
10377 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10378   predicate(n->bottom_type()->isa_vectmask());
10379   match(Set dst (VectorLongToMask src));
10380   format %{ "long_to_mask_evex $dst, $src\t!" %}
10381   ins_encode %{
10382     __ kmov($dst$$KRegister, $src$$Register);
10383   %}
10384   ins_pipe( pipe_slow );
10385 %}
10386 #endif
10387 
10388 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10389   match(Set dst (AndVMask src1 src2));
10390   match(Set dst (OrVMask src1 src2));
10391   match(Set dst (XorVMask src1 src2));
10392   effect(TEMP kscratch);
10393   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10394   ins_encode %{
10395     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10396     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10397     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10398     uint masklen = Matcher::vector_length(this);
10399     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10400     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10401   %}
10402   ins_pipe( pipe_slow );
10403 %}
10404 
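// Masked ternary logic: $func is the 8-bit truth table evaluated bitwise over
// ($dst, $src2, $src3); lanes deselected by $mask keep the prior value of $dst.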
10405 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10406   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10407   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10408   ins_encode %{
10409     int vlen_enc = vector_length_encoding(this);
10410     BasicType bt = Matcher::vector_element_basic_type(this);
10411     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10412                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10413   %}
10414   ins_pipe( pipe_slow );
10415 %}
10416 
10417 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10418   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10419   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10420   ins_encode %{
10421     int vlen_enc = vector_length_encoding(this);
10422     BasicType bt = Matcher::vector_element_basic_type(this);
10423     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10424                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10425   %}
10426   ins_pipe( pipe_slow );
10427 %}
10428 
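// CastVV only conveys type information to the compiler; it generates no code
// (size 0, empty encoding), with one flavour per register class.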
10429 instruct castMM(kReg dst)
10430 %{
10431   match(Set dst (CastVV dst));
10432 
10433   size(0);
10434   format %{ "# castVV of $dst" %}
10435   ins_encode(/* empty encoding */);
10436   ins_cost(0);
10437   ins_pipe(empty);
10438 %}
10439 
10440 instruct castVV(vec dst)
10441 %{
10442   match(Set dst (CastVV dst));
10443 
10444   size(0);
10445   format %{ "# castVV of $dst" %}
10446   ins_encode(/* empty encoding */);
10447   ins_cost(0);
10448   ins_pipe(empty);
10449 %}
10450 
10451 instruct castVVLeg(legVec dst)
10452 %{
10453   match(Set dst (CastVV dst));
10454 
10455   size(0);
10456   format %{ "# castVV of $dst" %}
10457   ins_encode(/* empty encoding */);
10458   ins_cost(0);
10459   ins_pipe(empty);
10460 %}
10461 
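// IsInfinite checks use vfpclassss/vfpclasssd with immediate 0x18, selecting the
// +infinity (0x08) and -infinity (0x10) classes; the resulting mask bit is then
// copied into the integer destination.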
10462 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10463 %{
10464   match(Set dst (IsInfiniteF src));
10465   effect(TEMP ktmp, KILL cr);
10466   format %{ "float_class_check $dst, $src" %}
10467   ins_encode %{
10468     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10469     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10470   %}
10471   ins_pipe(pipe_slow);
10472 %}
10473 
10474 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10475 %{
10476   match(Set dst (IsInfiniteD src));
10477   effect(TEMP ktmp, KILL cr);
10478   format %{ "double_class_check $dst, $src" %}
10479   ins_encode %{
10480     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10481     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10482   %}
10483   ins_pipe(pipe_slow);
10484 %}
10485 
10486 
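// SelectFromTwoVector treats {src1, src2} as one logical table and selects elements
// by the per-lane indices in $index (a two-table permute); the result is written
// back into the $index register.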
10487 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10488 %{
10489   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10490   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10491   ins_encode %{
10492     int vlen_enc = vector_length_encoding(this);
10493     BasicType bt = Matcher::vector_element_basic_type(this);
10494     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10495   %}
10496   ins_pipe(pipe_slow);
10497 %}