//
// Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
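//
// For illustration only (the concrete definitions follow below), a single
// entry such as
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// is read as: register save type = SOC, C convention save type = SOC,
// ideal register type = Op_RegF (spilled as a float), encoding = 0, and the
// last argument names the concrete VMReg slot this definition is bound to.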

// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
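// (Note: each opmask register k1-k7 is 64 bits wide, which is why every
// register below is described as a pair of 32-bit halves, Kn and Kn_H.
// k0 is not listed, presumably because the EVEX encoding reserves it to
// mean "no masking".)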
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

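// reg_class_dynamic picks one of the two classes above based on the trailing
// %{ ... %} predicate: the first (EVEX) class when it evaluates to true, the
// second (legacy) class otherwise.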
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
 1204     // three 5 byte instructions plus one move for unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
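// Platform-dependent (PD) node flags for x86, extending the generic Node flag
// space.  Flag_intel_jcc_erratum marks nodes that may need extra padding to
// mitigate the Intel JCC erratum (see pd_alignment_required() and
// compute_padding() below); the sets_*/clears_* flags record which
// condition-code (EFLAGS) bits an instruction sets or clears.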
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   C2_MacroAssembler _masm(&cbuf);
 1314   address base = __ start_a_stub(size_exception_handler());
 1315   if (base == nullptr) {
 1316     ciEnv::current()->record_failure("CodeCache is full");
 1317     return 0;  // CodeBuffer::expand failed
 1318   }
 1319   int offset = __ offset();
 1320   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1321   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1322   __ end_a_stub();
 1323   return offset;
 1324 }
 1325 
 1326 // Emit deopt handler code.
 1327 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1328 
 1329   // Note that the code buffer's insts_mark is always relative to insts.
 1330   // That's why we must use the macroassembler to generate a handler.
 1331   C2_MacroAssembler _masm(&cbuf);
 1332   address base = __ start_a_stub(size_deopt_handler());
 1333   if (base == nullptr) {
 1334     ciEnv::current()->record_failure("CodeCache is full");
 1335     return 0;  // CodeBuffer::expand failed
 1336   }
 1337   int offset = __ offset();
 1338 
 1339 #ifdef _LP64
 1340   address the_pc = (address) __ pc();
 1341   Label next;
  // Push the value of "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1344 
 1345   // push address of "next"
 1346   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1347   __ bind(next);
 1348   // adjust it so it matches "the_pc"
 1349   __ subptr(Address(rsp, 0), __ offset() - offset);
 1350 #else
 1351   InternalAddress here(__ pc());
 1352   __ pushptr(here.addr(), noreg);
 1353 #endif
 1354 
 1355   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1356   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1357   __ end_a_stub();
 1358   return offset;
 1359 }
 1360 
 1361 static Assembler::Width widthForType(BasicType bt) {
 1362   if (bt == T_BYTE) {
 1363     return Assembler::B;
 1364   } else if (bt == T_SHORT) {
 1365     return Assembler::W;
 1366   } else if (bt == T_INT) {
 1367     return Assembler::D;
 1368   } else {
 1369     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1370     return Assembler::Q;
 1371   }
 1372 }
 1373 
 1374 //=============================================================================
 1375 
 1376   // Float masks come from different places depending on platform.
 1377 #ifdef _LP64
 1378   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1379   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1380   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1381   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1382 #else
 1383   static address float_signmask()  { return (address)float_signmask_pool; }
 1384   static address float_signflip()  { return (address)float_signflip_pool; }
 1385   static address double_signmask() { return (address)double_signmask_pool; }
 1386   static address double_signflip() { return (address)double_signflip_pool; }
 1387 #endif
 1388   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1389   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1390   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1391   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1392   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1393   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1394   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1395   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1396   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1397   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1398   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1399   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1400   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1401   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1402   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1403 
 1404 //=============================================================================
 1405 bool Matcher::match_rule_supported(int opcode) {
 1406   if (!has_match_rule(opcode)) {
 1407     return false; // no match rule present
 1408   }
 1409   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1410   switch (opcode) {
 1411     case Op_AbsVL:
 1412     case Op_StoreVectorScatter:
 1413       if (UseAVX < 3) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountI:
 1418     case Op_PopCountL:
 1419       if (!UsePopCountInstruction) {
 1420         return false;
 1421       }
 1422       break;
 1423     case Op_PopCountVI:
 1424       if (UseAVX < 2) {
 1425         return false;
 1426       }
 1427       break;
 1428     case Op_CompressV:
 1429     case Op_ExpandV:
 1430     case Op_PopCountVL:
 1431       if (UseAVX < 2) {
 1432         return false;
 1433       }
 1434       break;
 1435     case Op_MulVI:
 1436       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1437         return false;
 1438       }
 1439       break;
 1440     case Op_MulVL:
 1441       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1442         return false;
 1443       }
 1444       break;
 1445     case Op_MulReductionVL:
 1446       if (VM_Version::supports_avx512dq() == false) {
 1447         return false;
 1448       }
 1449       break;
 1450     case Op_AddReductionVL:
 1451       if (UseSSE < 2) { // requires at least SSE2
 1452         return false;
 1453       }
 1454       break;
 1455     case Op_AbsVB:
 1456     case Op_AbsVS:
 1457     case Op_AbsVI:
 1458     case Op_AddReductionVI:
 1459     case Op_AndReductionV:
 1460     case Op_OrReductionV:
 1461     case Op_XorReductionV:
 1462       if (UseSSE < 3) { // requires at least SSSE3
 1463         return false;
 1464       }
 1465       break;
 1466     case Op_VectorLoadShuffle:
 1467     case Op_VectorRearrange:
 1468     case Op_MulReductionVI:
 1469       if (UseSSE < 4) { // requires at least SSE4
 1470         return false;
 1471       }
 1472       break;
 1473     case Op_IsInfiniteF:
 1474     case Op_IsInfiniteD:
 1475       if (!VM_Version::supports_avx512dq()) {
 1476         return false;
 1477       }
 1478       break;
 1479     case Op_SqrtVD:
 1480     case Op_SqrtVF:
 1481     case Op_VectorMaskCmp:
 1482     case Op_VectorCastB2X:
 1483     case Op_VectorCastS2X:
 1484     case Op_VectorCastI2X:
 1485     case Op_VectorCastL2X:
 1486     case Op_VectorCastF2X:
 1487     case Op_VectorCastD2X:
 1488     case Op_VectorUCastB2X:
 1489     case Op_VectorUCastS2X:
 1490     case Op_VectorUCastI2X:
 1491     case Op_VectorMaskCast:
 1492       if (UseAVX < 1) { // enabled for AVX only
 1493         return false;
 1494       }
 1495       break;
 1496     case Op_PopulateIndex:
 1497       if (!is_LP64 || (UseAVX < 2)) {
 1498         return false;
 1499       }
 1500       break;
 1501     case Op_RoundVF:
 1502       if (UseAVX < 2) { // enabled for AVX2 only
 1503         return false;
 1504       }
 1505       break;
 1506     case Op_RoundVD:
 1507       if (UseAVX < 3) {
 1508         return false;  // enabled for AVX3 only
 1509       }
 1510       break;
 1511     case Op_CompareAndSwapL:
 1512 #ifdef _LP64
 1513     case Op_CompareAndSwapP:
 1514 #endif
 1515       break;
 1516     case Op_StrIndexOf:
 1517       if (!UseSSE42Intrinsics) {
 1518         return false;
 1519       }
 1520       break;
 1521     case Op_StrIndexOfChar:
 1522       if (!UseSSE42Intrinsics) {
 1523         return false;
 1524       }
 1525       break;
 1526     case Op_OnSpinWait:
 1527       if (VM_Version::supports_on_spin_wait() == false) {
 1528         return false;
 1529       }
 1530       break;
 1531     case Op_MulVB:
 1532     case Op_LShiftVB:
 1533     case Op_RShiftVB:
 1534     case Op_URShiftVB:
 1535     case Op_VectorInsert:
 1536     case Op_VectorLoadMask:
 1537     case Op_VectorStoreMask:
 1538     case Op_VectorBlend:
 1539       if (UseSSE < 4) {
 1540         return false;
 1541       }
 1542       break;
 1543 #ifdef _LP64
 1544     case Op_MaxD:
 1545     case Op_MaxF:
 1546     case Op_MinD:
 1547     case Op_MinF:
 1548       if (UseAVX < 1) { // enabled for AVX only
 1549         return false;
 1550       }
 1551       break;
 1552 #endif
 1553     case Op_CacheWB:
 1554     case Op_CacheWBPreSync:
 1555     case Op_CacheWBPostSync:
 1556       if (!VM_Version::supports_data_cache_line_flush()) {
 1557         return false;
 1558       }
 1559       break;
 1560     case Op_ExtractB:
 1561     case Op_ExtractL:
 1562     case Op_ExtractI:
 1563     case Op_RoundDoubleMode:
 1564       if (UseSSE < 4) {
 1565         return false;
 1566       }
 1567       break;
 1568     case Op_RoundDoubleModeV:
 1569       if (VM_Version::supports_avx() == false) {
 1570         return false; // 128bit vroundpd is not available
 1571       }
 1572       break;
 1573     case Op_LoadVectorGather:
 1574       if (UseAVX < 2) {
 1575         return false;
 1576       }
 1577       break;
 1578     case Op_FmaF:
 1579     case Op_FmaD:
 1580     case Op_FmaVD:
 1581     case Op_FmaVF:
 1582       if (!UseFMA) {
 1583         return false;
 1584       }
 1585       break;
 1586     case Op_MacroLogicV:
 1587       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1588         return false;
 1589       }
 1590       break;
 1591 
 1592     case Op_VectorCmpMasked:
 1593     case Op_VectorMaskGen:
 1594       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1595         return false;
 1596       }
 1597       break;
 1598     case Op_VectorMaskFirstTrue:
 1599     case Op_VectorMaskLastTrue:
 1600     case Op_VectorMaskTrueCount:
 1601     case Op_VectorMaskToLong:
 1602       if (!is_LP64 || UseAVX < 1) {
 1603          return false;
 1604       }
 1605       break;
 1606     case Op_RoundF:
 1607     case Op_RoundD:
 1608       if (!is_LP64) {
 1609         return false;
 1610       }
 1611       break;
 1612     case Op_CopySignD:
 1613     case Op_CopySignF:
 1614       if (UseAVX < 3 || !is_LP64)  {
 1615         return false;
 1616       }
 1617       if (!VM_Version::supports_avx512vl()) {
 1618         return false;
 1619       }
 1620       break;
 1621 #ifndef _LP64
 1622     case Op_AddReductionVF:
 1623     case Op_AddReductionVD:
 1624     case Op_MulReductionVF:
 1625     case Op_MulReductionVD:
 1626       if (UseSSE < 1) { // requires at least SSE
 1627         return false;
 1628       }
 1629       break;
 1630     case Op_MulAddVS2VI:
 1631     case Op_RShiftVL:
 1632     case Op_AbsVD:
 1633     case Op_NegVD:
 1634       if (UseSSE < 2) {
 1635         return false;
 1636       }
 1637       break;
 1638 #endif // !LP64
 1639     case Op_CompressBits:
 1640       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1641         return false;
 1642       }
 1643       break;
 1644     case Op_ExpandBits:
 1645       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1646         return false;
 1647       }
 1648       break;
 1649     case Op_SignumF:
 1650       if (UseSSE < 1) {
 1651         return false;
 1652       }
 1653       break;
 1654     case Op_SignumD:
 1655       if (UseSSE < 2) {
 1656         return false;
 1657       }
 1658       break;
 1659     case Op_CompressM:
 1660       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1661         return false;
 1662       }
 1663       break;
 1664     case Op_SqrtF:
 1665       if (UseSSE < 1) {
 1666         return false;
 1667       }
 1668       break;
 1669     case Op_SqrtD:
 1670 #ifdef _LP64
 1671       if (UseSSE < 2) {
 1672         return false;
 1673       }
 1674 #else
 1675       // x86_32.ad has a special match rule for SqrtD.
 1676       // Together with common x86 rules, this handles all UseSSE cases.
 1677 #endif
 1678       break;
 1679     case Op_ConvF2HF:
 1680     case Op_ConvHF2F:
 1681       if (!VM_Version::supports_float16()) {
 1682         return false;
 1683       }
 1684       break;
 1685     case Op_VectorCastF2HF:
 1686     case Op_VectorCastHF2F:
 1687       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1688         return false;
 1689       }
 1690       break;
 1691   }
 1692   return true;  // Match rules are supported by default.
 1693 }
 1694 
 1695 //------------------------------------------------------------------------
 1696 
 1697 static inline bool is_pop_count_instr_target(BasicType bt) {
 1698   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1699          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1700 }
 1701 
 1702 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1703   return match_rule_supported_vector(opcode, vlen, bt);
 1704 }
 1705 
// Identify extra cases for which we want to provide match rules for vector nodes
// and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1708 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1709   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1710   if (!match_rule_supported(opcode)) {
 1711     return false;
 1712   }
 1713   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1714   //   * SSE2 supports 128bit vectors for all types;
 1715   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1716   //   * AVX2 supports 256bit vectors for all types;
  //   * AVX512F supports 512bit vectors only for INT, LONG, FLOAT, and DOUBLE types;
 1718   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1719   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1720   // And MaxVectorSize is taken into account as well.
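  // For example, a 512bit (64 byte) BYTE vector is only supported when AVX512BW
  // is available and MaxVectorSize is at least 64.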
 1721   if (!vector_size_supported(bt, vlen)) {
 1722     return false;
 1723   }
  // Special cases that depend on the vector length follow:
 1725   //   * implementation limitations
 1726   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1727   //   * 128bit vroundpd instruction is present only in AVX1
 1728   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1729   switch (opcode) {
 1730     case Op_AbsVF:
 1731     case Op_NegVF:
 1732       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1733         return false; // 512bit vandps and vxorps are not available
 1734       }
 1735       break;
 1736     case Op_AbsVD:
 1737     case Op_NegVD:
 1738       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1739         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1740       }
 1741       break;
 1742     case Op_RotateRightV:
 1743     case Op_RotateLeftV:
 1744       if (bt != T_INT && bt != T_LONG) {
 1745         return false;
 1746       } // fallthrough
 1747     case Op_MacroLogicV:
 1748       if (!VM_Version::supports_evex() ||
 1749           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1750         return false;
 1751       }
 1752       break;
 1753     case Op_ClearArray:
 1754     case Op_VectorMaskGen:
 1755     case Op_VectorCmpMasked:
 1756       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1757         return false;
 1758       }
 1759       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1760         return false;
 1761       }
 1762       break;
 1763     case Op_LoadVectorMasked:
 1764     case Op_StoreVectorMasked:
 1765       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1766         return false;
 1767       }
 1768       break;
 1769     case Op_MaxV:
 1770     case Op_MinV:
 1771       if (UseSSE < 4 && is_integral_type(bt)) {
 1772         return false;
 1773       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for the AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) { // 512 bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
 1783       break;
 1784     case Op_CallLeafVector:
 1785       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1786         return false;
 1787       }
 1788       break;
 1789     case Op_AddReductionVI:
 1790       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1791         return false;
 1792       }
 1793       // fallthrough
 1794     case Op_AndReductionV:
 1795     case Op_OrReductionV:
 1796     case Op_XorReductionV:
 1797       if (is_subword_type(bt) && (UseSSE < 4)) {
 1798         return false;
 1799       }
 1800 #ifndef _LP64
 1801       if (bt == T_BYTE || bt == T_LONG) {
 1802         return false;
 1803       }
 1804 #endif
 1805       break;
 1806 #ifndef _LP64
 1807     case Op_VectorInsert:
 1808       if (bt == T_LONG || bt == T_DOUBLE) {
 1809         return false;
 1810       }
 1811       break;
 1812 #endif
 1813     case Op_MinReductionV:
 1814     case Op_MaxReductionV:
 1815       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1816         return false;
 1817       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1818         return false;
 1819       }
 1820       // Float/Double intrinsics enabled for AVX family.
 1821       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1822         return false;
 1823       }
 1824       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1825         return false;
 1826       }
 1827 #ifndef _LP64
 1828       if (bt == T_BYTE || bt == T_LONG) {
 1829         return false;
 1830       }
 1831 #endif
 1832       break;
 1833     case Op_VectorTest:
 1834       if (UseSSE < 4) {
 1835         return false; // Implementation limitation
 1836       } else if (size_in_bits < 32) {
 1837         return false; // Implementation limitation
 1838       }
 1839       break;
 1840     case Op_VectorLoadShuffle:
 1841     case Op_VectorRearrange:
      if (vlen == 2) {
 1843         return false; // Implementation limitation due to how shuffle is loaded
 1844       } else if (size_in_bits == 256 && UseAVX < 2) {
 1845         return false; // Implementation limitation
 1846       }
 1847       break;
 1848     case Op_VectorLoadMask:
 1849     case Op_VectorMaskCast:
 1850       if (size_in_bits == 256 && UseAVX < 2) {
 1851         return false; // Implementation limitation
 1852       }
 1853       // fallthrough
 1854     case Op_VectorStoreMask:
 1855       if (vlen == 2) {
 1856         return false; // Implementation limitation
 1857       }
 1858       break;
 1859     case Op_PopulateIndex:
 1860       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1861         return false;
 1862       }
 1863       break;
 1864     case Op_VectorCastB2X:
 1865     case Op_VectorCastS2X:
 1866     case Op_VectorCastI2X:
 1867       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_VectorCastL2X:
 1872       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1873         return false;
 1874       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1875         return false;
 1876       }
 1877       break;
 1878     case Op_VectorCastF2X: {
 1879         // As per JLS section 5.1.3 narrowing conversion to sub-word types
 1880         // happen after intermediate conversion to integer and special handling
 1881         // code needs AVX2 vpcmpeqd instruction for 256 bit vectors.
 1882         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1883         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1884           return false;
 1885         }
 1886       }
 1887       // fallthrough
 1888     case Op_VectorCastD2X:
 1889       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_VectorCastF2HF:
 1894     case Op_VectorCastHF2F:
 1895       if (!VM_Version::supports_f16c() &&
 1896          ((!VM_Version::supports_evex() ||
 1897          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1898         return false;
 1899       }
 1900       break;
 1901     case Op_RoundVD:
 1902       if (!VM_Version::supports_avx512dq()) {
 1903         return false;
 1904       }
 1905       break;
 1906     case Op_MulReductionVI:
 1907       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1908         return false;
 1909       }
 1910       break;
 1911     case Op_LoadVectorGatherMasked:
 1912     case Op_StoreVectorScatterMasked:
 1913     case Op_StoreVectorScatter:
 1914       if (is_subword_type(bt)) {
 1915         return false;
 1916       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1917         return false;
 1918       }
 1919       // fallthrough
 1920     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1922         return false;
 1923       }
 1924       break;
 1925     case Op_MaskAll:
 1926       if (!VM_Version::supports_evex()) {
 1927         return false;
 1928       }
 1929       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1930         return false;
 1931       }
 1932       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1933         return false;
 1934       }
 1935       break;
 1936     case Op_VectorMaskCmp:
 1937       if (vlen < 2 || size_in_bits < 32) {
 1938         return false;
 1939       }
 1940       break;
 1941     case Op_CompressM:
 1942       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1943         return false;
 1944       }
 1945       break;
 1946     case Op_CompressV:
 1947     case Op_ExpandV:
 1948       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1949         return false;
 1950       }
 1951       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 1952         return false;
 1953       }
      if (size_in_bits < 128) {
 1955         return false;
 1956       }
 1957     case Op_VectorLongToMask:
 1958       if (UseAVX < 1 || !is_LP64) {
 1959         return false;
 1960       }
 1961       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1962         return false;
 1963       }
 1964       break;
 1965     case Op_SignumVD:
 1966     case Op_SignumVF:
 1967       if (UseAVX < 1) {
 1968         return false;
 1969       }
 1970       break;
 1971     case Op_PopCountVI:
 1972     case Op_PopCountVL: {
 1973         if (!is_pop_count_instr_target(bt) &&
 1974             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1975           return false;
 1976         }
 1977       }
 1978       break;
 1979     case Op_ReverseV:
 1980     case Op_ReverseBytesV:
 1981       if (UseAVX < 2) {
 1982         return false;
 1983       }
 1984       break;
 1985     case Op_CountTrailingZerosV:
 1986     case Op_CountLeadingZerosV:
 1987       if (UseAVX < 2) {
 1988         return false;
 1989       }
 1990       break;
 1991   }
 1992   return true;  // Per default match rules are supported.
 1993 }
 1994 
 1995 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a
  // pattern based on the IR opcode.  Most unary/binary/ternary masked operations
  // share the IR node of their non-masked counterpart, with the mask edge being
  // the differentiator.  This routine therefore checks strictly for masked
  // operation patterns: it returns false by default for all opcodes except those
  // whose masked instruction patterns are defined in this file.
 2002   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2003     return false;
 2004   }
 2005 
 2006   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2007   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2008   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2009     return false;
 2010   }
 2011   switch(opcode) {
 2012     // Unary masked operations
 2013     case Op_AbsVB:
 2014     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
 2016         return false;  // Implementation limitation
 2017       }
 2018     case Op_AbsVI:
 2019     case Op_AbsVL:
 2020       return true;
 2021 
 2022     // Ternary masked operations
 2023     case Op_FmaVF:
 2024     case Op_FmaVD:
 2025       return true;
 2026 
 2027     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2029         return false;
 2030       }
 2031       return true;
 2032 
 2033     // Binary masked operations
 2034     case Op_AddVB:
 2035     case Op_AddVS:
 2036     case Op_SubVB:
 2037     case Op_SubVS:
 2038     case Op_MulVS:
 2039     case Op_LShiftVS:
 2040     case Op_RShiftVS:
 2041     case Op_URShiftVS:
 2042       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2043       if (!VM_Version::supports_avx512bw()) {
 2044         return false;  // Implementation limitation
 2045       }
 2046       return true;
 2047 
 2048     case Op_MulVL:
 2049       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2050       if (!VM_Version::supports_avx512dq()) {
 2051         return false;  // Implementation limitation
 2052       }
 2053       return true;
 2054 
 2055     case Op_AndV:
 2056     case Op_OrV:
 2057     case Op_XorV:
 2058     case Op_RotateRightV:
 2059     case Op_RotateLeftV:
 2060       if (bt != T_INT && bt != T_LONG) {
 2061         return false; // Implementation limitation
 2062       }
 2063       return true;
 2064 
 2065     case Op_VectorLoadMask:
 2066       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2067       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2068         return false;
 2069       }
 2070       return true;
 2071 
 2072     case Op_AddVI:
 2073     case Op_AddVL:
 2074     case Op_AddVF:
 2075     case Op_AddVD:
 2076     case Op_SubVI:
 2077     case Op_SubVL:
 2078     case Op_SubVF:
 2079     case Op_SubVD:
 2080     case Op_MulVI:
 2081     case Op_MulVF:
 2082     case Op_MulVD:
 2083     case Op_DivVF:
 2084     case Op_DivVD:
 2085     case Op_SqrtVF:
 2086     case Op_SqrtVD:
 2087     case Op_LShiftVI:
 2088     case Op_LShiftVL:
 2089     case Op_RShiftVI:
 2090     case Op_RShiftVL:
 2091     case Op_URShiftVI:
 2092     case Op_URShiftVL:
 2093     case Op_LoadVectorMasked:
 2094     case Op_StoreVectorMasked:
 2095     case Op_LoadVectorGatherMasked:
 2096     case Op_StoreVectorScatterMasked:
 2097       return true;
 2098 
 2099     case Op_MaxV:
 2100     case Op_MinV:
 2101       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2102         return false; // Implementation limitation
 2103       }
 2104       if (is_floating_point_type(bt)) {
 2105         return false; // Implementation limitation
 2106       }
 2107       return true;
 2108 
 2109     case Op_VectorMaskCmp:
 2110       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2111         return false; // Implementation limitation
 2112       }
 2113       return true;
 2114 
 2115     case Op_VectorRearrange:
 2116       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2117         return false; // Implementation limitation
 2118       }
 2119       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2120         return false; // Implementation limitation
 2121       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2122         return false; // Implementation limitation
 2123       }
 2124       return true;
 2125 
 2126     // Binary Logical operations
 2127     case Op_AndVMask:
 2128     case Op_OrVMask:
 2129     case Op_XorVMask:
 2130       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2131         return false; // Implementation limitation
 2132       }
 2133       return true;
 2134 
 2135     case Op_PopCountVI:
 2136     case Op_PopCountVL:
 2137       if (!is_pop_count_instr_target(bt)) {
 2138         return false;
 2139       }
 2140       return true;
 2141 
 2142     case Op_MaskAll:
 2143       return true;
 2144 
 2145     case Op_CountLeadingZerosV:
 2146       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2147         return true;
 2148       }
 2149     default:
 2150       return false;
 2151   }
 2152 }
 2153 
 2154 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2155   return false;
 2156 }
 2157 
 2158 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2159   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2160   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2161   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2162       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2163     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2164     return new legVecZOper();
 2165   }
 2166   if (legacy) {
 2167     switch (ideal_reg) {
 2168       case Op_VecS: return new legVecSOper();
 2169       case Op_VecD: return new legVecDOper();
 2170       case Op_VecX: return new legVecXOper();
 2171       case Op_VecY: return new legVecYOper();
 2172       case Op_VecZ: return new legVecZOper();
 2173     }
 2174   } else {
 2175     switch (ideal_reg) {
 2176       case Op_VecS: return new vecSOper();
 2177       case Op_VecD: return new vecDOper();
 2178       case Op_VecX: return new vecXOper();
 2179       case Op_VecY: return new vecYOper();
 2180       case Op_VecZ: return new vecZOper();
 2181     }
 2182   }
 2183   ShouldNotReachHere();
 2184   return nullptr;
 2185 }
 2186 
 2187 bool Matcher::is_reg2reg_move(MachNode* m) {
 2188   switch (m->rule()) {
 2189     case MoveVec2Leg_rule:
 2190     case MoveLeg2Vec_rule:
 2191     case MoveF2VL_rule:
 2192     case MoveF2LEG_rule:
 2193     case MoveVL2F_rule:
 2194     case MoveLEG2F_rule:
 2195     case MoveD2VL_rule:
 2196     case MoveD2LEG_rule:
 2197     case MoveVL2D_rule:
 2198     case MoveLEG2D_rule:
 2199       return true;
 2200     default:
 2201       return false;
 2202   }
 2203 }
 2204 
 2205 bool Matcher::is_generic_vector(MachOper* opnd) {
 2206   switch (opnd->opcode()) {
 2207     case VEC:
 2208     case LEGVEC:
 2209       return true;
 2210     default:
 2211       return false;
 2212   }
 2213 }
 2214 
 2215 //------------------------------------------------------------------------
 2216 
 2217 const RegMask* Matcher::predicate_reg_mask(void) {
 2218   return &_VECTMASK_REG_mask;
 2219 }
 2220 
 2221 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2222   return new TypeVectMask(elemTy, length);
 2223 }
 2224 
 2225 // Max vector size in bytes. 0 if not supported.
 2226 int Matcher::vector_width_in_bytes(BasicType bt) {
 2227   assert(is_java_primitive(bt), "only primitive type vectors");
 2228   if (UseSSE < 2) return 0;
 2229   // SSE2 supports 128bit vectors for all types.
 2230   // AVX2 supports 256bit vectors for all types.
  // AVX-512 (EVEX) supports 512bit vectors for all types (subword types also need AVX512BW, see below).
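  // For example, UseAVX == 2 yields a base size of (1 << 2) * 8 = 32 bytes below
  // and UseAVX == 3 yields 64 bytes, both still capped by MaxVectorSize.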
 2232   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2233   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2234   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2235     size = (UseAVX > 2) ? 64 : 32;
 2236   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2237     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2238   // Use flag to limit vector size.
 2239   size = MIN2(size,(int)MaxVectorSize);
 2240   // Minimum 2 values in vector (or 4 for bytes).
 2241   switch (bt) {
 2242   case T_DOUBLE:
 2243   case T_LONG:
 2244     if (size < 16) return 0;
 2245     break;
 2246   case T_FLOAT:
 2247   case T_INT:
 2248     if (size < 8) return 0;
 2249     break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
 2262   default:
 2263     ShouldNotReachHere();
 2264   }
 2265   return size;
 2266 }
 2267 
 2268 // Limits on vector size (number of elements) loaded into vector.
 2269 int Matcher::max_vector_size(const BasicType bt) {
 2270   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2271 }
 2272 int Matcher::min_vector_size(const BasicType bt) {
 2273   int max_size = max_vector_size(bt);
  // Min number of elements which can be loaded into a vector: 4 for
  // byte-sized elements, otherwise 2 (i.e. at least 4 bytes).
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow 1-element double vectors to support calls into SVML double64 routines.
  if (bt == T_DOUBLE) {
    size = 1;
 2279   }
 2280   return MIN2(size,max_size);
 2281 }
 2282 
 2283 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2284   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2285   // by default on Cascade Lake
 2286   if (VM_Version::is_default_intel_cascade_lake()) {
 2287     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2288   }
 2289   return Matcher::max_vector_size(bt);
 2290 }
 2291 
 2292 int Matcher::scalable_vector_reg_size(const BasicType bt) {
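  // x86 has no scalable vector registers (in the SVE/RVV sense), so there is
  // no scalable register size to report.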
 2293   return -1;
 2294 }
 2295 
 2296 // Vector ideal reg corresponding to specified size in bytes
 2297 uint Matcher::vector_ideal_reg(int size) {
 2298   assert(MaxVectorSize >= size, "");
 2299   switch(size) {
 2300     case  4: return Op_VecS;
 2301     case  8: return Op_VecD;
 2302     case 16: return Op_VecX;
 2303     case 32: return Op_VecY;
 2304     case 64: return Op_VecZ;
 2305   }
 2306   ShouldNotReachHere();
 2307   return 0;
 2308 }
 2309 
 2310 // Check for shift by small constant as well
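// (A left shift by at most 3 can be folded into the scale factor of an x86
// addressing mode, i.e. scale 1, 2, 4 or 8.)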
 2311 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2312   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2313       shift->in(2)->get_int() <= 3 &&
 2314       // Are there other uses besides address expressions?
 2315       !matcher->is_visited(shift)) {
 2316     address_visited.set(shift->_idx); // Flag as address_visited
 2317     mstack.push(shift->in(2), Matcher::Visit);
 2318     Node *conv = shift->in(1);
 2319 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is known to be non-negative.
 2323     if (conv->Opcode() == Op_ConvI2L &&
 2324         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2325         // Are there other uses besides address expressions?
 2326         !matcher->is_visited(conv)) {
 2327       address_visited.set(conv->_idx); // Flag as address_visited
 2328       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2329     } else
 2330 #endif
 2331       mstack.push(conv, Matcher::Pre_Visit);
 2332     return true;
 2333   }
 2334   return false;
 2335 }
 2336 
// The code below identifies sub-graphs in which a 'load' node is an
// input to two different nodes, such that the sub-graph can be matched
// with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2341 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2342 // refers to the same node.
 2343 //
 2344 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2345 // This is a temporary solution until we make DAGs expressible in ADL.
 2346 template<typename ConType>
 2347 class FusedPatternMatcher {
 2348   Node* _op1_node;
 2349   Node* _mop_node;
 2350   int _con_op;
 2351 
 2352   static int match_next(Node* n, int next_op, int next_op_idx) {
 2353     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2354       return -1;
 2355     }
 2356 
 2357     if (next_op_idx == -1) { // n is commutative, try rotations
 2358       if (n->in(1)->Opcode() == next_op) {
 2359         return 1;
 2360       } else if (n->in(2)->Opcode() == next_op) {
 2361         return 2;
 2362       }
 2363     } else {
 2364       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2365       if (n->in(next_op_idx)->Opcode() == next_op) {
 2366         return next_op_idx;
 2367       }
 2368     }
 2369     return -1;
 2370   }
 2371 
 2372  public:
 2373   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2374     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2375 
 2376   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2377              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2378              typename ConType::NativeType con_value) {
 2379     if (_op1_node->Opcode() != op1) {
 2380       return false;
 2381     }
 2382     if (_mop_node->outcnt() > 2) {
 2383       return false;
 2384     }
 2385     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2386     if (op1_op2_idx == -1) {
 2387       return false;
 2388     }
 2389     // Memory operation must be the other edge
 2390     int op1_mop_idx = (op1_op2_idx & 1) + 1;
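    // ((idx & 1) + 1) maps edge index 1 -> 2 and 2 -> 1, i.e. it selects the other input.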
 2391 
 2392     // Check that the mop node is really what we want
 2393     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2394       Node* op2_node = _op1_node->in(op1_op2_idx);
 2395       if (op2_node->outcnt() > 1) {
 2396         return false;
 2397       }
 2398       assert(op2_node->Opcode() == op2, "Should be");
 2399       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2400       if (op2_con_idx == -1) {
 2401         return false;
 2402       }
 2403       // Memory operation must be the other edge
 2404       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2405       // Check that the memory operation is the same node
 2406       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2407         // Now check the constant
 2408         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2409         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2410           return true;
 2411         }
 2412       }
 2413     }
 2414     return false;
 2415   }
 2416 };
 2417 
 2418 static bool is_bmi_pattern(Node* n, Node* m) {
 2419   assert(UseBMI1Instructions, "sanity");
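  // For reference, a sketch of the scalar identities matched below (standard BMI1 semantics):
  //   x & -x      -> blsi   (isolate lowest set bit)
  //   x & (x - 1) -> blsr   (reset lowest set bit)
  //   x ^ (x - 1) -> blsmsk (mask up to and including lowest set bit)
  // where x is the value produced by the shared LoadI/LoadL node.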
 2420   if (n != nullptr && m != nullptr) {
 2421     if (m->Opcode() == Op_LoadI) {
 2422       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2423       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2424              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2425              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2426     } else if (m->Opcode() == Op_LoadL) {
 2427       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2428       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2429              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2430              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2431     }
 2432   }
 2433   return false;
 2434 }
 2435 
 2436 // Should the matcher clone input 'm' of node 'n'?
 2437 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2438   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2439   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2440     mstack.push(m, Visit);
 2441     return true;
 2442   }
 2443   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2444     mstack.push(m, Visit);           // m = ShiftCntV
 2445     return true;
 2446   }
 2447   return false;
 2448 }
 2449 
 2450 // Should the Matcher clone shifts on addressing modes, expecting them
 2451 // to be subsumed into complex addressing expressions or compute them
 2452 // into registers?
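// For example, an address expression such as base + (index << 2) + 16 can be
// subsumed into a single x86 memory operand of the form [base + index*4 + 16].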
 2453 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2454   Node *off = m->in(AddPNode::Offset);
 2455   if (off->is_Con()) {
 2456     address_visited.test_set(m->_idx); // Flag as address_visited
 2457     Node *adr = m->in(AddPNode::Address);
 2458 
 2459     // Intel can handle 2 adds in addressing mode
 2460     // AtomicAdd is not an addressing expression.
 2461     // Cheap to find it by looking for screwy base.
 2462     if (adr->is_AddP() &&
 2463         !adr->in(AddPNode::Base)->is_top() &&
 2464         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2465         // Are there other uses besides address expressions?
 2466         !is_visited(adr)) {
 2467       address_visited.set(adr->_idx); // Flag as address_visited
 2468       Node *shift = adr->in(AddPNode::Offset);
 2469       if (!clone_shift(shift, this, mstack, address_visited)) {
 2470         mstack.push(shift, Pre_Visit);
 2471       }
 2472       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2473       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2474     } else {
 2475       mstack.push(adr, Pre_Visit);
 2476     }
 2477 
 2478     // Clone X+offset as it also folds into most addressing expressions
 2479     mstack.push(off, Visit);
 2480     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2481     return true;
 2482   } else if (clone_shift(off, this, mstack, address_visited)) {
 2483     address_visited.test_set(m->_idx); // Flag as address_visited
 2484     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2485     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2486     return true;
 2487   }
 2488   return false;
 2489 }
 2490 
 2491 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
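  // Note: Assembler::nlt ("not less than") corresponds to >= and Assembler::nle
  // ("not less or equal") to >, hence the ge/uge and gt/ugt mappings below.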
 2492   switch (bt) {
 2493     case BoolTest::eq:
 2494       return Assembler::eq;
 2495     case BoolTest::ne:
 2496       return Assembler::neq;
 2497     case BoolTest::le:
 2498     case BoolTest::ule:
 2499       return Assembler::le;
 2500     case BoolTest::ge:
 2501     case BoolTest::uge:
 2502       return Assembler::nlt;
 2503     case BoolTest::lt:
 2504     case BoolTest::ult:
 2505       return Assembler::lt;
 2506     case BoolTest::gt:
 2507     case BoolTest::ugt:
 2508       return Assembler::nle;
 2509     default : ShouldNotReachHere(); return Assembler::_false;
 2510   }
 2511 }
 2512 
 2513 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2514   switch (bt) {
 2515   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2516   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2517   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2518   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2519   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2520   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2521   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2522   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2523   }
 2524 }
 2525 
 2526 // Helper methods for MachSpillCopyNode::implementation().
 2527 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2528                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2529   assert(ireg == Op_VecS || // 32bit vector
 2530          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2531           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2532          "no non-adjacent vector moves" );
 2533   if (cbuf) {
 2534     C2_MacroAssembler _masm(cbuf);
 2535     switch (ireg) {
 2536     case Op_VecS: // copy whole register
 2537     case Op_VecD:
 2538     case Op_VecX:
 2539 #ifndef _LP64
 2540       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2541 #else
 2542       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2543         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2544       } else {
 2545         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2546      }
 2547 #endif
 2548       break;
 2549     case Op_VecY:
 2550 #ifndef _LP64
 2551       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2552 #else
 2553       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2554         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2555       } else {
 2556         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2557      }
 2558 #endif
 2559       break;
 2560     case Op_VecZ:
 2561       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2562       break;
 2563     default:
 2564       ShouldNotReachHere();
 2565     }
 2566 #ifndef PRODUCT
 2567   } else {
 2568     switch (ireg) {
 2569     case Op_VecS:
 2570     case Op_VecD:
 2571     case Op_VecX:
 2572       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2573       break;
 2574     case Op_VecY:
 2575     case Op_VecZ:
 2576       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2577       break;
 2578     default:
 2579       ShouldNotReachHere();
 2580     }
 2581 #endif
 2582   }
 2583 }
 2584 
 2585 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2586                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2587   if (cbuf) {
 2588     C2_MacroAssembler _masm(cbuf);
 2589     if (is_load) {
 2590       switch (ireg) {
 2591       case Op_VecS:
 2592         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2593         break;
 2594       case Op_VecD:
 2595         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2596         break;
 2597       case Op_VecX:
 2598 #ifndef _LP64
 2599         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2600 #else
 2601         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2602           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2603         } else {
 2604           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2606         }
 2607 #endif
 2608         break;
 2609       case Op_VecY:
 2610 #ifndef _LP64
 2611         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2612 #else
 2613         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2614           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2615         } else {
 2616           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2618         }
 2619 #endif
 2620         break;
 2621       case Op_VecZ:
 2622         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2623         break;
 2624       default:
 2625         ShouldNotReachHere();
 2626       }
 2627     } else { // store
 2628       switch (ireg) {
 2629       case Op_VecS:
 2630         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2631         break;
 2632       case Op_VecD:
 2633         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2634         break;
 2635       case Op_VecX:
 2636 #ifndef _LP64
 2637         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2638 #else
 2639         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2640           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2641         }
 2642         else {
 2643           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2644         }
 2645 #endif
 2646         break;
 2647       case Op_VecY:
 2648 #ifndef _LP64
 2649         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2650 #else
 2651         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2652           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2653         }
 2654         else {
 2655           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2656         }
 2657 #endif
 2658         break;
 2659       case Op_VecZ:
 2660         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2661         break;
 2662       default:
 2663         ShouldNotReachHere();
 2664       }
 2665     }
 2666 #ifndef PRODUCT
 2667   } else {
 2668     if (is_load) {
 2669       switch (ireg) {
 2670       case Op_VecS:
 2671         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2672         break;
 2673       case Op_VecD:
 2674         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2675         break;
      case Op_VecX:
 2677         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2678         break;
 2679       case Op_VecY:
 2680       case Op_VecZ:
 2681         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2682         break;
 2683       default:
 2684         ShouldNotReachHere();
 2685       }
 2686     } else { // store
 2687       switch (ireg) {
 2688       case Op_VecS:
 2689         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2690         break;
 2691       case Op_VecD:
 2692         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2693         break;
      case Op_VecX:
 2695         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2696         break;
 2697       case Op_VecY:
 2698       case Op_VecZ:
 2699         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2700         break;
 2701       default:
 2702         ShouldNotReachHere();
 2703       }
 2704     }
 2705 #endif
 2706   }
 2707 }
 2708 
 2709 template <class T>
 2710 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
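  // Builds the payload for a replicated vector constant: 'con' is converted to the
  // requested element type and appended 'len' times. For illustration, a hypothetical
  // call vreplicate_imm<jint>(T_INT, 7, 4) would yield {7, 7, 7, 7}.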
 2711   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2712   jvalue ele;
 2713   switch (bt) {
 2714     case T_BYTE:   ele.b = con; break;
 2715     case T_SHORT:  ele.s = con; break;
 2716     case T_INT:    ele.i = con; break;
 2717     case T_LONG:   ele.j = con; break;
 2718     case T_FLOAT:  ele.f = con; break;
 2719     case T_DOUBLE: ele.d = con; break;
 2720     default: ShouldNotReachHere();
 2721   }
 2722   for (int i = 0; i < len; i++) {
 2723     val->append(ele);
 2724   }
 2725   return val;
 2726 }
 2727 
 2728 static inline jlong high_bit_set(BasicType bt) {
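  // Returns a 64-bit pattern with the sign (high) bit of every lane of the given
  // element type set, e.g. bit 7 of every byte for T_BYTE.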
 2729   switch (bt) {
 2730     case T_BYTE:  return 0x8080808080808080;
 2731     case T_SHORT: return 0x8000800080008000;
 2732     case T_INT:   return 0x8000000080000000;
 2733     case T_LONG:  return 0x8000000000000000;
 2734     default:
 2735       ShouldNotReachHere();
 2736       return 0;
 2737   }
 2738 }
 2739 
 2740 #ifndef PRODUCT
 2741   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2742     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2743   }
 2744 #endif
 2745 
 2746   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2747     C2_MacroAssembler _masm(&cbuf);
 2748     __ nop(_count);
 2749   }
 2750 
 2751   uint MachNopNode::size(PhaseRegAlloc*) const {
 2752     return _count;
 2753   }
 2754 
 2755 #ifndef PRODUCT
 2756   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2757     st->print("# breakpoint");
 2758   }
 2759 #endif
 2760 
 2761   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2762     C2_MacroAssembler _masm(&cbuf);
 2763     __ int3();
 2764   }
 2765 
 2766   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2767     return MachNode::size(ra_);
 2768   }
 2769 
 2770 %}
 2771 
 2772 encode %{
 2773 
 2774   enc_class call_epilog %{
 2775     C2_MacroAssembler _masm(&cbuf);
 2776     if (VerifyStackAtCalls) {
 2777       // Check that stack depth is unchanged: find majik cookie on stack
 2778       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2779       Label L;
 2780       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2781       __ jccb(Assembler::equal, L);
 2782       // Die if stack mismatch
 2783       __ int3();
 2784       __ bind(L);
 2785     }
 2786     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2787       C2_MacroAssembler _masm(&cbuf);
 2788       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
      // Search for the corresponding projection, get the register and emit code that initializes it.
 2790       uint con = (tf()->range_cc()->cnt() - 1);
 2791       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2792         ProjNode* proj = fast_out(i)->as_Proj();
 2793         if (proj->_con == con) {
 2794           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2795           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2796           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2797           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2798           __ testq(rax, rax);
 2799           __ setb(Assembler::notZero, toReg);
 2800           __ movzbl(toReg, toReg);
 2801           if (reg->is_stack()) {
 2802             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2803             __ movq(Address(rsp, st_off), toReg);
 2804           }
 2805           break;
 2806         }
 2807       }
 2808       if (return_value_is_used()) {
 2809         // An inline type is returned as fields in multiple registers.
        // rax either contains an oop (if the inline type is buffered) or a pointer
        // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
        // if the lowest bit is set, to allow C2 to use the oop after null checking.
 2813         // rax &= (rax & 1) - 1
 2814         __ movptr(rscratch1, rax);
 2815         __ andptr(rscratch1, 0x1);
 2816         __ subptr(rscratch1, 0x1);
 2817         __ andptr(rax, rscratch1);
 2818       }
 2819     }
 2820   %}
 2821 
 2822 %}
 2823 
// Operands for bound floating point register arguments
 2825 operand rxmm0() %{
 2826   constraint(ALLOC_IN_RC(xmm0_reg));
 2827   match(VecX);
 2828   format%{%}
 2829   interface(REG_INTER);
 2830 %}
 2831 
 2832 //----------OPERANDS-----------------------------------------------------------
 2833 // Operand definitions must precede instruction definitions for correct parsing
 2834 // in the ADLC because operands constitute user defined types which are used in
 2835 // instruction definitions.
 2836 
 2837 // Vectors
 2838 
 2839 // Dummy generic vector class. Should be used for all vector operands.
 2840 // Replaced with vec[SDXYZ] during post-selection pass.
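// (This replacement is performed by Matcher::pd_specialize_generic_vector_operand,
// defined earlier in this file.)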
 2841 operand vec() %{
 2842   constraint(ALLOC_IN_RC(dynamic));
 2843   match(VecX);
 2844   match(VecY);
 2845   match(VecZ);
 2846   match(VecS);
 2847   match(VecD);
 2848 
 2849   format %{ %}
 2850   interface(REG_INTER);
 2851 %}
 2852 
 2853 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2854 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2855 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2856 // runtime code generation via reg_class_dynamic.
 2857 operand legVec() %{
 2858   constraint(ALLOC_IN_RC(dynamic));
 2859   match(VecX);
 2860   match(VecY);
 2861   match(VecZ);
 2862   match(VecS);
 2863   match(VecD);
 2864 
 2865   format %{ %}
 2866   interface(REG_INTER);
 2867 %}
 2868 
 2869 // Replaces vec during post-selection cleanup. See above.
 2870 operand vecS() %{
 2871   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2872   match(VecS);
 2873 
 2874   format %{ %}
 2875   interface(REG_INTER);
 2876 %}
 2877 
 2878 // Replaces legVec during post-selection cleanup. See above.
 2879 operand legVecS() %{
 2880   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2881   match(VecS);
 2882 
 2883   format %{ %}
 2884   interface(REG_INTER);
 2885 %}
 2886 
 2887 // Replaces vec during post-selection cleanup. See above.
 2888 operand vecD() %{
 2889   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2890   match(VecD);
 2891 
 2892   format %{ %}
 2893   interface(REG_INTER);
 2894 %}
 2895 
 2896 // Replaces legVec during post-selection cleanup. See above.
 2897 operand legVecD() %{
 2898   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2899   match(VecD);
 2900 
 2901   format %{ %}
 2902   interface(REG_INTER);
 2903 %}
 2904 
 2905 // Replaces vec during post-selection cleanup. See above.
 2906 operand vecX() %{
 2907   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2908   match(VecX);
 2909 
 2910   format %{ %}
 2911   interface(REG_INTER);
 2912 %}
 2913 
 2914 // Replaces legVec during post-selection cleanup. See above.
 2915 operand legVecX() %{
 2916   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2917   match(VecX);
 2918 
 2919   format %{ %}
 2920   interface(REG_INTER);
 2921 %}
 2922 
 2923 // Replaces vec during post-selection cleanup. See above.
 2924 operand vecY() %{
 2925   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2926   match(VecY);
 2927 
 2928   format %{ %}
 2929   interface(REG_INTER);
 2930 %}
 2931 
 2932 // Replaces legVec during post-selection cleanup. See above.
 2933 operand legVecY() %{
 2934   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2935   match(VecY);
 2936 
 2937   format %{ %}
 2938   interface(REG_INTER);
 2939 %}
 2940 
 2941 // Replaces vec during post-selection cleanup. See above.
 2942 operand vecZ() %{
 2943   constraint(ALLOC_IN_RC(vectorz_reg));
 2944   match(VecZ);
 2945 
 2946   format %{ %}
 2947   interface(REG_INTER);
 2948 %}
 2949 
 2950 // Replaces legVec during post-selection cleanup. See above.
 2951 operand legVecZ() %{
 2952   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2953   match(VecZ);
 2954 
 2955   format %{ %}
 2956   interface(REG_INTER);
 2957 %}
 2958 
 2959 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2960 
 2961 // ============================================================================
 2962 
 2963 instruct ShouldNotReachHere() %{
 2964   match(Halt);
 2965   format %{ "stop\t# ShouldNotReachHere" %}
 2966   ins_encode %{
 2967     if (is_reachable()) {
 2968       __ stop(_halt_reason);
 2969     }
 2970   %}
 2971   ins_pipe(pipe_slow);
 2972 %}
 2973 
 2974 // ============================================================================
 2975 
 2976 instruct addF_reg(regF dst, regF src) %{
 2977   predicate((UseSSE>=1) && (UseAVX == 0));
 2978   match(Set dst (AddF dst src));
 2979 
 2980   format %{ "addss   $dst, $src" %}
 2981   ins_cost(150);
 2982   ins_encode %{
 2983     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2984   %}
 2985   ins_pipe(pipe_slow);
 2986 %}
 2987 
 2988 instruct addF_mem(regF dst, memory src) %{
 2989   predicate((UseSSE>=1) && (UseAVX == 0));
 2990   match(Set dst (AddF dst (LoadF src)));
 2991 
 2992   format %{ "addss   $dst, $src" %}
 2993   ins_cost(150);
 2994   ins_encode %{
 2995     __ addss($dst$$XMMRegister, $src$$Address);
 2996   %}
 2997   ins_pipe(pipe_slow);
 2998 %}
 2999 
 3000 instruct addF_imm(regF dst, immF con) %{
 3001   predicate((UseSSE>=1) && (UseAVX == 0));
 3002   match(Set dst (AddF dst con));
 3003   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3004   ins_cost(150);
 3005   ins_encode %{
 3006     __ addss($dst$$XMMRegister, $constantaddress($con));
 3007   %}
 3008   ins_pipe(pipe_slow);
 3009 %}
 3010 
 3011 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3012   predicate(UseAVX > 0);
 3013   match(Set dst (AddF src1 src2));
 3014 
 3015   format %{ "vaddss  $dst, $src1, $src2" %}
 3016   ins_cost(150);
 3017   ins_encode %{
 3018     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3019   %}
 3020   ins_pipe(pipe_slow);
 3021 %}
 3022 
 3023 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3024   predicate(UseAVX > 0);
 3025   match(Set dst (AddF src1 (LoadF src2)));
 3026 
 3027   format %{ "vaddss  $dst, $src1, $src2" %}
 3028   ins_cost(150);
 3029   ins_encode %{
 3030     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3031   %}
 3032   ins_pipe(pipe_slow);
 3033 %}
 3034 
 3035 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3036   predicate(UseAVX > 0);
 3037   match(Set dst (AddF src con));
 3038 
 3039   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3040   ins_cost(150);
 3041   ins_encode %{
 3042     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3043   %}
 3044   ins_pipe(pipe_slow);
 3045 %}
 3046 
 3047 instruct addD_reg(regD dst, regD src) %{
 3048   predicate((UseSSE>=2) && (UseAVX == 0));
 3049   match(Set dst (AddD dst src));
 3050 
 3051   format %{ "addsd   $dst, $src" %}
 3052   ins_cost(150);
 3053   ins_encode %{
 3054     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3055   %}
 3056   ins_pipe(pipe_slow);
 3057 %}
 3058 
 3059 instruct addD_mem(regD dst, memory src) %{
 3060   predicate((UseSSE>=2) && (UseAVX == 0));
 3061   match(Set dst (AddD dst (LoadD src)));
 3062 
 3063   format %{ "addsd   $dst, $src" %}
 3064   ins_cost(150);
 3065   ins_encode %{
 3066     __ addsd($dst$$XMMRegister, $src$$Address);
 3067   %}
 3068   ins_pipe(pipe_slow);
 3069 %}
 3070 
 3071 instruct addD_imm(regD dst, immD con) %{
 3072   predicate((UseSSE>=2) && (UseAVX == 0));
 3073   match(Set dst (AddD dst con));
 3074   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3075   ins_cost(150);
 3076   ins_encode %{
 3077     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3078   %}
 3079   ins_pipe(pipe_slow);
 3080 %}
 3081 
 3082 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3083   predicate(UseAVX > 0);
 3084   match(Set dst (AddD src1 src2));
 3085 
 3086   format %{ "vaddsd  $dst, $src1, $src2" %}
 3087   ins_cost(150);
 3088   ins_encode %{
 3089     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3090   %}
 3091   ins_pipe(pipe_slow);
 3092 %}
 3093 
 3094 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3095   predicate(UseAVX > 0);
 3096   match(Set dst (AddD src1 (LoadD src2)));
 3097 
 3098   format %{ "vaddsd  $dst, $src1, $src2" %}
 3099   ins_cost(150);
 3100   ins_encode %{
 3101     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3102   %}
 3103   ins_pipe(pipe_slow);
 3104 %}
 3105 
 3106 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3107   predicate(UseAVX > 0);
 3108   match(Set dst (AddD src con));
 3109 
 3110   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3111   ins_cost(150);
 3112   ins_encode %{
 3113     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3114   %}
 3115   ins_pipe(pipe_slow);
 3116 %}
 3117 
 3118 instruct subF_reg(regF dst, regF src) %{
 3119   predicate((UseSSE>=1) && (UseAVX == 0));
 3120   match(Set dst (SubF dst src));
 3121 
 3122   format %{ "subss   $dst, $src" %}
 3123   ins_cost(150);
 3124   ins_encode %{
 3125     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3126   %}
 3127   ins_pipe(pipe_slow);
 3128 %}
 3129 
 3130 instruct subF_mem(regF dst, memory src) %{
 3131   predicate((UseSSE>=1) && (UseAVX == 0));
 3132   match(Set dst (SubF dst (LoadF src)));
 3133 
 3134   format %{ "subss   $dst, $src" %}
 3135   ins_cost(150);
 3136   ins_encode %{
 3137     __ subss($dst$$XMMRegister, $src$$Address);
 3138   %}
 3139   ins_pipe(pipe_slow);
 3140 %}
 3141 
 3142 instruct subF_imm(regF dst, immF con) %{
 3143   predicate((UseSSE>=1) && (UseAVX == 0));
 3144   match(Set dst (SubF dst con));
 3145   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3146   ins_cost(150);
 3147   ins_encode %{
 3148     __ subss($dst$$XMMRegister, $constantaddress($con));
 3149   %}
 3150   ins_pipe(pipe_slow);
 3151 %}
 3152 
 3153 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3154   predicate(UseAVX > 0);
 3155   match(Set dst (SubF src1 src2));
 3156 
 3157   format %{ "vsubss  $dst, $src1, $src2" %}
 3158   ins_cost(150);
 3159   ins_encode %{
 3160     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3161   %}
 3162   ins_pipe(pipe_slow);
 3163 %}
 3164 
 3165 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3166   predicate(UseAVX > 0);
 3167   match(Set dst (SubF src1 (LoadF src2)));
 3168 
 3169   format %{ "vsubss  $dst, $src1, $src2" %}
 3170   ins_cost(150);
 3171   ins_encode %{
 3172     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3173   %}
 3174   ins_pipe(pipe_slow);
 3175 %}
 3176 
 3177 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3178   predicate(UseAVX > 0);
 3179   match(Set dst (SubF src con));
 3180 
 3181   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3182   ins_cost(150);
 3183   ins_encode %{
 3184     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3185   %}
 3186   ins_pipe(pipe_slow);
 3187 %}
 3188 
 3189 instruct subD_reg(regD dst, regD src) %{
 3190   predicate((UseSSE>=2) && (UseAVX == 0));
 3191   match(Set dst (SubD dst src));
 3192 
 3193   format %{ "subsd   $dst, $src" %}
 3194   ins_cost(150);
 3195   ins_encode %{
 3196     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3197   %}
 3198   ins_pipe(pipe_slow);
 3199 %}
 3200 
 3201 instruct subD_mem(regD dst, memory src) %{
 3202   predicate((UseSSE>=2) && (UseAVX == 0));
 3203   match(Set dst (SubD dst (LoadD src)));
 3204 
 3205   format %{ "subsd   $dst, $src" %}
 3206   ins_cost(150);
 3207   ins_encode %{
 3208     __ subsd($dst$$XMMRegister, $src$$Address);
 3209   %}
 3210   ins_pipe(pipe_slow);
 3211 %}
 3212 
 3213 instruct subD_imm(regD dst, immD con) %{
 3214   predicate((UseSSE>=2) && (UseAVX == 0));
 3215   match(Set dst (SubD dst con));
 3216   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3217   ins_cost(150);
 3218   ins_encode %{
 3219     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3220   %}
 3221   ins_pipe(pipe_slow);
 3222 %}
 3223 
 3224 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3225   predicate(UseAVX > 0);
 3226   match(Set dst (SubD src1 src2));
 3227 
 3228   format %{ "vsubsd  $dst, $src1, $src2" %}
 3229   ins_cost(150);
 3230   ins_encode %{
 3231     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3232   %}
 3233   ins_pipe(pipe_slow);
 3234 %}
 3235 
 3236 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3237   predicate(UseAVX > 0);
 3238   match(Set dst (SubD src1 (LoadD src2)));
 3239 
 3240   format %{ "vsubsd  $dst, $src1, $src2" %}
 3241   ins_cost(150);
 3242   ins_encode %{
 3243     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3244   %}
 3245   ins_pipe(pipe_slow);
 3246 %}
 3247 
 3248 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3249   predicate(UseAVX > 0);
 3250   match(Set dst (SubD src con));
 3251 
 3252   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3253   ins_cost(150);
 3254   ins_encode %{
 3255     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3256   %}
 3257   ins_pipe(pipe_slow);
 3258 %}
 3259 
 3260 instruct mulF_reg(regF dst, regF src) %{
 3261   predicate((UseSSE>=1) && (UseAVX == 0));
 3262   match(Set dst (MulF dst src));
 3263 
 3264   format %{ "mulss   $dst, $src" %}
 3265   ins_cost(150);
 3266   ins_encode %{
 3267     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3268   %}
 3269   ins_pipe(pipe_slow);
 3270 %}
 3271 
 3272 instruct mulF_mem(regF dst, memory src) %{
 3273   predicate((UseSSE>=1) && (UseAVX == 0));
 3274   match(Set dst (MulF dst (LoadF src)));
 3275 
 3276   format %{ "mulss   $dst, $src" %}
 3277   ins_cost(150);
 3278   ins_encode %{
 3279     __ mulss($dst$$XMMRegister, $src$$Address);
 3280   %}
 3281   ins_pipe(pipe_slow);
 3282 %}
 3283 
 3284 instruct mulF_imm(regF dst, immF con) %{
 3285   predicate((UseSSE>=1) && (UseAVX == 0));
 3286   match(Set dst (MulF dst con));
 3287   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3288   ins_cost(150);
 3289   ins_encode %{
 3290     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3291   %}
 3292   ins_pipe(pipe_slow);
 3293 %}
 3294 
 3295 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3296   predicate(UseAVX > 0);
 3297   match(Set dst (MulF src1 src2));
 3298 
 3299   format %{ "vmulss  $dst, $src1, $src2" %}
 3300   ins_cost(150);
 3301   ins_encode %{
 3302     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3303   %}
 3304   ins_pipe(pipe_slow);
 3305 %}
 3306 
 3307 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3308   predicate(UseAVX > 0);
 3309   match(Set dst (MulF src1 (LoadF src2)));
 3310 
 3311   format %{ "vmulss  $dst, $src1, $src2" %}
 3312   ins_cost(150);
 3313   ins_encode %{
 3314     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3315   %}
 3316   ins_pipe(pipe_slow);
 3317 %}
 3318 
 3319 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3320   predicate(UseAVX > 0);
 3321   match(Set dst (MulF src con));
 3322 
 3323   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3324   ins_cost(150);
 3325   ins_encode %{
 3326     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3327   %}
 3328   ins_pipe(pipe_slow);
 3329 %}
 3330 
 3331 instruct mulD_reg(regD dst, regD src) %{
 3332   predicate((UseSSE>=2) && (UseAVX == 0));
 3333   match(Set dst (MulD dst src));
 3334 
 3335   format %{ "mulsd   $dst, $src" %}
 3336   ins_cost(150);
 3337   ins_encode %{
 3338     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3339   %}
 3340   ins_pipe(pipe_slow);
 3341 %}
 3342 
 3343 instruct mulD_mem(regD dst, memory src) %{
 3344   predicate((UseSSE>=2) && (UseAVX == 0));
 3345   match(Set dst (MulD dst (LoadD src)));
 3346 
 3347   format %{ "mulsd   $dst, $src" %}
 3348   ins_cost(150);
 3349   ins_encode %{
 3350     __ mulsd($dst$$XMMRegister, $src$$Address);
 3351   %}
 3352   ins_pipe(pipe_slow);
 3353 %}
 3354 
 3355 instruct mulD_imm(regD dst, immD con) %{
 3356   predicate((UseSSE>=2) && (UseAVX == 0));
 3357   match(Set dst (MulD dst con));
 3358   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3359   ins_cost(150);
 3360   ins_encode %{
 3361     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3362   %}
 3363   ins_pipe(pipe_slow);
 3364 %}
 3365 
 3366 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3367   predicate(UseAVX > 0);
 3368   match(Set dst (MulD src1 src2));
 3369 
 3370   format %{ "vmulsd  $dst, $src1, $src2" %}
 3371   ins_cost(150);
 3372   ins_encode %{
 3373     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3374   %}
 3375   ins_pipe(pipe_slow);
 3376 %}
 3377 
 3378 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3379   predicate(UseAVX > 0);
 3380   match(Set dst (MulD src1 (LoadD src2)));
 3381 
 3382   format %{ "vmulsd  $dst, $src1, $src2" %}
 3383   ins_cost(150);
 3384   ins_encode %{
 3385     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3386   %}
 3387   ins_pipe(pipe_slow);
 3388 %}
 3389 
 3390 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3391   predicate(UseAVX > 0);
 3392   match(Set dst (MulD src con));
 3393 
 3394   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3395   ins_cost(150);
 3396   ins_encode %{
 3397     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3398   %}
 3399   ins_pipe(pipe_slow);
 3400 %}
 3401 
 3402 instruct divF_reg(regF dst, regF src) %{
 3403   predicate((UseSSE>=1) && (UseAVX == 0));
 3404   match(Set dst (DivF dst src));
 3405 
 3406   format %{ "divss   $dst, $src" %}
 3407   ins_cost(150);
 3408   ins_encode %{
 3409     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3410   %}
 3411   ins_pipe(pipe_slow);
 3412 %}
 3413 
 3414 instruct divF_mem(regF dst, memory src) %{
 3415   predicate((UseSSE>=1) && (UseAVX == 0));
 3416   match(Set dst (DivF dst (LoadF src)));
 3417 
 3418   format %{ "divss   $dst, $src" %}
 3419   ins_cost(150);
 3420   ins_encode %{
 3421     __ divss($dst$$XMMRegister, $src$$Address);
 3422   %}
 3423   ins_pipe(pipe_slow);
 3424 %}
 3425 
 3426 instruct divF_imm(regF dst, immF con) %{
 3427   predicate((UseSSE>=1) && (UseAVX == 0));
 3428   match(Set dst (DivF dst con));
 3429   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3430   ins_cost(150);
 3431   ins_encode %{
 3432     __ divss($dst$$XMMRegister, $constantaddress($con));
 3433   %}
 3434   ins_pipe(pipe_slow);
 3435 %}
 3436 
 3437 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3438   predicate(UseAVX > 0);
 3439   match(Set dst (DivF src1 src2));
 3440 
 3441   format %{ "vdivss  $dst, $src1, $src2" %}
 3442   ins_cost(150);
 3443   ins_encode %{
 3444     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3445   %}
 3446   ins_pipe(pipe_slow);
 3447 %}
 3448 
 3449 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3450   predicate(UseAVX > 0);
 3451   match(Set dst (DivF src1 (LoadF src2)));
 3452 
 3453   format %{ "vdivss  $dst, $src1, $src2" %}
 3454   ins_cost(150);
 3455   ins_encode %{
 3456     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3457   %}
 3458   ins_pipe(pipe_slow);
 3459 %}
 3460 
 3461 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3462   predicate(UseAVX > 0);
 3463   match(Set dst (DivF src con));
 3464 
 3465   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3466   ins_cost(150);
 3467   ins_encode %{
 3468     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3469   %}
 3470   ins_pipe(pipe_slow);
 3471 %}
 3472 
 3473 instruct divD_reg(regD dst, regD src) %{
 3474   predicate((UseSSE>=2) && (UseAVX == 0));
 3475   match(Set dst (DivD dst src));
 3476 
 3477   format %{ "divsd   $dst, $src" %}
 3478   ins_cost(150);
 3479   ins_encode %{
 3480     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3481   %}
 3482   ins_pipe(pipe_slow);
 3483 %}
 3484 
 3485 instruct divD_mem(regD dst, memory src) %{
 3486   predicate((UseSSE>=2) && (UseAVX == 0));
 3487   match(Set dst (DivD dst (LoadD src)));
 3488 
 3489   format %{ "divsd   $dst, $src" %}
 3490   ins_cost(150);
 3491   ins_encode %{
 3492     __ divsd($dst$$XMMRegister, $src$$Address);
 3493   %}
 3494   ins_pipe(pipe_slow);
 3495 %}
 3496 
 3497 instruct divD_imm(regD dst, immD con) %{
 3498   predicate((UseSSE>=2) && (UseAVX == 0));
 3499   match(Set dst (DivD dst con));
 3500   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3501   ins_cost(150);
 3502   ins_encode %{
 3503     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3504   %}
 3505   ins_pipe(pipe_slow);
 3506 %}
 3507 
 3508 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3509   predicate(UseAVX > 0);
 3510   match(Set dst (DivD src1 src2));
 3511 
 3512   format %{ "vdivsd  $dst, $src1, $src2" %}
 3513   ins_cost(150);
 3514   ins_encode %{
 3515     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3516   %}
 3517   ins_pipe(pipe_slow);
 3518 %}
 3519 
 3520 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3521   predicate(UseAVX > 0);
 3522   match(Set dst (DivD src1 (LoadD src2)));
 3523 
 3524   format %{ "vdivsd  $dst, $src1, $src2" %}
 3525   ins_cost(150);
 3526   ins_encode %{
 3527     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3528   %}
 3529   ins_pipe(pipe_slow);
 3530 %}
 3531 
 3532 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3533   predicate(UseAVX > 0);
 3534   match(Set dst (DivD src con));
 3535 
 3536   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3537   ins_cost(150);
 3538   ins_encode %{
 3539     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3540   %}
 3541   ins_pipe(pipe_slow);
 3542 %}
 3543 
 3544 instruct absF_reg(regF dst) %{
 3545   predicate((UseSSE>=1) && (UseAVX == 0));
 3546   match(Set dst (AbsF dst));
 3547   ins_cost(150);
 3548   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3549   ins_encode %{
 3550     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3551   %}
 3552   ins_pipe(pipe_slow);
 3553 %}
 3554 
 3555 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3556   predicate(UseAVX > 0);
 3557   match(Set dst (AbsF src));
 3558   ins_cost(150);
 3559   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3560   ins_encode %{
 3561     int vlen_enc = Assembler::AVX_128bit;
 3562     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3563               ExternalAddress(float_signmask()), vlen_enc);
 3564   %}
 3565   ins_pipe(pipe_slow);
 3566 %}
 3567 
 3568 instruct absD_reg(regD dst) %{
 3569   predicate((UseSSE>=2) && (UseAVX == 0));
 3570   match(Set dst (AbsD dst));
 3571   ins_cost(150);
 3572   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3573             "# abs double by sign masking" %}
 3574   ins_encode %{
 3575     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3576   %}
 3577   ins_pipe(pipe_slow);
 3578 %}
 3579 
 3580 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3581   predicate(UseAVX > 0);
 3582   match(Set dst (AbsD src));
 3583   ins_cost(150);
 3584   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3585             "# abs double by sign masking" %}
 3586   ins_encode %{
 3587     int vlen_enc = Assembler::AVX_128bit;
 3588     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3589               ExternalAddress(double_signmask()), vlen_enc);
 3590   %}
 3591   ins_pipe(pipe_slow);
 3592 %}
 3593 
 3594 instruct negF_reg(regF dst) %{
 3595   predicate((UseSSE>=1) && (UseAVX == 0));
 3596   match(Set dst (NegF dst));
 3597   ins_cost(150);
 3598   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3599   ins_encode %{
 3600     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3601   %}
 3602   ins_pipe(pipe_slow);
 3603 %}
 3604 
 3605 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3606   predicate(UseAVX > 0);
 3607   match(Set dst (NegF src));
 3608   ins_cost(150);
 3609   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3610   ins_encode %{
 3611     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3612                  ExternalAddress(float_signflip()));
 3613   %}
 3614   ins_pipe(pipe_slow);
 3615 %}
 3616 
 3617 instruct negD_reg(regD dst) %{
 3618   predicate((UseSSE>=2) && (UseAVX == 0));
 3619   match(Set dst (NegD dst));
 3620   ins_cost(150);
 3621   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3622             "# neg double by sign flipping" %}
 3623   ins_encode %{
 3624     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3625   %}
 3626   ins_pipe(pipe_slow);
 3627 %}
 3628 
 3629 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3630   predicate(UseAVX > 0);
 3631   match(Set dst (NegD src));
 3632   ins_cost(150);
 3633   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3634             "# neg double by sign flipping" %}
 3635   ins_encode %{
 3636     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3637                  ExternalAddress(double_signflip()));
 3638   %}
 3639   ins_pipe(pipe_slow);
 3640 %}
 3641 
// The sqrtss instruction needs its destination register to be pre-initialized for best performance.
// Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3644 instruct sqrtF_reg(regF dst) %{
 3645   predicate(UseSSE>=1);
 3646   match(Set dst (SqrtF dst));
 3647   format %{ "sqrtss  $dst, $dst" %}
 3648   ins_encode %{
 3649     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3650   %}
 3651   ins_pipe(pipe_slow);
 3652 %}
 3653 
// The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
// Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3656 instruct sqrtD_reg(regD dst) %{
 3657   predicate(UseSSE>=2);
 3658   match(Set dst (SqrtD dst));
 3659   format %{ "sqrtsd  $dst, $dst" %}
 3660   ins_encode %{
 3661     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3662   %}
 3663   ins_pipe(pipe_slow);
 3664 %}
 3665 
 3666 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3667   effect(TEMP tmp);
 3668   match(Set dst (ConvF2HF src));
 3669   ins_cost(125);
 3670   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3671   ins_encode %{
 3672     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3673   %}
 3674   ins_pipe( pipe_slow );
 3675 %}
 3676 
 3677 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3678   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3679   effect(TEMP ktmp, TEMP rtmp);
 3680   match(Set mem (StoreC mem (ConvF2HF src)));
 3681   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3682   ins_encode %{
 3683     __ movl($rtmp$$Register, 0x1);
 3684     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3685     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3686   %}
 3687   ins_pipe( pipe_slow );
 3688 %}
 3689 
 3690 instruct vconvF2HF(vec dst, vec src) %{
 3691   match(Set dst (VectorCastF2HF src));
 3692   format %{ "vector_conv_F2HF $dst $src" %}
 3693   ins_encode %{
 3694     int vlen_enc = vector_length_encoding(this, $src);
 3695     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3696   %}
 3697   ins_pipe( pipe_slow );
 3698 %}
 3699 
 3700 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3701   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3702   format %{ "vcvtps2ph $mem,$src" %}
 3703   ins_encode %{
 3704     int vlen_enc = vector_length_encoding(this, $src);
 3705     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3706   %}
 3707   ins_pipe( pipe_slow );
 3708 %}
 3709 
 3710 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3711   match(Set dst (ConvHF2F src));
 3712   format %{ "vcvtph2ps $dst,$src" %}
 3713   ins_encode %{
 3714     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3715   %}
 3716   ins_pipe( pipe_slow );
 3717 %}
 3718 
 3719 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3720   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3721   format %{ "vcvtph2ps $dst,$mem" %}
 3722   ins_encode %{
 3723     int vlen_enc = vector_length_encoding(this);
 3724     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3725   %}
 3726   ins_pipe( pipe_slow );
 3727 %}
 3728 
 3729 instruct vconvHF2F(vec dst, vec src) %{
 3730   match(Set dst (VectorCastHF2F src));
 3731   ins_cost(125);
 3732   format %{ "vector_conv_HF2F $dst,$src" %}
 3733   ins_encode %{
 3734     int vlen_enc = vector_length_encoding(this);
 3735     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3736   %}
 3737   ins_pipe( pipe_slow );
 3738 %}
 3739 
 3740 // ---------------------------------------- VectorReinterpret ------------------------------------
 3741 instruct reinterpret_mask(kReg dst) %{
 3742   predicate(n->bottom_type()->isa_vectmask() &&
 3743             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3744   match(Set dst (VectorReinterpret dst));
 3745   ins_cost(125);
 3746   format %{ "vector_reinterpret $dst\t!" %}
 3747   ins_encode %{
 3748     // empty
 3749   %}
 3750   ins_pipe( pipe_slow );
 3751 %}
 3752 
 3753 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3754   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3755             n->bottom_type()->isa_vectmask() &&
 3756             n->in(1)->bottom_type()->isa_vectmask() &&
 3757             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bits
 3759   match(Set dst (VectorReinterpret src));
 3760   effect(TEMP xtmp);
 3761   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3762   ins_encode %{
 3763      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3764      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3765      assert(src_sz == dst_sz , "src and dst size mismatch");
 3766      int vlen_enc = vector_length_encoding(src_sz);
 3767      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3768      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3769   %}
 3770   ins_pipe( pipe_slow );
 3771 %}
 3772 
 3773 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3774   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3775             n->bottom_type()->isa_vectmask() &&
 3776             n->in(1)->bottom_type()->isa_vectmask() &&
 3777             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3778              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bits
 3780   match(Set dst (VectorReinterpret src));
 3781   effect(TEMP xtmp);
 3782   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3783   ins_encode %{
 3784      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3785      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3786      assert(src_sz == dst_sz , "src and dst size mismatch");
 3787      int vlen_enc = vector_length_encoding(src_sz);
 3788      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3789      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3790   %}
 3791   ins_pipe( pipe_slow );
 3792 %}
 3793 
 3794 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3795   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3796             n->bottom_type()->isa_vectmask() &&
 3797             n->in(1)->bottom_type()->isa_vectmask() &&
 3798             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3799              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same size in bits
 3801   match(Set dst (VectorReinterpret src));
 3802   effect(TEMP xtmp);
 3803   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3804   ins_encode %{
 3805     int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3806     int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3807     assert(src_sz == dst_sz, "src and dst size mismatch");
 3808     int vlen_enc = vector_length_encoding(src_sz);
 3809     __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3810     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3811   %}
 3812   ins_pipe( pipe_slow );
 3813 %}
 3814 
 3815 instruct reinterpret(vec dst) %{
 3816   predicate(!n->bottom_type()->isa_vectmask() &&
 3817             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3818   match(Set dst (VectorReinterpret dst));
 3819   ins_cost(125);
 3820   format %{ "vector_reinterpret $dst\t!" %}
 3821   ins_encode %{
 3822     // empty
 3823   %}
 3824   ins_pipe( pipe_slow );
 3825 %}
 3826 
 3827 instruct reinterpret_expand(vec dst, vec src) %{
 3828   predicate(UseAVX == 0 &&
 3829             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3830   match(Set dst (VectorReinterpret src));
 3831   ins_cost(125);
 3832   effect(TEMP dst);
 3833   format %{ "vector_reinterpret_expand $dst,$src" %}
 3834   ins_encode %{
 3835     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3836     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3837 
 3838     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3839     if (src_vlen_in_bytes == 4) {
 3840       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3841     } else {
 3842       assert(src_vlen_in_bytes == 8, "");
 3843       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3844     }
 3845     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3846   %}
 3847   ins_pipe( pipe_slow );
 3848 %}
 3849 
 3850 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3851   predicate(UseAVX > 0 &&
 3852             !n->bottom_type()->isa_vectmask() &&
 3853             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3854             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3855   match(Set dst (VectorReinterpret src));
 3856   ins_cost(125);
 3857   format %{ "vector_reinterpret_expand $dst,$src" %}
 3858   ins_encode %{
 3859     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3860   %}
 3861   ins_pipe( pipe_slow );
 3862 %}
 3863 
 3864 
 3865 instruct vreinterpret_expand(legVec dst, vec src) %{
 3866   predicate(UseAVX > 0 &&
 3867             !n->bottom_type()->isa_vectmask() &&
 3868             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3869             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3870   match(Set dst (VectorReinterpret src));
 3871   ins_cost(125);
 3872   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3873   ins_encode %{
 3874     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3875       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3876       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3877       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3878       default: ShouldNotReachHere();
 3879     }
 3880   %}
 3881   ins_pipe( pipe_slow );
 3882 %}
 3883 
 3884 instruct reinterpret_shrink(vec dst, legVec src) %{
 3885   predicate(!n->bottom_type()->isa_vectmask() &&
 3886             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3887   match(Set dst (VectorReinterpret src));
 3888   ins_cost(125);
 3889   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3890   ins_encode %{
 3891     switch (Matcher::vector_length_in_bytes(this)) {
 3892       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3893       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3894       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3895       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3896       default: ShouldNotReachHere();
 3897     }
 3898   %}
 3899   ins_pipe( pipe_slow );
 3900 %}
 3901 
 3902 // ----------------------------------------------------------------------------------------------------
 3903 
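      // Scalar and packed double rounding. The $rmode immediate is handed directly to
      // roundsd/vroundpd/vrndscalepd as the SSE4.1/AVX rounding-control byte
      // (roughly: 0 = nearest-even, 1 = toward -inf, 2 = toward +inf, 3 = truncate).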
 3904 #ifdef _LP64
 3905 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3906   match(Set dst (RoundDoubleMode src rmode));
 3907   format %{ "roundsd $dst,$src" %}
 3908   ins_cost(150);
 3909   ins_encode %{
 3910     assert(UseSSE >= 4, "required");
 3911     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3912   %}
 3913   ins_pipe(pipe_slow);
 3914 %}
 3915 
 3916 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3917   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3918   format %{ "roundsd $dst,$src" %}
 3919   ins_cost(150);
 3920   ins_encode %{
 3921     assert(UseSSE >= 4, "required");
 3922     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3923   %}
 3924   ins_pipe(pipe_slow);
 3925 %}
 3926 
 3927 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3928   match(Set dst (RoundDoubleMode con rmode));
 3929   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3930   ins_cost(150);
 3931   ins_encode %{
 3932     assert(UseSSE >= 4, "required");
 3933     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3934   %}
 3935   ins_pipe(pipe_slow);
 3936 %}
 3937 
 3938 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3939   predicate(Matcher::vector_length(n) < 8);
 3940   match(Set dst (RoundDoubleModeV src rmode));
 3941   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3942   ins_encode %{
 3943     assert(UseAVX > 0, "required");
 3944     int vlen_enc = vector_length_encoding(this);
 3945     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3946   %}
 3947   ins_pipe( pipe_slow );
 3948 %}
 3949 
 3950 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3951   predicate(Matcher::vector_length(n) == 8);
 3952   match(Set dst (RoundDoubleModeV src rmode));
 3953   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3954   ins_encode %{
 3955     assert(UseAVX > 2, "required");
 3956     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3957   %}
 3958   ins_pipe( pipe_slow );
 3959 %}
 3960 
 3961 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3962   predicate(Matcher::vector_length(n) < 8);
 3963   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3964   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3965   ins_encode %{
 3966     assert(UseAVX > 0, "required");
 3967     int vlen_enc = vector_length_encoding(this);
 3968     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3969   %}
 3970   ins_pipe( pipe_slow );
 3971 %}
 3972 
 3973 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3974   predicate(Matcher::vector_length(n) == 8);
 3975   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3976   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3977   ins_encode %{
 3978     assert(UseAVX > 2, "required");
 3979     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3980   %}
 3981   ins_pipe( pipe_slow );
 3982 %}
 3983 #endif // _LP64
 3984 
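      // Thread.onSpinWait() intrinsic: PAUSE is the architectural spin-wait hint, lowering
      // power in the spin loop and avoiding the memory-order mis-speculation penalty on exit.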
 3985 instruct onspinwait() %{
 3986   match(OnSpinWait);
 3987   ins_cost(200);
 3988 
 3989   format %{
 3990     $$template
 3991     $$emit$$"pause\t! membar_onspinwait"
 3992   %}
 3993   ins_encode %{
 3994     __ pause();
 3995   %}
 3996   ins_pipe(pipe_slow);
 3997 %}
 3998 
 3999 // a * b + c
 4000 instruct fmaD_reg(regD a, regD b, regD c) %{
 4001   match(Set c (FmaD  c (Binary a b)));
 4002   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4003   ins_cost(150);
 4004   ins_encode %{
 4005     assert(UseFMA, "Needs FMA instructions support.");
 4006     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4007   %}
 4008   ins_pipe( pipe_slow );
 4009 %}
 4010 
 4011 // a * b + c
 4012 instruct fmaF_reg(regF a, regF b, regF c) %{
 4013   match(Set c (FmaF  c (Binary a b)));
 4014   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4015   ins_cost(150);
 4016   ins_encode %{
 4017     assert(UseFMA, "Needs FMA instructions support.");
 4018     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4019   %}
 4020   ins_pipe( pipe_slow );
 4021 %}
 4022 
 4023 // ====================VECTOR INSTRUCTIONS=====================================
 4024 
 4025 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4026 instruct MoveVec2Leg(legVec dst, vec src) %{
 4027   match(Set dst src);
 4028   format %{ "" %}
 4029   ins_encode %{
 4030     ShouldNotReachHere();
 4031   %}
 4032   ins_pipe( fpu_reg_reg );
 4033 %}
 4034 
 4035 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4036   match(Set dst src);
 4037   format %{ "" %}
 4038   ins_encode %{
 4039     ShouldNotReachHere();
 4040   %}
 4041   ins_pipe( fpu_reg_reg );
 4042 %}
 4043 
 4044 // ============================================================================
 4045 
 4046 // Load vectors generic operand pattern
 4047 instruct loadV(vec dst, memory mem) %{
 4048   match(Set dst (LoadVector mem));
 4049   ins_cost(125);
 4050   format %{ "load_vector $dst,$mem" %}
 4051   ins_encode %{
 4052     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4053   %}
 4054   ins_pipe( pipe_slow );
 4055 %}
 4056 
 4057 // Store vectors generic operand pattern.
 4058 instruct storeV(memory mem, vec src) %{
 4059   match(Set mem (StoreVector mem src));
 4060   ins_cost(145);
 4061   format %{ "store_vector $mem,$src\n\t" %}
 4062   ins_encode %{
 4063     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4064       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4065       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4066       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4067       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4068       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4069       default: ShouldNotReachHere();
 4070     }
 4071   %}
 4072   ins_pipe( pipe_slow );
 4073 %}
 4074 
 4075 // ---------------------------------------- Gather ------------------------------------
 4076 
 4077 // Gather INT, LONG, FLOAT, DOUBLE
 4078 
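      // A gather loads lane i of $dst from memory at $mem + $idx[i] scaled by the element
      // size. The AVX2 vgather forms require a vector mask and clear it lane by lane as
      // loads complete, so a fresh all-ones mask is materialized (vpcmpeqd) on every use.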
 4079 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4080   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4081   match(Set dst (LoadVectorGather mem idx));
 4082   effect(TEMP dst, TEMP tmp, TEMP mask);
 4083   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4084   ins_encode %{
 4085     int vlen_enc = vector_length_encoding(this);
 4086     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4087     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4088     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4089     __ lea($tmp$$Register, $mem$$Address);
 4090     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4091   %}
 4092   ins_pipe( pipe_slow );
 4093 %}
 4094 
 4095 
 4096 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4097   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4098   match(Set dst (LoadVectorGather mem idx));
 4099   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4100   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4101   ins_encode %{
 4102     int vlen_enc = vector_length_encoding(this);
 4103     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4104     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4105     __ lea($tmp$$Register, $mem$$Address);
 4106     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4107   %}
 4108   ins_pipe( pipe_slow );
 4109 %}
 4110 
 4111 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4112   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4113   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4114   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4115   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4116   ins_encode %{
 4117     assert(UseAVX > 2, "sanity");
 4118     int vlen_enc = vector_length_encoding(this);
 4119     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4120     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4121     // Note: Since the gather instruction partially updates the opmask register used
 4122     // for predication, the mask operand is copied to a temporary first.
 4123     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4124     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4125     __ lea($tmp$$Register, $mem$$Address);
 4126     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4127   %}
 4128   ins_pipe( pipe_slow );
 4129 %}
 4130 // ====================Scatter=======================================
 4131 
 4132 // Scatter INT, LONG, FLOAT, DOUBLE
 4133 
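      // A scatter stores lane i of $src to memory at $mem + $idx[i] scaled by the element
      // size. Scatters exist only in AVX-512 form and take an opmask selecting the active
      // lanes; the unmasked rule below simply loads an all-ones mask from the constant table.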
 4134 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4135   predicate(UseAVX > 2);
 4136   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4137   effect(TEMP tmp, TEMP ktmp);
 4138   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4139   ins_encode %{
 4140     int vlen_enc = vector_length_encoding(this, $src);
 4141     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4142 
 4143     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4144     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4145 
 4146     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4147     __ lea($tmp$$Register, $mem$$Address);
 4148     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4149   %}
 4150   ins_pipe( pipe_slow );
 4151 %}
 4152 
 4153 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4154   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4155   effect(TEMP tmp, TEMP ktmp);
 4156   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t! using $tmp and $ktmp as TEMP" %}
 4157   ins_encode %{
 4158     int vlen_enc = vector_length_encoding(this, $src);
 4159     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4160     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4161     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4162     // Note: Since the scatter instruction partially updates the opmask register used
 4163     // for predication, the mask operand is copied to a temporary first.
 4164     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4165     __ lea($tmp$$Register, $mem$$Address);
 4166     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4167   %}
 4168   ins_pipe( pipe_slow );
 4169 %}
 4170 
 4171 // ====================REPLICATE=======================================
 4172 
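      // Replicate broadcasts a scalar into every lane of the vector; e.g. replicating the
      // byte 0x41 into a 16-byte vector yields 0x4141...41. With AVX-512VL (or 512-bit
      // operands) the GPR is broadcast directly (evpbroadcast*), with AVX2 the value is
      // moved to an XMM register and broadcast from there, and the pure-SSE fallback
      // builds the splat out of unpack/shuffle steps.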
 4173 // Replicate byte scalar to be vector
 4174 instruct vReplB_reg(vec dst, rRegI src) %{
 4175   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4176   match(Set dst (Replicate src));
 4177   format %{ "replicateB $dst,$src" %}
 4178   ins_encode %{
 4179     uint vlen = Matcher::vector_length(this);
 4180     if (UseAVX >= 2) {
 4181       int vlen_enc = vector_length_encoding(this);
 4182       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4183         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4184         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4185       } else {
 4186         __ movdl($dst$$XMMRegister, $src$$Register);
 4187         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4188       }
 4189     } else {
 4190       assert(UseAVX < 2, "");
 4191       __ movdl($dst$$XMMRegister, $src$$Register);
 4192       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4193       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4194       if (vlen >= 16) {
 4195         assert(vlen == 16, "");
 4196         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4197       }
 4198     }
 4199   %}
 4200   ins_pipe( pipe_slow );
 4201 %}
 4202 
 4203 instruct ReplB_mem(vec dst, memory mem) %{
 4204   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4205   match(Set dst (Replicate (LoadB mem)));
 4206   format %{ "replicateB $dst,$mem" %}
 4207   ins_encode %{
 4208     int vlen_enc = vector_length_encoding(this);
 4209     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4210   %}
 4211   ins_pipe( pipe_slow );
 4212 %}
 4213 
 4214 // ====================ReplicateS=======================================
 4215 
 4216 instruct vReplS_reg(vec dst, rRegI src) %{
 4217   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4218   match(Set dst (Replicate src));
 4219   format %{ "replicateS $dst,$src" %}
 4220   ins_encode %{
 4221     uint vlen = Matcher::vector_length(this);
 4222     int vlen_enc = vector_length_encoding(this);
 4223     if (UseAVX >= 2) {
 4224       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4225         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4226         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4227       } else {
 4228         __ movdl($dst$$XMMRegister, $src$$Register);
 4229         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4230       }
 4231     } else {
 4232       assert(UseAVX < 2, "");
 4233       __ movdl($dst$$XMMRegister, $src$$Register);
 4234       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4235       if (vlen >= 8) {
 4236         assert(vlen == 8, "");
 4237         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4238       }
 4239     }
 4240   %}
 4241   ins_pipe( pipe_slow );
 4242 %}
 4243 
 4244 instruct ReplS_mem(vec dst, memory mem) %{
 4245   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4246   match(Set dst (Replicate (LoadS mem)));
 4247   format %{ "replicateS $dst,$mem" %}
 4248   ins_encode %{
 4249     int vlen_enc = vector_length_encoding(this);
 4250     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4251   %}
 4252   ins_pipe( pipe_slow );
 4253 %}
 4254 
 4255 // ====================ReplicateI=======================================
 4256 
 4257 instruct ReplI_reg(vec dst, rRegI src) %{
 4258   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4259   match(Set dst (Replicate src));
 4260   format %{ "replicateI $dst,$src" %}
 4261   ins_encode %{
 4262     uint vlen = Matcher::vector_length(this);
 4263     int vlen_enc = vector_length_encoding(this);
 4264     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4265       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4266     } else if (VM_Version::supports_avx2()) {
 4267       __ movdl($dst$$XMMRegister, $src$$Register);
 4268       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4269     } else {
 4270       __ movdl($dst$$XMMRegister, $src$$Register);
 4271       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4272     }
 4273   %}
 4274   ins_pipe( pipe_slow );
 4275 %}
 4276 
 4277 instruct ReplI_mem(vec dst, memory mem) %{
 4278   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4279   match(Set dst (Replicate (LoadI mem)));
 4280   format %{ "replicateI $dst,$mem" %}
 4281   ins_encode %{
 4282     int vlen_enc = vector_length_encoding(this);
 4283     if (VM_Version::supports_avx2()) {
 4284       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4285     } else if (VM_Version::supports_avx()) {
 4286       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4287     } else {
 4288       __ movdl($dst$$XMMRegister, $mem$$Address);
 4289       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4290     }
 4291   %}
 4292   ins_pipe( pipe_slow );
 4293 %}
 4294 
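      // Immediate replication goes through the constant table: vreplicate_imm() emits a
      // small pre-replicated constant (enough elements to fill 4 bytes when AVX broadcast
      // loads are available, 8 bytes otherwise - presumably matching vbroadcastss vs.
      // movddup-sized loads) and load_constant_vector() then widens it to the full vector.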
 4295 instruct ReplI_imm(vec dst, immI con) %{
 4296   predicate(Matcher::is_non_long_integral_vector(n));
 4297   match(Set dst (Replicate con));
 4298   format %{ "replicateI $dst,$con" %}
 4299   ins_encode %{
 4300     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4301         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4302             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4303                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4304     BasicType bt = Matcher::vector_element_basic_type(this);
 4305     int vlen = Matcher::vector_length_in_bytes(this);
 4306     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4307   %}
 4308   ins_pipe( pipe_slow );
 4309 %}
 4310 
 4311 // Replicate scalar zero to be vector
 4312 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4313   predicate(Matcher::is_non_long_integral_vector(n));
 4314   match(Set dst (Replicate zero));
 4315   format %{ "replicateI $dst,$zero" %}
 4316   ins_encode %{
 4317     int vlen_enc = vector_length_encoding(this);
 4318     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4319       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4320     } else {
 4321       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4322     }
 4323   %}
 4324   ins_pipe( fpu_reg_reg );
 4325 %}
 4326 
 4327 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4328   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4329   match(Set dst (Replicate con));
 4330   format %{ "vallones $dst" %}
 4331   ins_encode %{
 4332     int vector_len = vector_length_encoding(this);
 4333     __ vallones($dst$$XMMRegister, vector_len);
 4334   %}
 4335   ins_pipe( pipe_slow );
 4336 %}
 4337 
 4338 // ====================ReplicateL=======================================
 4339 
 4340 #ifdef _LP64
 4341 // Replicate long (8 byte) scalar to be vector
 4342 instruct ReplL_reg(vec dst, rRegL src) %{
 4343   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4344   match(Set dst (Replicate src));
 4345   format %{ "replicateL $dst,$src" %}
 4346   ins_encode %{
 4347     int vlen = Matcher::vector_length(this);
 4348     int vlen_enc = vector_length_encoding(this);
 4349     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4350       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4351     } else if (VM_Version::supports_avx2()) {
 4352       __ movdq($dst$$XMMRegister, $src$$Register);
 4353       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4354     } else {
 4355       __ movdq($dst$$XMMRegister, $src$$Register);
 4356       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4357     }
 4358   %}
 4359   ins_pipe( pipe_slow );
 4360 %}
 4361 #else // _LP64
 4362 // Replicate long (8 byte) scalar to be vector
 4363 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4364   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4365   match(Set dst (Replicate src));
 4366   effect(TEMP dst, USE src, TEMP tmp);
 4367   format %{ "replicateL $dst,$src" %}
 4368   ins_encode %{
 4369     uint vlen = Matcher::vector_length(this);
 4370     if (vlen == 2) {
 4371       __ movdl($dst$$XMMRegister, $src$$Register);
 4372       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4373       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4374       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4375     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4376       int vlen_enc = Assembler::AVX_256bit;
 4377       __ movdl($dst$$XMMRegister, $src$$Register);
 4378       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4379       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4380       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4381     } else {
 4382       __ movdl($dst$$XMMRegister, $src$$Register);
 4383       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4384       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4385       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4386       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4387     }
 4388   %}
 4389   ins_pipe( pipe_slow );
 4390 %}
 4391 
 4392 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4393   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4394   match(Set dst (Replicate src));
 4395   effect(TEMP dst, USE src, TEMP tmp);
 4396   format %{ "replicateL $dst,$src" %}
 4397   ins_encode %{
 4398     if (VM_Version::supports_avx512vl()) {
 4399       __ movdl($dst$$XMMRegister, $src$$Register);
 4400       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4401       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4402       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4403       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4404       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4405     } else {
 4406       int vlen_enc = Assembler::AVX_512bit;
 4407       __ movdl($dst$$XMMRegister, $src$$Register);
 4408       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4409       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4410       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4411     }
 4412   %}
 4413   ins_pipe( pipe_slow );
 4414 %}
 4415 #endif // _LP64
 4416 
 4417 instruct ReplL_mem(vec dst, memory mem) %{
 4418   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4419   match(Set dst (Replicate (LoadL mem)));
 4420   format %{ "replicateL $dst,$mem" %}
 4421   ins_encode %{
 4422     int vlen_enc = vector_length_encoding(this);
 4423     if (VM_Version::supports_avx2()) {
 4424       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4425     } else if (VM_Version::supports_sse3()) {
 4426       __ movddup($dst$$XMMRegister, $mem$$Address);
 4427     } else {
 4428       __ movq($dst$$XMMRegister, $mem$$Address);
 4429       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4430     }
 4431   %}
 4432   ins_pipe( pipe_slow );
 4433 %}
 4434 
 4435 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4436 instruct ReplL_imm(vec dst, immL con) %{
 4437   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4438   match(Set dst (Replicate con));
 4439   format %{ "replicateL $dst,$con" %}
 4440   ins_encode %{
 4441     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4442     int vlen = Matcher::vector_length_in_bytes(this);
 4443     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4444   %}
 4445   ins_pipe( pipe_slow );
 4446 %}
 4447 
 4448 instruct ReplL_zero(vec dst, immL0 zero) %{
 4449   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4450   match(Set dst (Replicate zero));
 4451   format %{ "replicateL $dst,$zero" %}
 4452   ins_encode %{
 4453     int vlen_enc = vector_length_encoding(this);
 4454     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4455       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4456     } else {
 4457       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4458     }
 4459   %}
 4460   ins_pipe( fpu_reg_reg );
 4461 %}
 4462 
 4463 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4464   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4465   match(Set dst (Replicate con));
 4466   format %{ "vallones $dst" %}
 4467   ins_encode %{
 4468     int vector_len = vector_length_encoding(this);
 4469     __ vallones($dst$$XMMRegister, vector_len);
 4470   %}
 4471   ins_pipe( pipe_slow );
 4472 %}
 4473 
 4474 // ====================ReplicateF=======================================
 4475 
 4476 instruct vReplF_reg(vec dst, vlRegF src) %{
 4477   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4478   match(Set dst (Replicate src));
 4479   format %{ "replicateF $dst,$src" %}
 4480   ins_encode %{
 4481     uint vlen = Matcher::vector_length(this);
 4482     int vlen_enc = vector_length_encoding(this);
 4483     if (vlen <= 4) {
 4484       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4485     } else if (VM_Version::supports_avx2()) {
 4486       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4487     } else {
 4488       assert(vlen == 8, "sanity");
 4489       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4490       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4491     }
 4492   %}
 4493   ins_pipe( pipe_slow );
 4494 %}
 4495 
 4496 instruct ReplF_reg(vec dst, vlRegF src) %{
 4497   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4498   match(Set dst (Replicate src));
 4499   format %{ "replicateF $dst,$src" %}
 4500   ins_encode %{
 4501     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4502   %}
 4503   ins_pipe( pipe_slow );
 4504 %}
 4505 
 4506 instruct ReplF_mem(vec dst, memory mem) %{
 4507   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4508   match(Set dst (Replicate (LoadF mem)));
 4509   format %{ "replicateF $dst,$mem" %}
 4510   ins_encode %{
 4511     int vlen_enc = vector_length_encoding(this);
 4512     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4513   %}
 4514   ins_pipe( pipe_slow );
 4515 %}
 4516 
 4517 // Replicate float scalar immediate to be vector by loading from const table.
 4518 instruct ReplF_imm(vec dst, immF con) %{
 4519   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4520   match(Set dst (Replicate con));
 4521   format %{ "replicateF $dst,$con" %}
 4522   ins_encode %{
 4523     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4524         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4525     int vlen = Matcher::vector_length_in_bytes(this);
 4526     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4527   %}
 4528   ins_pipe( pipe_slow );
 4529 %}
 4530 
 4531 instruct ReplF_zero(vec dst, immF0 zero) %{
 4532   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4533   match(Set dst (Replicate zero));
 4534   format %{ "replicateF $dst,$zero" %}
 4535   ins_encode %{
 4536     int vlen_enc = vector_length_encoding(this);
 4537     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4538       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4539     } else {
 4540       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4541     }
 4542   %}
 4543   ins_pipe( fpu_reg_reg );
 4544 %}
 4545 
 4546 // ====================ReplicateD=======================================
 4547 
 4548 // Replicate double (8 bytes) scalar to be vector
 4549 instruct vReplD_reg(vec dst, vlRegD src) %{
 4550   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4551   match(Set dst (Replicate src));
 4552   format %{ "replicateD $dst,$src" %}
 4553   ins_encode %{
 4554     uint vlen = Matcher::vector_length(this);
 4555     int vlen_enc = vector_length_encoding(this);
 4556     if (vlen <= 2) {
 4557       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4558     } else if (VM_Version::supports_avx2()) {
 4559       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4560     } else {
 4561       assert(vlen == 4, "sanity");
 4562       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4563       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4564     }
 4565   %}
 4566   ins_pipe( pipe_slow );
 4567 %}
 4568 
 4569 instruct ReplD_reg(vec dst, vlRegD src) %{
 4570   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4571   match(Set dst (Replicate src));
 4572   format %{ "replicateD $dst,$src" %}
 4573   ins_encode %{
 4574     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4575   %}
 4576   ins_pipe( pipe_slow );
 4577 %}
 4578 
 4579 instruct ReplD_mem(vec dst, memory mem) %{
 4580   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4581   match(Set dst (Replicate (LoadD mem)));
 4582   format %{ "replicateD $dst,$mem" %}
 4583   ins_encode %{
 4584     if (Matcher::vector_length(this) >= 4) {
 4585       int vlen_enc = vector_length_encoding(this);
 4586       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4587     } else {
 4588       __ movddup($dst$$XMMRegister, $mem$$Address);
 4589     }
 4590   %}
 4591   ins_pipe( pipe_slow );
 4592 %}
 4593 
 4594 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4595 instruct ReplD_imm(vec dst, immD con) %{
 4596   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4597   match(Set dst (Replicate con));
 4598   format %{ "replicateD $dst,$con" %}
 4599   ins_encode %{
 4600     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4601     int vlen = Matcher::vector_length_in_bytes(this);
 4602     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4603   %}
 4604   ins_pipe( pipe_slow );
 4605 %}
 4606 
 4607 instruct ReplD_zero(vec dst, immD0 zero) %{
 4608   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4609   match(Set dst (Replicate zero));
 4610   format %{ "replicateD $dst,$zero" %}
 4611   ins_encode %{
 4612     int vlen_enc = vector_length_encoding(this);
 4613     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4614       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4615     } else {
 4616       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4617     }
 4618   %}
 4619   ins_pipe( fpu_reg_reg );
 4620 %}
 4621 
 4622 // ====================VECTOR INSERT=======================================
 4623 
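      // VectorInsert replaces element $idx of a vector with the scalar $val. For vectors
      // wider than 128 bits the constant index is split into x_idx (position within one
      // 128-bit lane) and y_idx (which lane): the lane is extracted, patched with a scalar
      // insert, and written back into place.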
 4624 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4625   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4626   match(Set dst (VectorInsert (Binary dst val) idx));
 4627   format %{ "vector_insert $dst,$val,$idx" %}
 4628   ins_encode %{
 4629     assert(UseSSE >= 4, "required");
 4630     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4631 
 4632     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4633 
 4634     assert(is_integral_type(elem_bt), "");
 4635     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4636 
 4637     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4638   %}
 4639   ins_pipe( pipe_slow );
 4640 %}
 4641 
 4642 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4643   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4644   match(Set dst (VectorInsert (Binary src val) idx));
 4645   effect(TEMP vtmp);
 4646   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4647   ins_encode %{
 4648     int vlen_enc = Assembler::AVX_256bit;
 4649     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4650     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4651     int log2epr = log2(elem_per_lane);
 4652 
 4653     assert(is_integral_type(elem_bt), "sanity");
 4654     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4655 
 4656     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4657     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4658     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4659     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4660     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4661   %}
 4662   ins_pipe( pipe_slow );
 4663 %}
 4664 
 4665 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4666   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4667   match(Set dst (VectorInsert (Binary src val) idx));
 4668   effect(TEMP vtmp);
 4669   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4670   ins_encode %{
 4671     assert(UseAVX > 2, "sanity");
 4672 
 4673     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4674     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4675     int log2epr = log2(elem_per_lane);
 4676 
 4677     assert(is_integral_type(elem_bt), "");
 4678     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4679 
 4680     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4681     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4682     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4683     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4684     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4685   %}
 4686   ins_pipe( pipe_slow );
 4687 %}
 4688 
 4689 #ifdef _LP64
 4690 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4691   predicate(Matcher::vector_length(n) == 2);
 4692   match(Set dst (VectorInsert (Binary dst val) idx));
 4693   format %{ "vector_insert $dst,$val,$idx" %}
 4694   ins_encode %{
 4695     assert(UseSSE >= 4, "required");
 4696     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4697     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4698 
 4699     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4700   %}
 4701   ins_pipe( pipe_slow );
 4702 %}
 4703 
 4704 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4705   predicate(Matcher::vector_length(n) == 4);
 4706   match(Set dst (VectorInsert (Binary src val) idx));
 4707   effect(TEMP vtmp);
 4708   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4709   ins_encode %{
 4710     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4711     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4712 
 4713     uint x_idx = $idx$$constant & right_n_bits(1);
 4714     uint y_idx = ($idx$$constant >> 1) & 1;
 4715     int vlen_enc = Assembler::AVX_256bit;
 4716     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4717     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4718     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4719   %}
 4720   ins_pipe( pipe_slow );
 4721 %}
 4722 
 4723 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4724   predicate(Matcher::vector_length(n) == 8);
 4725   match(Set dst (VectorInsert (Binary src val) idx));
 4726   effect(TEMP vtmp);
 4727   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4728   ins_encode %{
 4729     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4730     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4731 
 4732     uint x_idx = $idx$$constant & right_n_bits(1);
 4733     uint y_idx = ($idx$$constant >> 1) & 3;
 4734     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4735     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4736     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4737   %}
 4738   ins_pipe( pipe_slow );
 4739 %}
 4740 #endif // _LP64
 4741 
 4742 instruct insertF(vec dst, regF val, immU8 idx) %{
 4743   predicate(Matcher::vector_length(n) < 8);
 4744   match(Set dst (VectorInsert (Binary dst val) idx));
 4745   format %{ "vector_insert $dst,$val,$idx" %}
 4746   ins_encode %{
 4747     assert(UseSSE >= 4, "sanity");
 4748 
 4749     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4750     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4751 
 4752     uint x_idx = $idx$$constant & right_n_bits(2);
 4753     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4754   %}
 4755   ins_pipe( pipe_slow );
 4756 %}
 4757 
 4758 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4759   predicate(Matcher::vector_length(n) >= 8);
 4760   match(Set dst (VectorInsert (Binary src val) idx));
 4761   effect(TEMP vtmp);
 4762   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4763   ins_encode %{
 4764     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4765     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4766 
 4767     int vlen = Matcher::vector_length(this);
 4768     uint x_idx = $idx$$constant & right_n_bits(2);
 4769     if (vlen == 8) {
 4770       uint y_idx = ($idx$$constant >> 2) & 1;
 4771       int vlen_enc = Assembler::AVX_256bit;
 4772       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4773       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4774       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4775     } else {
 4776       assert(vlen == 16, "sanity");
 4777       uint y_idx = ($idx$$constant >> 2) & 3;
 4778       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4779       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4780       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4781     }
 4782   %}
 4783   ins_pipe( pipe_slow );
 4784 %}
 4785 
 4786 #ifdef _LP64
 4787 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4788   predicate(Matcher::vector_length(n) == 2);
 4789   match(Set dst (VectorInsert (Binary dst val) idx));
 4790   effect(TEMP tmp);
 4791   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4792   ins_encode %{
 4793     assert(UseSSE >= 4, "sanity");
 4794     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4795     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4796 
 4797     __ movq($tmp$$Register, $val$$XMMRegister);
 4798     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4799   %}
 4800   ins_pipe( pipe_slow );
 4801 %}
 4802 
 4803 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4804   predicate(Matcher::vector_length(n) == 4);
 4805   match(Set dst (VectorInsert (Binary src val) idx));
 4806   effect(TEMP vtmp, TEMP tmp);
 4807   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4808   ins_encode %{
 4809     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4810     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4811 
 4812     uint x_idx = $idx$$constant & right_n_bits(1);
 4813     uint y_idx = ($idx$$constant >> 1) & 1;
 4814     int vlen_enc = Assembler::AVX_256bit;
 4815     __ movq($tmp$$Register, $val$$XMMRegister);
 4816     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4817     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4818     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4819   %}
 4820   ins_pipe( pipe_slow );
 4821 %}
 4822 
 4823 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4824   predicate(Matcher::vector_length(n) == 8);
 4825   match(Set dst (VectorInsert (Binary src val) idx));
 4826   effect(TEMP tmp, TEMP vtmp);
 4827   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4828   ins_encode %{
 4829     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4830     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4831 
 4832     uint x_idx = $idx$$constant & right_n_bits(1);
 4833     uint y_idx = ($idx$$constant >> 1) & 3;
 4834     __ movq($tmp$$Register, $val$$XMMRegister);
 4835     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4836     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4837     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4838   %}
 4839   ins_pipe( pipe_slow );
 4840 %}
 4841 #endif // _LP64
 4842 
 4843 // ====================REDUCTION ARITHMETIC=======================================
 4844 
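      // A reduction folds every lane of src2 into a single scalar and combines it with the
      // scalar src1: dst = src1 OP src2[0] OP src2[1] OP ... OP src2[n-1]. For example,
      // AddReductionVI with src1 = 10 and src2 = {1, 2, 3, 4} produces 20.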
 4845 // =======================Int Reduction==========================================
 4846 
 4847 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4848   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4849   match(Set dst (AddReductionVI src1 src2));
 4850   match(Set dst (MulReductionVI src1 src2));
 4851   match(Set dst (AndReductionV  src1 src2));
 4852   match(Set dst ( OrReductionV  src1 src2));
 4853   match(Set dst (XorReductionV  src1 src2));
 4854   match(Set dst (MinReductionV  src1 src2));
 4855   match(Set dst (MaxReductionV  src1 src2));
 4856   effect(TEMP vtmp1, TEMP vtmp2);
 4857   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4858   ins_encode %{
 4859     int opcode = this->ideal_Opcode();
 4860     int vlen = Matcher::vector_length(this, $src2);
 4861     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4862   %}
 4863   ins_pipe( pipe_slow );
 4864 %}
 4865 
 4866 // =======================Long Reduction==========================================
 4867 
 4868 #ifdef _LP64
 4869 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4870   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4871   match(Set dst (AddReductionVL src1 src2));
 4872   match(Set dst (MulReductionVL src1 src2));
 4873   match(Set dst (AndReductionV  src1 src2));
 4874   match(Set dst ( OrReductionV  src1 src2));
 4875   match(Set dst (XorReductionV  src1 src2));
 4876   match(Set dst (MinReductionV  src1 src2));
 4877   match(Set dst (MaxReductionV  src1 src2));
 4878   effect(TEMP vtmp1, TEMP vtmp2);
 4879   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4880   ins_encode %{
 4881     int opcode = this->ideal_Opcode();
 4882     int vlen = Matcher::vector_length(this, $src2);
 4883     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4884   %}
 4885   ins_pipe( pipe_slow );
 4886 %}
 4887 
 4888 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4889   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4890   match(Set dst (AddReductionVL src1 src2));
 4891   match(Set dst (MulReductionVL src1 src2));
 4892   match(Set dst (AndReductionV  src1 src2));
 4893   match(Set dst ( OrReductionV  src1 src2));
 4894   match(Set dst (XorReductionV  src1 src2));
 4895   match(Set dst (MinReductionV  src1 src2));
 4896   match(Set dst (MaxReductionV  src1 src2));
 4897   effect(TEMP vtmp1, TEMP vtmp2);
 4898   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4899   ins_encode %{
 4900     int opcode = this->ideal_Opcode();
 4901     int vlen = Matcher::vector_length(this, $src2);
 4902     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4903   %}
 4904   ins_pipe( pipe_slow );
 4905 %}
 4906 #endif // _LP64
 4907 
 4908 // =======================Float Reduction==========================================
 4909 
 4910 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4911   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4912   match(Set dst (AddReductionVF dst src));
 4913   match(Set dst (MulReductionVF dst src));
 4914   effect(TEMP dst, TEMP vtmp);
 4915   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4916   ins_encode %{
 4917     int opcode = this->ideal_Opcode();
 4918     int vlen = Matcher::vector_length(this, $src);
 4919     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4920   %}
 4921   ins_pipe( pipe_slow );
 4922 %}
 4923 
 4924 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4925   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4926   match(Set dst (AddReductionVF dst src));
 4927   match(Set dst (MulReductionVF dst src));
 4928   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4929   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4930   ins_encode %{
 4931     int opcode = this->ideal_Opcode();
 4932     int vlen = Matcher::vector_length(this, $src);
 4933     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4934   %}
 4935   ins_pipe( pipe_slow );
 4936 %}
 4937 
 4938 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4939   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4940   match(Set dst (AddReductionVF dst src));
 4941   match(Set dst (MulReductionVF dst src));
 4942   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4943   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4944   ins_encode %{
 4945     int opcode = this->ideal_Opcode();
 4946     int vlen = Matcher::vector_length(this, $src);
 4947     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4948   %}
 4949   ins_pipe( pipe_slow );
 4950 %}
 4951 
 4952 // =======================Double Reduction==========================================
 4953 
 4954 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4955   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4956   match(Set dst (AddReductionVD dst src));
 4957   match(Set dst (MulReductionVD dst src));
 4958   effect(TEMP dst, TEMP vtmp);
 4959   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4960   ins_encode %{
 4961     int opcode = this->ideal_Opcode();
 4962     int vlen = Matcher::vector_length(this, $src);
 4963     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4964   %}
 4965   ins_pipe( pipe_slow );
 4966 %}
 4967 
 4968 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4969   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4970   match(Set dst (AddReductionVD dst src));
 4971   match(Set dst (MulReductionVD dst src));
 4972   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4973   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4974   ins_encode %{
 4975     int opcode = this->ideal_Opcode();
 4976     int vlen = Matcher::vector_length(this, $src);
 4977     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4978   %}
 4979   ins_pipe( pipe_slow );
 4980 %}
 4981 
 4982 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4983   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4984   match(Set dst (AddReductionVD dst src));
 4985   match(Set dst (MulReductionVD dst src));
 4986   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4987   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4988   ins_encode %{
 4989     int opcode = this->ideal_Opcode();
 4990     int vlen = Matcher::vector_length(this, $src);
 4991     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4992   %}
 4993   ins_pipe( pipe_slow );
 4994 %}
 4995 
 4996 // =======================Byte Reduction==========================================
 4997 
 4998 #ifdef _LP64
 4999 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5000   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5001   match(Set dst (AddReductionVI src1 src2));
 5002   match(Set dst (AndReductionV  src1 src2));
 5003   match(Set dst ( OrReductionV  src1 src2));
 5004   match(Set dst (XorReductionV  src1 src2));
 5005   match(Set dst (MinReductionV  src1 src2));
 5006   match(Set dst (MaxReductionV  src1 src2));
 5007   effect(TEMP vtmp1, TEMP vtmp2);
 5008   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5009   ins_encode %{
 5010     int opcode = this->ideal_Opcode();
 5011     int vlen = Matcher::vector_length(this, $src2);
 5012     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5013   %}
 5014   ins_pipe( pipe_slow );
 5015 %}
 5016 
 5017 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5018   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5019   match(Set dst (AddReductionVI src1 src2));
 5020   match(Set dst (AndReductionV  src1 src2));
 5021   match(Set dst ( OrReductionV  src1 src2));
 5022   match(Set dst (XorReductionV  src1 src2));
 5023   match(Set dst (MinReductionV  src1 src2));
 5024   match(Set dst (MaxReductionV  src1 src2));
 5025   effect(TEMP vtmp1, TEMP vtmp2);
 5026   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5027   ins_encode %{
 5028     int opcode = this->ideal_Opcode();
 5029     int vlen = Matcher::vector_length(this, $src2);
 5030     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5031   %}
 5032   ins_pipe( pipe_slow );
 5033 %}
 5034 #endif // _LP64
 5035 
 5036 // =======================Short Reduction==========================================
 5037 
 5038 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5039   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5040   match(Set dst (AddReductionVI src1 src2));
 5041   match(Set dst (MulReductionVI src1 src2));
 5042   match(Set dst (AndReductionV  src1 src2));
 5043   match(Set dst ( OrReductionV  src1 src2));
 5044   match(Set dst (XorReductionV  src1 src2));
 5045   match(Set dst (MinReductionV  src1 src2));
 5046   match(Set dst (MaxReductionV  src1 src2));
 5047   effect(TEMP vtmp1, TEMP vtmp2);
 5048   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5049   ins_encode %{
 5050     int opcode = this->ideal_Opcode();
 5051     int vlen = Matcher::vector_length(this, $src2);
 5052     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5053   %}
 5054   ins_pipe( pipe_slow );
 5055 %}
 5056 
 5057 // =======================Mul Reduction==========================================
 5058 
 5059 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5060   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5061             Matcher::vector_length(n->in(2)) <= 32); // src2
 5062   match(Set dst (MulReductionVI src1 src2));
 5063   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5064   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5065   ins_encode %{
 5066     int opcode = this->ideal_Opcode();
 5067     int vlen = Matcher::vector_length(this, $src2);
 5068     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5069   %}
 5070   ins_pipe( pipe_slow );
 5071 %}
 5072 
 5073 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5074   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5075             Matcher::vector_length(n->in(2)) == 64); // src2
 5076   match(Set dst (MulReductionVI src1 src2));
 5077   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5078   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5079   ins_encode %{
 5080     int opcode = this->ideal_Opcode();
 5081     int vlen = Matcher::vector_length(this, $src2);
 5082     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5083   %}
 5084   ins_pipe( pipe_slow );
 5085 %}
 5086 
 5087 //--------------------Min/Max Float Reduction --------------------
 5088 // Float Min Reduction
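// For these patterns the predicate restricts the scalar input ($src1) to the
// identity value of the reduction (+Inf for min, -Inf for max), so it cannot
// affect the result and only the vector input needs to be reduced. The *_av
// variants below instead fold a live scalar accumulator ($dst) into the result.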
 5089 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5090                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5091   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5092             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5093              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5094             Matcher::vector_length(n->in(2)) == 2);
 5095   match(Set dst (MinReductionV src1 src2));
 5096   match(Set dst (MaxReductionV src1 src2));
 5097   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5098   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5099   ins_encode %{
 5100     assert(UseAVX > 0, "sanity");
 5101 
 5102     int opcode = this->ideal_Opcode();
 5103     int vlen = Matcher::vector_length(this, $src2);
 5104     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5105                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5106   %}
 5107   ins_pipe( pipe_slow );
 5108 %}
 5109 
 5110 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5111                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5112   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5113             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5114              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5115             Matcher::vector_length(n->in(2)) >= 4);
 5116   match(Set dst (MinReductionV src1 src2));
 5117   match(Set dst (MaxReductionV src1 src2));
 5118   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5119   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5120   ins_encode %{
 5121     assert(UseAVX > 0, "sanity");
 5122 
 5123     int opcode = this->ideal_Opcode();
 5124     int vlen = Matcher::vector_length(this, $src2);
 5125     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5126                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5127   %}
 5128   ins_pipe( pipe_slow );
 5129 %}
 5130 
 5131 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5132                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5133   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5134             Matcher::vector_length(n->in(2)) == 2);
 5135   match(Set dst (MinReductionV dst src));
 5136   match(Set dst (MaxReductionV dst src));
 5137   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5138   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5139   ins_encode %{
 5140     assert(UseAVX > 0, "sanity");
 5141 
 5142     int opcode = this->ideal_Opcode();
 5143     int vlen = Matcher::vector_length(this, $src);
 5144     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5145                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5146   %}
 5147   ins_pipe( pipe_slow );
 5148 %}
 5149 
 5150 
 5151 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5152                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5153   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5154             Matcher::vector_length(n->in(2)) >= 4);
 5155   match(Set dst (MinReductionV dst src));
 5156   match(Set dst (MaxReductionV dst src));
 5157   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5158   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5159   ins_encode %{
 5160     assert(UseAVX > 0, "sanity");
 5161 
 5162     int opcode = this->ideal_Opcode();
 5163     int vlen = Matcher::vector_length(this, $src);
 5164     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5165                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5166   %}
 5167   ins_pipe( pipe_slow );
 5168 %}
 5169 
 5170 
 5171 //--------------------Min Double Reduction --------------------
 5172 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5173                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5174                             rFlagsReg cr) %{
 5175   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5176             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5177              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5178             Matcher::vector_length(n->in(2)) == 2);
 5179   match(Set dst (MinReductionV src1 src2));
 5180   match(Set dst (MaxReductionV src1 src2));
 5181   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5182   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5183   ins_encode %{
 5184     assert(UseAVX > 0, "sanity");
 5185 
 5186     int opcode = this->ideal_Opcode();
 5187     int vlen = Matcher::vector_length(this, $src2);
 5188     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5189                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5190   %}
 5191   ins_pipe( pipe_slow );
 5192 %}
 5193 
 5194 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5195                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5196                            rFlagsReg cr) %{
 5197   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5198             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5199              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5200             Matcher::vector_length(n->in(2)) >= 4);
 5201   match(Set dst (MinReductionV src1 src2));
 5202   match(Set dst (MaxReductionV src1 src2));
 5203   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5204   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5205   ins_encode %{
 5206     assert(UseAVX > 0, "sanity");
 5207 
 5208     int opcode = this->ideal_Opcode();
 5209     int vlen = Matcher::vector_length(this, $src2);
 5210     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5211                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5212   %}
 5213   ins_pipe( pipe_slow );
 5214 %}
 5215 
 5216 
 5217 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5218                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5219                                rFlagsReg cr) %{
 5220   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5221             Matcher::vector_length(n->in(2)) == 2);
 5222   match(Set dst (MinReductionV dst src));
 5223   match(Set dst (MaxReductionV dst src));
 5224   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5225   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5226   ins_encode %{
 5227     assert(UseAVX > 0, "sanity");
 5228 
 5229     int opcode = this->ideal_Opcode();
 5230     int vlen = Matcher::vector_length(this, $src);
 5231     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5232                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5233   %}
 5234   ins_pipe( pipe_slow );
 5235 %}
 5236 
 5237 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5238                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5239                               rFlagsReg cr) %{
 5240   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5241             Matcher::vector_length(n->in(2)) >= 4);
 5242   match(Set dst (MinReductionV dst src));
 5243   match(Set dst (MaxReductionV dst src));
 5244   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5245   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5246   ins_encode %{
 5247     assert(UseAVX > 0, "sanity");
 5248 
 5249     int opcode = this->ideal_Opcode();
 5250     int vlen = Matcher::vector_length(this, $src);
 5251     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5252                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5253   %}
 5254   ins_pipe( pipe_slow );
 5255 %}
 5256 
 5257 // ====================VECTOR ARITHMETIC=======================================
 5258 
 5259 // --------------------------------- ADD --------------------------------------
 5260 
 5261 // Bytes vector add
 5262 instruct vaddB(vec dst, vec src) %{
 5263   predicate(UseAVX == 0);
 5264   match(Set dst (AddVB dst src));
 5265   format %{ "paddb   $dst,$src\t! add packedB" %}
 5266   ins_encode %{
 5267     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5268   %}
 5269   ins_pipe( pipe_slow );
 5270 %}
 5271 
 5272 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5273   predicate(UseAVX > 0);
 5274   match(Set dst (AddVB src1 src2));
 5275   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5276   ins_encode %{
 5277     int vlen_enc = vector_length_encoding(this);
 5278     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5279   %}
 5280   ins_pipe( pipe_slow );
 5281 %}
 5282 
 5283 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5284   predicate((UseAVX > 0) &&
 5285             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5286   match(Set dst (AddVB src (LoadVector mem)));
 5287   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5288   ins_encode %{
 5289     int vlen_enc = vector_length_encoding(this);
 5290     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5291   %}
 5292   ins_pipe( pipe_slow );
 5293 %}
 5294 
 5295 // Shorts/Chars vector add
 5296 instruct vaddS(vec dst, vec src) %{
 5297   predicate(UseAVX == 0);
 5298   match(Set dst (AddVS dst src));
 5299   format %{ "paddw   $dst,$src\t! add packedS" %}
 5300   ins_encode %{
 5301     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5302   %}
 5303   ins_pipe( pipe_slow );
 5304 %}
 5305 
 5306 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5307   predicate(UseAVX > 0);
 5308   match(Set dst (AddVS src1 src2));
 5309   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5310   ins_encode %{
 5311     int vlen_enc = vector_length_encoding(this);
 5312     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5313   %}
 5314   ins_pipe( pipe_slow );
 5315 %}
 5316 
 5317 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5318   predicate((UseAVX > 0) &&
 5319             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5320   match(Set dst (AddVS src (LoadVector mem)));
 5321   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5322   ins_encode %{
 5323     int vlen_enc = vector_length_encoding(this);
 5324     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5325   %}
 5326   ins_pipe( pipe_slow );
 5327 %}
 5328 
 5329 // Integers vector add
 5330 instruct vaddI(vec dst, vec src) %{
 5331   predicate(UseAVX == 0);
 5332   match(Set dst (AddVI dst src));
 5333   format %{ "paddd   $dst,$src\t! add packedI" %}
 5334   ins_encode %{
 5335     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5336   %}
 5337   ins_pipe( pipe_slow );
 5338 %}
 5339 
 5340 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5341   predicate(UseAVX > 0);
 5342   match(Set dst (AddVI src1 src2));
 5343   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5344   ins_encode %{
 5345     int vlen_enc = vector_length_encoding(this);
 5346     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5347   %}
 5348   ins_pipe( pipe_slow );
 5349 %}
 5350 
 5351 
 5352 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5353   predicate((UseAVX > 0) &&
 5354             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5355   match(Set dst (AddVI src (LoadVector mem)));
 5356   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5357   ins_encode %{
 5358     int vlen_enc = vector_length_encoding(this);
 5359     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5360   %}
 5361   ins_pipe( pipe_slow );
 5362 %}
 5363 
 5364 // Longs vector add
 5365 instruct vaddL(vec dst, vec src) %{
 5366   predicate(UseAVX == 0);
 5367   match(Set dst (AddVL dst src));
 5368   format %{ "paddq   $dst,$src\t! add packedL" %}
 5369   ins_encode %{
 5370     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5371   %}
 5372   ins_pipe( pipe_slow );
 5373 %}
 5374 
 5375 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5376   predicate(UseAVX > 0);
 5377   match(Set dst (AddVL src1 src2));
 5378   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5379   ins_encode %{
 5380     int vlen_enc = vector_length_encoding(this);
 5381     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5382   %}
 5383   ins_pipe( pipe_slow );
 5384 %}
 5385 
 5386 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5387   predicate((UseAVX > 0) &&
 5388             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5389   match(Set dst (AddVL src (LoadVector mem)));
 5390   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5391   ins_encode %{
 5392     int vlen_enc = vector_length_encoding(this);
 5393     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5394   %}
 5395   ins_pipe( pipe_slow );
 5396 %}
 5397 
 5398 // Floats vector add
 5399 instruct vaddF(vec dst, vec src) %{
 5400   predicate(UseAVX == 0);
 5401   match(Set dst (AddVF dst src));
 5402   format %{ "addps   $dst,$src\t! add packedF" %}
 5403   ins_encode %{
 5404     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5405   %}
 5406   ins_pipe( pipe_slow );
 5407 %}
 5408 
 5409 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5410   predicate(UseAVX > 0);
 5411   match(Set dst (AddVF src1 src2));
 5412   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5413   ins_encode %{
 5414     int vlen_enc = vector_length_encoding(this);
 5415     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5416   %}
 5417   ins_pipe( pipe_slow );
 5418 %}
 5419 
 5420 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5421   predicate((UseAVX > 0) &&
 5422             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5423   match(Set dst (AddVF src (LoadVector mem)));
 5424   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5425   ins_encode %{
 5426     int vlen_enc = vector_length_encoding(this);
 5427     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5428   %}
 5429   ins_pipe( pipe_slow );
 5430 %}
 5431 
 5432 // Doubles vector add
 5433 instruct vaddD(vec dst, vec src) %{
 5434   predicate(UseAVX == 0);
 5435   match(Set dst (AddVD dst src));
 5436   format %{ "addpd   $dst,$src\t! add packedD" %}
 5437   ins_encode %{
 5438     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5439   %}
 5440   ins_pipe( pipe_slow );
 5441 %}
 5442 
 5443 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5444   predicate(UseAVX > 0);
 5445   match(Set dst (AddVD src1 src2));
 5446   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5447   ins_encode %{
 5448     int vlen_enc = vector_length_encoding(this);
 5449     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5450   %}
 5451   ins_pipe( pipe_slow );
 5452 %}
 5453 
 5454 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5455   predicate((UseAVX > 0) &&
 5456             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5457   match(Set dst (AddVD src (LoadVector mem)));
 5458   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5459   ins_encode %{
 5460     int vlen_enc = vector_length_encoding(this);
 5461     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5462   %}
 5463   ins_pipe( pipe_slow );
 5464 %}
 5465 
 5466 // --------------------------------- SUB --------------------------------------
 5467 
 5468 // Bytes vector sub
 5469 instruct vsubB(vec dst, vec src) %{
 5470   predicate(UseAVX == 0);
 5471   match(Set dst (SubVB dst src));
 5472   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5473   ins_encode %{
 5474     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5475   %}
 5476   ins_pipe( pipe_slow );
 5477 %}
 5478 
 5479 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5480   predicate(UseAVX > 0);
 5481   match(Set dst (SubVB src1 src2));
 5482   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5483   ins_encode %{
 5484     int vlen_enc = vector_length_encoding(this);
 5485     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5486   %}
 5487   ins_pipe( pipe_slow );
 5488 %}
 5489 
 5490 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5491   predicate((UseAVX > 0) &&
 5492             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5493   match(Set dst (SubVB src (LoadVector mem)));
 5494   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5495   ins_encode %{
 5496     int vlen_enc = vector_length_encoding(this);
 5497     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5498   %}
 5499   ins_pipe( pipe_slow );
 5500 %}
 5501 
 5502 // Shorts/Chars vector sub
 5503 instruct vsubS(vec dst, vec src) %{
 5504   predicate(UseAVX == 0);
 5505   match(Set dst (SubVS dst src));
 5506   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5507   ins_encode %{
 5508     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5509   %}
 5510   ins_pipe( pipe_slow );
 5511 %}
 5512 
 5513 
 5514 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5515   predicate(UseAVX > 0);
 5516   match(Set dst (SubVS src1 src2));
 5517   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5518   ins_encode %{
 5519     int vlen_enc = vector_length_encoding(this);
 5520     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5521   %}
 5522   ins_pipe( pipe_slow );
 5523 %}
 5524 
 5525 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5526   predicate((UseAVX > 0) &&
 5527             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5528   match(Set dst (SubVS src (LoadVector mem)));
 5529   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5530   ins_encode %{
 5531     int vlen_enc = vector_length_encoding(this);
 5532     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5533   %}
 5534   ins_pipe( pipe_slow );
 5535 %}
 5536 
 5537 // Integers vector sub
 5538 instruct vsubI(vec dst, vec src) %{
 5539   predicate(UseAVX == 0);
 5540   match(Set dst (SubVI dst src));
 5541   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5542   ins_encode %{
 5543     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5544   %}
 5545   ins_pipe( pipe_slow );
 5546 %}
 5547 
 5548 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5549   predicate(UseAVX > 0);
 5550   match(Set dst (SubVI src1 src2));
 5551   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5552   ins_encode %{
 5553     int vlen_enc = vector_length_encoding(this);
 5554     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5555   %}
 5556   ins_pipe( pipe_slow );
 5557 %}
 5558 
 5559 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5560   predicate((UseAVX > 0) &&
 5561             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5562   match(Set dst (SubVI src (LoadVector mem)));
 5563   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5564   ins_encode %{
 5565     int vlen_enc = vector_length_encoding(this);
 5566     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5567   %}
 5568   ins_pipe( pipe_slow );
 5569 %}
 5570 
 5571 // Longs vector sub
 5572 instruct vsubL(vec dst, vec src) %{
 5573   predicate(UseAVX == 0);
 5574   match(Set dst (SubVL dst src));
 5575   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5576   ins_encode %{
 5577     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5578   %}
 5579   ins_pipe( pipe_slow );
 5580 %}
 5581 
 5582 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5583   predicate(UseAVX > 0);
 5584   match(Set dst (SubVL src1 src2));
 5585   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5586   ins_encode %{
 5587     int vlen_enc = vector_length_encoding(this);
 5588     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5589   %}
 5590   ins_pipe( pipe_slow );
 5591 %}
 5592 
 5593 
 5594 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5595   predicate((UseAVX > 0) &&
 5596             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5597   match(Set dst (SubVL src (LoadVector mem)));
 5598   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5599   ins_encode %{
 5600     int vlen_enc = vector_length_encoding(this);
 5601     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5602   %}
 5603   ins_pipe( pipe_slow );
 5604 %}
 5605 
 5606 // Floats vector sub
 5607 instruct vsubF(vec dst, vec src) %{
 5608   predicate(UseAVX == 0);
 5609   match(Set dst (SubVF dst src));
 5610   format %{ "subps   $dst,$src\t! sub packedF" %}
 5611   ins_encode %{
 5612     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5613   %}
 5614   ins_pipe( pipe_slow );
 5615 %}
 5616 
 5617 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5618   predicate(UseAVX > 0);
 5619   match(Set dst (SubVF src1 src2));
 5620   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5621   ins_encode %{
 5622     int vlen_enc = vector_length_encoding(this);
 5623     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5624   %}
 5625   ins_pipe( pipe_slow );
 5626 %}
 5627 
 5628 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5629   predicate((UseAVX > 0) &&
 5630             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5631   match(Set dst (SubVF src (LoadVector mem)));
 5632   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5633   ins_encode %{
 5634     int vlen_enc = vector_length_encoding(this);
 5635     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5636   %}
 5637   ins_pipe( pipe_slow );
 5638 %}
 5639 
 5640 // Doubles vector sub
 5641 instruct vsubD(vec dst, vec src) %{
 5642   predicate(UseAVX == 0);
 5643   match(Set dst (SubVD dst src));
 5644   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5645   ins_encode %{
 5646     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5647   %}
 5648   ins_pipe( pipe_slow );
 5649 %}
 5650 
 5651 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5652   predicate(UseAVX > 0);
 5653   match(Set dst (SubVD src1 src2));
 5654   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5655   ins_encode %{
 5656     int vlen_enc = vector_length_encoding(this);
 5657     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5658   %}
 5659   ins_pipe( pipe_slow );
 5660 %}
 5661 
 5662 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5663   predicate((UseAVX > 0) &&
 5664             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5665   match(Set dst (SubVD src (LoadVector mem)));
 5666   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5667   ins_encode %{
 5668     int vlen_enc = vector_length_encoding(this);
 5669     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5670   %}
 5671   ins_pipe( pipe_slow );
 5672 %}
 5673 
 5674 // --------------------------------- MUL --------------------------------------
 5675 
 5676 // Byte vector mul
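// There is no SSE/AVX instruction that multiplies packed bytes, so byte
// multiplication widens the elements to 16-bit lanes, multiplies with
// pmullw/vpmullw, and keeps only the low byte of each 16-bit product.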
 5677 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5678   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5679   match(Set dst (MulVB src1 src2));
 5680   effect(TEMP dst, TEMP xtmp);
 5681   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5682   ins_encode %{
 5683     assert(UseSSE > 3, "required");
 5684     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5685     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5686     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5687     __ psllw($dst$$XMMRegister, 8);
 5688     __ psrlw($dst$$XMMRegister, 8);
 5689     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5690   %}
 5691   ins_pipe( pipe_slow );
 5692 %}
 5693 
 5694 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5695   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5696   match(Set dst (MulVB src1 src2));
 5697   effect(TEMP dst, TEMP xtmp);
 5698   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5699   ins_encode %{
 5700     assert(UseSSE > 3, "required");
 5701     // Odd-index elements
 5702     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5703     __ psrlw($dst$$XMMRegister, 8);
 5704     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5705     __ psrlw($xtmp$$XMMRegister, 8);
 5706     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5707     __ psllw($dst$$XMMRegister, 8);
 5708     // Even-index elements
 5709     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5710     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5711     __ psllw($xtmp$$XMMRegister, 8);
 5712     __ psrlw($xtmp$$XMMRegister, 8);
 5713     // Combine
 5714     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5715   %}
 5716   ins_pipe( pipe_slow );
 5717 %}
 5718 
 5719 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5720   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5721   match(Set dst (MulVB src1 src2));
 5722   effect(TEMP xtmp1, TEMP xtmp2);
 5723   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5724   ins_encode %{
 5725     int vlen_enc = vector_length_encoding(this);
 5726     // Odd-index elements
 5727     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5728     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5729     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5730     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5731     // Even-index elements
 5732     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5733     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5734     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5735     // Combine
 5736     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5737   %}
 5738   ins_pipe( pipe_slow );
 5739 %}
 5740 
 5741 // Shorts/Chars vector mul
 5742 instruct vmulS(vec dst, vec src) %{
 5743   predicate(UseAVX == 0);
 5744   match(Set dst (MulVS dst src));
 5745   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5746   ins_encode %{
 5747     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5748   %}
 5749   ins_pipe( pipe_slow );
 5750 %}
 5751 
 5752 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5753   predicate(UseAVX > 0);
 5754   match(Set dst (MulVS src1 src2));
 5755   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5756   ins_encode %{
 5757     int vlen_enc = vector_length_encoding(this);
 5758     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5759   %}
 5760   ins_pipe( pipe_slow );
 5761 %}
 5762 
 5763 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5764   predicate((UseAVX > 0) &&
 5765             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5766   match(Set dst (MulVS src (LoadVector mem)));
 5767   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5768   ins_encode %{
 5769     int vlen_enc = vector_length_encoding(this);
 5770     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5771   %}
 5772   ins_pipe( pipe_slow );
 5773 %}
 5774 
 5775 // Integers vector mul
 5776 instruct vmulI(vec dst, vec src) %{
 5777   predicate(UseAVX == 0);
 5778   match(Set dst (MulVI dst src));
 5779   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5780   ins_encode %{
 5781     assert(UseSSE > 3, "required");
 5782     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5783   %}
 5784   ins_pipe( pipe_slow );
 5785 %}
 5786 
 5787 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5788   predicate(UseAVX > 0);
 5789   match(Set dst (MulVI src1 src2));
 5790   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5791   ins_encode %{
 5792     int vlen_enc = vector_length_encoding(this);
 5793     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5794   %}
 5795   ins_pipe( pipe_slow );
 5796 %}
 5797 
 5798 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5799   predicate((UseAVX > 0) &&
 5800             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5801   match(Set dst (MulVI src (LoadVector mem)));
 5802   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5803   ins_encode %{
 5804     int vlen_enc = vector_length_encoding(this);
 5805     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5806   %}
 5807   ins_pipe( pipe_slow );
 5808 %}
 5809 
 5810 // Longs vector mul
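// A packed 64x64->64 multiply (evpmullq) requires AVX-512DQ (plus AVX-512VL for
// vectors shorter than 512 bits). Otherwise the low 64 bits of each product are
// assembled from 32-bit multiplies: for a = (a_hi << 32) + a_lo and
// b = (b_hi << 32) + b_lo,
//   (a * b) mod 2^64 == a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32)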
 5811 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5812   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5813              VM_Version::supports_avx512dq()) ||
 5814             VM_Version::supports_avx512vldq());
 5815   match(Set dst (MulVL src1 src2));
 5816   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5817   ins_encode %{
 5818     assert(UseAVX > 2, "required");
 5819     int vlen_enc = vector_length_encoding(this);
 5820     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5821   %}
 5822   ins_pipe( pipe_slow );
 5823 %}
 5824 
 5825 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5826   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5827              VM_Version::supports_avx512dq()) ||
 5828             (Matcher::vector_length_in_bytes(n) > 8 &&
 5829              VM_Version::supports_avx512vldq()));
 5830   match(Set dst (MulVL src (LoadVector mem)));
 5831   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5832   ins_encode %{
 5833     assert(UseAVX > 2, "required");
 5834     int vlen_enc = vector_length_encoding(this);
 5835     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5836   %}
 5837   ins_pipe( pipe_slow );
 5838 %}
 5839 
 5840 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5841   predicate(UseAVX == 0);
 5842   match(Set dst (MulVL src1 src2));
 5843   effect(TEMP dst, TEMP xtmp);
 5844   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5845   ins_encode %{
 5846     assert(VM_Version::supports_sse4_1(), "required");
    // Compute the lo*hi cross products; only the low 32 bits of each product are needed
 5848     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5849     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5850     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5851     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5852     __ psllq($dst$$XMMRegister, 32);
 5853     // Get the lo-lo products
 5854     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5855     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5856     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5857   %}
 5858   ins_pipe( pipe_slow );
 5859 %}
 5860 
 5861 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5862   predicate(UseAVX > 0 &&
 5863             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5864               !VM_Version::supports_avx512dq()) ||
 5865              (Matcher::vector_length_in_bytes(n) < 64 &&
 5866               !VM_Version::supports_avx512vldq())));
 5867   match(Set dst (MulVL src1 src2));
 5868   effect(TEMP xtmp1, TEMP xtmp2);
 5869   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5870   ins_encode %{
 5871     int vlen_enc = vector_length_encoding(this);
    // Compute the lo*hi cross products; only the low 32 bits of each product are needed
 5873     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5874     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5875     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5876     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5877     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5878     // Get the lo-lo products
 5879     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5880     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5881   %}
 5882   ins_pipe( pipe_slow );
 5883 %}
 5884 
 5885 // Floats vector mul
 5886 instruct vmulF(vec dst, vec src) %{
 5887   predicate(UseAVX == 0);
 5888   match(Set dst (MulVF dst src));
 5889   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5890   ins_encode %{
 5891     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5892   %}
 5893   ins_pipe( pipe_slow );
 5894 %}
 5895 
 5896 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5897   predicate(UseAVX > 0);
 5898   match(Set dst (MulVF src1 src2));
 5899   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5900   ins_encode %{
 5901     int vlen_enc = vector_length_encoding(this);
 5902     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5903   %}
 5904   ins_pipe( pipe_slow );
 5905 %}
 5906 
 5907 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5908   predicate((UseAVX > 0) &&
 5909             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5910   match(Set dst (MulVF src (LoadVector mem)));
 5911   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5912   ins_encode %{
 5913     int vlen_enc = vector_length_encoding(this);
 5914     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5915   %}
 5916   ins_pipe( pipe_slow );
 5917 %}
 5918 
 5919 // Doubles vector mul
 5920 instruct vmulD(vec dst, vec src) %{
 5921   predicate(UseAVX == 0);
 5922   match(Set dst (MulVD dst src));
 5923   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5924   ins_encode %{
 5925     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5926   %}
 5927   ins_pipe( pipe_slow );
 5928 %}
 5929 
 5930 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5931   predicate(UseAVX > 0);
 5932   match(Set dst (MulVD src1 src2));
 5933   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5934   ins_encode %{
 5935     int vlen_enc = vector_length_encoding(this);
 5936     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5937   %}
 5938   ins_pipe( pipe_slow );
 5939 %}
 5940 
 5941 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5942   predicate((UseAVX > 0) &&
 5943             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5944   match(Set dst (MulVD src (LoadVector mem)));
 5945   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5946   ins_encode %{
 5947     int vlen_enc = vector_length_encoding(this);
 5948     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5949   %}
 5950   ins_pipe( pipe_slow );
 5951 %}
 5952 
 5953 // --------------------------------- DIV --------------------------------------
 5954 
 5955 // Floats vector div
 5956 instruct vdivF(vec dst, vec src) %{
 5957   predicate(UseAVX == 0);
 5958   match(Set dst (DivVF dst src));
 5959   format %{ "divps   $dst,$src\t! div packedF" %}
 5960   ins_encode %{
 5961     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 5962   %}
 5963   ins_pipe( pipe_slow );
 5964 %}
 5965 
 5966 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 5967   predicate(UseAVX > 0);
 5968   match(Set dst (DivVF src1 src2));
 5969   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 5970   ins_encode %{
 5971     int vlen_enc = vector_length_encoding(this);
 5972     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5973   %}
 5974   ins_pipe( pipe_slow );
 5975 %}
 5976 
 5977 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 5978   predicate((UseAVX > 0) &&
 5979             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5980   match(Set dst (DivVF src (LoadVector mem)));
 5981   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 5982   ins_encode %{
 5983     int vlen_enc = vector_length_encoding(this);
 5984     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5985   %}
 5986   ins_pipe( pipe_slow );
 5987 %}
 5988 
 5989 // Doubles vector div
 5990 instruct vdivD(vec dst, vec src) %{
 5991   predicate(UseAVX == 0);
 5992   match(Set dst (DivVD dst src));
 5993   format %{ "divpd   $dst,$src\t! div packedD" %}
 5994   ins_encode %{
 5995     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 5996   %}
 5997   ins_pipe( pipe_slow );
 5998 %}
 5999 
 6000 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6001   predicate(UseAVX > 0);
 6002   match(Set dst (DivVD src1 src2));
 6003   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6004   ins_encode %{
 6005     int vlen_enc = vector_length_encoding(this);
 6006     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6007   %}
 6008   ins_pipe( pipe_slow );
 6009 %}
 6010 
 6011 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6012   predicate((UseAVX > 0) &&
 6013             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6014   match(Set dst (DivVD src (LoadVector mem)));
 6015   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6016   ins_encode %{
 6017     int vlen_enc = vector_length_encoding(this);
 6018     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6019   %}
 6020   ins_pipe( pipe_slow );
 6021 %}
 6022 
 6023 // ------------------------------ MinMax ---------------------------------------
 6024 
 6025 // Byte, Short, Int vector Min/Max
 6026 instruct minmax_reg_sse(vec dst, vec src) %{
 6027   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6028             UseAVX == 0);
 6029   match(Set dst (MinV dst src));
 6030   match(Set dst (MaxV dst src));
 6031   format %{ "vector_minmax  $dst,$src\t!  " %}
 6032   ins_encode %{
 6033     assert(UseSSE >= 4, "required");
 6034 
 6035     int opcode = this->ideal_Opcode();
 6036     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6037     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6038   %}
 6039   ins_pipe( pipe_slow );
 6040 %}
 6041 
 6042 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6043   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6044             UseAVX > 0);
 6045   match(Set dst (MinV src1 src2));
 6046   match(Set dst (MaxV src1 src2));
 6047   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6048   ins_encode %{
 6049     int opcode = this->ideal_Opcode();
 6050     int vlen_enc = vector_length_encoding(this);
 6051     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6052 
 6053     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6054   %}
 6055   ins_pipe( pipe_slow );
 6056 %}
 6057 
 6058 // Long vector Min/Max
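// Packed 64-bit min/max instructions (vpminsq/vpmaxsq) are EVEX-encoded only,
// so the SSE/AVX variants below fall back to compare-and-blend sequences in the
// macro assembler (the SSE form reserves xmm0 for blendvpd's implicit mask).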
 6059 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6060   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6061             UseAVX == 0);
 6062   match(Set dst (MinV dst src));
 6063   match(Set dst (MaxV src dst));
 6064   effect(TEMP dst, TEMP tmp);
 6065   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6066   ins_encode %{
 6067     assert(UseSSE >= 4, "required");
 6068 
 6069     int opcode = this->ideal_Opcode();
 6070     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6071     assert(elem_bt == T_LONG, "sanity");
 6072 
 6073     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6074   %}
 6075   ins_pipe( pipe_slow );
 6076 %}
 6077 
 6078 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6079   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6080             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6081   match(Set dst (MinV src1 src2));
 6082   match(Set dst (MaxV src1 src2));
 6083   effect(TEMP dst);
 6084   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6085   ins_encode %{
 6086     int vlen_enc = vector_length_encoding(this);
 6087     int opcode = this->ideal_Opcode();
 6088     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6089     assert(elem_bt == T_LONG, "sanity");
 6090 
 6091     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6092   %}
 6093   ins_pipe( pipe_slow );
 6094 %}
 6095 
 6096 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6097   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6098             Matcher::vector_element_basic_type(n) == T_LONG);
 6099   match(Set dst (MinV src1 src2));
 6100   match(Set dst (MaxV src1 src2));
 6101   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6102   ins_encode %{
 6103     assert(UseAVX > 2, "required");
 6104 
 6105     int vlen_enc = vector_length_encoding(this);
 6106     int opcode = this->ideal_Opcode();
 6107     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6108     assert(elem_bt == T_LONG, "sanity");
 6109 
 6110     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6111   %}
 6112   ins_pipe( pipe_slow );
 6113 %}
 6114 
 6115 // Float/Double vector Min/Max
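// Java min/max semantics differ from vminps/vmaxps: NaN must propagate from
// either input and -0.0 is treated as smaller than +0.0, so the macro assembler
// emits a compare/blend sequence instead of a single instruction.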
 6116 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6117   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6118             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6119             UseAVX > 0);
 6120   match(Set dst (MinV a b));
 6121   match(Set dst (MaxV a b));
 6122   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6123   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6124   ins_encode %{
 6125     assert(UseAVX > 0, "required");
 6126 
 6127     int opcode = this->ideal_Opcode();
 6128     int vlen_enc = vector_length_encoding(this);
 6129     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6130 
 6131     __ vminmax_fp(opcode, elem_bt,
 6132                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6133                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6134   %}
 6135   ins_pipe( pipe_slow );
 6136 %}
 6137 
 6138 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6139   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6140             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6141   match(Set dst (MinV a b));
 6142   match(Set dst (MaxV a b));
 6143   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6144   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6145   ins_encode %{
 6146     assert(UseAVX > 2, "required");
 6147 
 6148     int opcode = this->ideal_Opcode();
 6149     int vlen_enc = vector_length_encoding(this);
 6150     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6151 
 6152     __ evminmax_fp(opcode, elem_bt,
 6153                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6154                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6155   %}
 6156   ins_pipe( pipe_slow );
 6157 %}
 6158 
 6159 // --------------------------------- Signum/CopySign ---------------------------
 6160 
 6161 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6162   match(Set dst (SignumF dst (Binary zero one)));
 6163   effect(KILL cr);
 6164   format %{ "signumF $dst, $dst" %}
 6165   ins_encode %{
 6166     int opcode = this->ideal_Opcode();
 6167     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6168   %}
 6169   ins_pipe( pipe_slow );
 6170 %}
 6171 
 6172 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6173   match(Set dst (SignumD dst (Binary zero one)));
 6174   effect(KILL cr);
 6175   format %{ "signumD $dst, $dst" %}
 6176   ins_encode %{
 6177     int opcode = this->ideal_Opcode();
 6178     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6179   %}
 6180   ins_pipe( pipe_slow );
 6181 %}
 6182 
 6183 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6184   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6185   match(Set dst (SignumVF src (Binary zero one)));
 6186   match(Set dst (SignumVD src (Binary zero one)));
 6187   effect(TEMP dst, TEMP xtmp1);
 6188   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6189   ins_encode %{
 6190     int opcode = this->ideal_Opcode();
 6191     int vec_enc = vector_length_encoding(this);
 6192     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6193                          $xtmp1$$XMMRegister, vec_enc);
 6194   %}
 6195   ins_pipe( pipe_slow );
 6196 %}
 6197 
 6198 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6199   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6200   match(Set dst (SignumVF src (Binary zero one)));
 6201   match(Set dst (SignumVD src (Binary zero one)));
 6202   effect(TEMP dst, TEMP ktmp1);
 6203   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6204   ins_encode %{
 6205     int opcode = this->ideal_Opcode();
 6206     int vec_enc = vector_length_encoding(this);
 6207     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6208                           $ktmp1$$KRegister, vec_enc);
 6209   %}
 6210   ins_pipe( pipe_slow );
 6211 %}
 6212 
 6213 // ---------------------------------------
 6214 // For copySign use 0xE4 as writemask for vpternlog
 6215 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6216 // C (xmm2) is set to 0x7FFFFFFF
 6217 // Wherever xmm2 is 0, we want to pick from B (sign)
 6218 // Wherever xmm2 is 1, we want to pick from A (src)
 6219 //
 6220 // A B C Result
 6221 // 0 0 0 0
 6222 // 0 0 1 0
 6223 // 0 1 0 1
 6224 // 0 1 1 0
 6225 // 1 0 0 0
 6226 // 1 0 1 1
 6227 // 1 1 0 1
 6228 // 1 1 1 1
 6229 //
// Result going from high bit to low bit is 0b11100100 = 0xe4
 6231 // ---------------------------------------
 6232 
 6233 #ifdef _LP64
 6234 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6235   match(Set dst (CopySignF dst src));
 6236   effect(TEMP tmp1, TEMP tmp2);
 6237   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6238   ins_encode %{
 6239     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6240     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6241     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6242   %}
 6243   ins_pipe( pipe_slow );
 6244 %}
 6245 
 6246 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6247   match(Set dst (CopySignD dst (Binary src zero)));
 6248   ins_cost(100);
 6249   effect(TEMP tmp1, TEMP tmp2);
 6250   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6251   ins_encode %{
 6252     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6253     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6254     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6255   %}
 6256   ins_pipe( pipe_slow );
 6257 %}
 6258 
 6259 #endif // _LP64
 6260 
 6261 //----------------------------- CompressBits/ExpandBits ------------------------
 6262 
 6263 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6264   predicate(n->bottom_type()->isa_int());
 6265   match(Set dst (CompressBits src mask));
 6266   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6267   ins_encode %{
 6268     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6269   %}
 6270   ins_pipe( pipe_slow );
 6271 %}
 6272 
 6273 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6274   predicate(n->bottom_type()->isa_int());
 6275   match(Set dst (ExpandBits src mask));
 6276   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6277   ins_encode %{
 6278     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6279   %}
 6280   ins_pipe( pipe_slow );
 6281 %}
 6282 
 6283 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6284   predicate(n->bottom_type()->isa_int());
 6285   match(Set dst (CompressBits src (LoadI mask)));
 6286   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6287   ins_encode %{
 6288     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6289   %}
 6290   ins_pipe( pipe_slow );
 6291 %}
 6292 
 6293 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6294   predicate(n->bottom_type()->isa_int());
 6295   match(Set dst (ExpandBits src (LoadI mask)));
 6296   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6297   ins_encode %{
 6298     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6299   %}
 6300   ins_pipe( pipe_slow );
 6301 %}
 6302 
 6303 // --------------------------------- Sqrt --------------------------------------
 6304 
 6305 instruct vsqrtF_reg(vec dst, vec src) %{
 6306   match(Set dst (SqrtVF src));
 6307   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6308   ins_encode %{
 6309     assert(UseAVX > 0, "required");
 6310     int vlen_enc = vector_length_encoding(this);
 6311     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6312   %}
 6313   ins_pipe( pipe_slow );
 6314 %}
 6315 
 6316 instruct vsqrtF_mem(vec dst, memory mem) %{
 6317   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6318   match(Set dst (SqrtVF (LoadVector mem)));
 6319   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6320   ins_encode %{
 6321     assert(UseAVX > 0, "required");
 6322     int vlen_enc = vector_length_encoding(this);
 6323     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6324   %}
 6325   ins_pipe( pipe_slow );
 6326 %}
 6327 
 6328 // Floating point vector sqrt
 6329 instruct vsqrtD_reg(vec dst, vec src) %{
 6330   match(Set dst (SqrtVD src));
 6331   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6332   ins_encode %{
 6333     assert(UseAVX > 0, "required");
 6334     int vlen_enc = vector_length_encoding(this);
 6335     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6336   %}
 6337   ins_pipe( pipe_slow );
 6338 %}
 6339 
 6340 instruct vsqrtD_mem(vec dst, memory mem) %{
 6341   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6342   match(Set dst (SqrtVD (LoadVector mem)));
 6343   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6344   ins_encode %{
 6345     assert(UseAVX > 0, "required");
 6346     int vlen_enc = vector_length_encoding(this);
 6347     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6348   %}
 6349   ins_pipe( pipe_slow );
 6350 %}
 6351 
 6352 // ------------------------------ Shift ---------------------------------------
 6353 
 6354 // Left and right shift count vectors are the same on x86
// (only the low 64 bits of the xmm register are used as the count).
 6356 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6357   match(Set dst (LShiftCntV cnt));
 6358   match(Set dst (RShiftCntV cnt));
 6359   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6360   ins_encode %{
 6361     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6362   %}
 6363   ins_pipe( pipe_slow );
 6364 %}
 6365 
 6366 // Byte vector shift
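// x86 has no byte-granularity shifts, so byte shifts sign/zero-extend the bytes
// to 16-bit lanes, shift at word granularity, mask each lane back to its low
// byte, and pack the results back to bytes.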
 6367 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6368   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6369   match(Set dst ( LShiftVB src shift));
 6370   match(Set dst ( RShiftVB src shift));
 6371   match(Set dst (URShiftVB src shift));
 6372   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6373   format %{"vector_byte_shift $dst,$src,$shift" %}
 6374   ins_encode %{
 6375     assert(UseSSE > 3, "required");
 6376     int opcode = this->ideal_Opcode();
 6377     bool sign = (opcode != Op_URShiftVB);
 6378     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6379     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6380     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6381     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6382     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6383   %}
 6384   ins_pipe( pipe_slow );
 6385 %}
 6386 
 6387 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6388   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6389             UseAVX <= 1);
 6390   match(Set dst ( LShiftVB src shift));
 6391   match(Set dst ( RShiftVB src shift));
 6392   match(Set dst (URShiftVB src shift));
 6393   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6394   format %{"vector_byte_shift $dst,$src,$shift" %}
 6395   ins_encode %{
 6396     assert(UseSSE > 3, "required");
 6397     int opcode = this->ideal_Opcode();
 6398     bool sign = (opcode != Op_URShiftVB);
 6399     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6400     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6401     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6402     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6403     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6404     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6405     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6406     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6407     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6408   %}
 6409   ins_pipe( pipe_slow );
 6410 %}
 6411 
 6412 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6413   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6414             UseAVX > 1);
 6415   match(Set dst ( LShiftVB src shift));
 6416   match(Set dst ( RShiftVB src shift));
 6417   match(Set dst (URShiftVB src shift));
 6418   effect(TEMP dst, TEMP tmp);
 6419   format %{"vector_byte_shift $dst,$src,$shift" %}
 6420   ins_encode %{
 6421     int opcode = this->ideal_Opcode();
 6422     bool sign = (opcode != Op_URShiftVB);
 6423     int vlen_enc = Assembler::AVX_256bit;
 6424     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6425     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6426     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6427     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6428     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6429   %}
 6430   ins_pipe( pipe_slow );
 6431 %}
 6432 
 6433 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6434   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6435   match(Set dst ( LShiftVB src shift));
 6436   match(Set dst ( RShiftVB src shift));
 6437   match(Set dst (URShiftVB src shift));
 6438   effect(TEMP dst, TEMP tmp);
 6439   format %{"vector_byte_shift $dst,$src,$shift" %}
 6440   ins_encode %{
 6441     assert(UseAVX > 1, "required");
 6442     int opcode = this->ideal_Opcode();
 6443     bool sign = (opcode != Op_URShiftVB);
 6444     int vlen_enc = Assembler::AVX_256bit;
 6445     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6446     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6447     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6448     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6449     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6450     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6451     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6452     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6453     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6454   %}
 6455   ins_pipe( pipe_slow );
 6456 %}
 6457 
 6458 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6459   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6460   match(Set dst ( LShiftVB src shift));
 6461   match(Set dst  (RShiftVB src shift));
 6462   match(Set dst (URShiftVB src shift));
 6463   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6464   format %{"vector_byte_shift $dst,$src,$shift" %}
 6465   ins_encode %{
 6466     assert(UseAVX > 2, "required");
 6467     int opcode = this->ideal_Opcode();
 6468     bool sign = (opcode != Op_URShiftVB);
 6469     int vlen_enc = Assembler::AVX_512bit;
 6470     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6471     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6472     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6473     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6474     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6475     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6476     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6477     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6478     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6479     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6480     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6481     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6482   %}
 6483   ins_pipe( pipe_slow );
 6484 %}
 6485 
// A logical right shift of a short vector would produce an incorrect Java result
// for negative data, because Java code converts the short value into an int with
// sign extension before the shift. Char vectors are fine, since chars are
// unsigned values.
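// For example (a hedged illustration, not part of any rule below): in Java,
//   short s = (short) 0x8000;      // -32768
//   short r = (short) (s >>> 1);   // s widens to the int 0xFFFF8000, so the result
//                                  // is (short) 0xC000 (-16384),
// whereas a packed 16-bit logical right shift of 0x8000 by 1 would give 0x4000.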
// Shorts/Chars vector shift
 6491 instruct vshiftS(vec dst, vec src, vec shift) %{
 6492   predicate(!n->as_ShiftV()->is_var_shift());
 6493   match(Set dst ( LShiftVS src shift));
 6494   match(Set dst ( RShiftVS src shift));
 6495   match(Set dst (URShiftVS src shift));
 6496   effect(TEMP dst, USE src, USE shift);
 6497   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6498   ins_encode %{
 6499     int opcode = this->ideal_Opcode();
 6500     if (UseAVX > 0) {
 6501       int vlen_enc = vector_length_encoding(this);
 6502       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6503     } else {
 6504       int vlen = Matcher::vector_length(this);
 6505       if (vlen == 2) {
 6506         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6507         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6508       } else if (vlen == 4) {
 6509         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6510         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6511       } else {
 6512         assert (vlen == 8, "sanity");
 6513         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6514         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6515       }
 6516     }
 6517   %}
 6518   ins_pipe( pipe_slow );
 6519 %}
 6520 
// Integers vector shift
 6522 instruct vshiftI(vec dst, vec src, vec shift) %{
 6523   predicate(!n->as_ShiftV()->is_var_shift());
 6524   match(Set dst ( LShiftVI src shift));
 6525   match(Set dst ( RShiftVI src shift));
 6526   match(Set dst (URShiftVI src shift));
 6527   effect(TEMP dst, USE src, USE shift);
 6528   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6529   ins_encode %{
 6530     int opcode = this->ideal_Opcode();
 6531     if (UseAVX > 0) {
 6532       int vlen_enc = vector_length_encoding(this);
 6533       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6534     } else {
 6535       int vlen = Matcher::vector_length(this);
 6536       if (vlen == 2) {
 6537         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6538         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6539       } else {
 6540         assert(vlen == 4, "sanity");
 6541         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6542         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6543       }
 6544     }
 6545   %}
 6546   ins_pipe( pipe_slow );
 6547 %}
 6548 
// Integers vector constant shift
 6550 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6551   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6552   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6553   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6554   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6555   ins_encode %{
 6556     int opcode = this->ideal_Opcode();
 6557     if (UseAVX > 0) {
 6558       int vector_len = vector_length_encoding(this);
 6559       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6560     } else {
 6561       int vlen = Matcher::vector_length(this);
 6562       if (vlen == 2) {
 6563         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6564         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6565       } else {
 6566         assert(vlen == 4, "sanity");
 6567         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6568         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6569       }
 6570     }
 6571   %}
 6572   ins_pipe( pipe_slow );
 6573 %}
 6574 
 6575 // Longs vector shift
 6576 instruct vshiftL(vec dst, vec src, vec shift) %{
 6577   predicate(!n->as_ShiftV()->is_var_shift());
 6578   match(Set dst ( LShiftVL src shift));
 6579   match(Set dst (URShiftVL src shift));
 6580   effect(TEMP dst, USE src, USE shift);
 6581   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6582   ins_encode %{
 6583     int opcode = this->ideal_Opcode();
 6584     if (UseAVX > 0) {
 6585       int vlen_enc = vector_length_encoding(this);
 6586       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6587     } else {
 6588       assert(Matcher::vector_length(this) == 2, "");
 6589       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6590       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6591     }
 6592   %}
 6593   ins_pipe( pipe_slow );
 6594 %}
 6595 
 6596 // Longs vector constant shift
 6597 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6598   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6599   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6600   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6601   ins_encode %{
 6602     int opcode = this->ideal_Opcode();
 6603     if (UseAVX > 0) {
 6604       int vector_len = vector_length_encoding(this);
 6605       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6606     } else {
 6607       assert(Matcher::vector_length(this) == 2, "");
 6608       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6609       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6610     }
 6611   %}
 6612   ins_pipe( pipe_slow );
 6613 %}
 6614 
// ------------------------- Arithmetic Right Shift ----------------------------
 6616 // Long vector arithmetic right shift
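// Pre-AVX-512 there is no packed arithmetic right shift for 64-bit lanes (vpsraq), so
// the rule below emulates it: shift the data logically, shift the sign-bit mask by the
// same count to get m, then compute (x >>> s ^ m) - m, which restores the sign
// extension (pxor/psubq in the SSE path, vpxor/vpsubq in the AVX2 path).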
 6617 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6618   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6619   match(Set dst (RShiftVL src shift));
 6620   effect(TEMP dst, TEMP tmp);
 6621   format %{ "vshiftq $dst,$src,$shift" %}
 6622   ins_encode %{
 6623     uint vlen = Matcher::vector_length(this);
 6624     if (vlen == 2) {
 6625       assert(UseSSE >= 2, "required");
 6626       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6627       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6628       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6629       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6630       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6631       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6632     } else {
 6633       assert(vlen == 4, "sanity");
 6634       assert(UseAVX > 1, "required");
 6635       int vlen_enc = Assembler::AVX_256bit;
 6636       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6637       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6638       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6639       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6640       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6641     }
 6642   %}
 6643   ins_pipe( pipe_slow );
 6644 %}
 6645 
 6646 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6647   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6648   match(Set dst (RShiftVL src shift));
 6649   format %{ "vshiftq $dst,$src,$shift" %}
 6650   ins_encode %{
 6651     int vlen_enc = vector_length_encoding(this);
 6652     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6653   %}
 6654   ins_pipe( pipe_slow );
 6655 %}
 6656 
 6657 // ------------------- Variable Shift -----------------------------
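// In the variable-shift rules the shift operand carries a separate count per element
// (is_var_shift()); these broadly map to the vpsllv*/vpsrlv*/vpsrav* family on
// AVX2/AVX-512, with byte and (pre-AVX-512BW) short elements widened to a larger
// type first and narrowed back afterwards.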
 6658 // Byte variable shift
 6659 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6660   predicate(Matcher::vector_length(n) <= 8 &&
 6661             n->as_ShiftV()->is_var_shift() &&
 6662             !VM_Version::supports_avx512bw());
 6663   match(Set dst ( LShiftVB src shift));
 6664   match(Set dst ( RShiftVB src shift));
 6665   match(Set dst (URShiftVB src shift));
 6666   effect(TEMP dst, TEMP vtmp);
 6667   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6668   ins_encode %{
 6669     assert(UseAVX >= 2, "required");
 6670 
 6671     int opcode = this->ideal_Opcode();
 6672     int vlen_enc = Assembler::AVX_128bit;
 6673     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6674     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6675   %}
 6676   ins_pipe( pipe_slow );
 6677 %}
 6678 
 6679 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6680   predicate(Matcher::vector_length(n) == 16 &&
 6681             n->as_ShiftV()->is_var_shift() &&
 6682             !VM_Version::supports_avx512bw());
 6683   match(Set dst ( LShiftVB src shift));
 6684   match(Set dst ( RShiftVB src shift));
 6685   match(Set dst (URShiftVB src shift));
 6686   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6687   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6688   ins_encode %{
 6689     assert(UseAVX >= 2, "required");
 6690 
 6691     int opcode = this->ideal_Opcode();
 6692     int vlen_enc = Assembler::AVX_128bit;
 6693     // Shift lower half and get word result in dst
 6694     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6695 
 6696     // Shift upper half and get word result in vtmp1
 6697     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6698     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6699     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6700 
 6701     // Merge and down convert the two word results to byte in dst
 6702     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6703   %}
 6704   ins_pipe( pipe_slow );
 6705 %}
 6706 
 6707 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6708   predicate(Matcher::vector_length(n) == 32 &&
 6709             n->as_ShiftV()->is_var_shift() &&
 6710             !VM_Version::supports_avx512bw());
 6711   match(Set dst ( LShiftVB src shift));
 6712   match(Set dst ( RShiftVB src shift));
 6713   match(Set dst (URShiftVB src shift));
 6714   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6716   ins_encode %{
 6717     assert(UseAVX >= 2, "required");
 6718 
 6719     int opcode = this->ideal_Opcode();
 6720     int vlen_enc = Assembler::AVX_128bit;
 6721     // Process lower 128 bits and get result in dst
 6722     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6723     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6724     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6725     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6726     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6727 
 6728     // Process higher 128 bits and get result in vtmp3
 6729     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6730     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6731     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6732     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6733     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6734     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6735     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6736 
 6737     // Merge the two results in dst
 6738     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6739   %}
 6740   ins_pipe( pipe_slow );
 6741 %}
 6742 
 6743 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6744   predicate(Matcher::vector_length(n) <= 32 &&
 6745             n->as_ShiftV()->is_var_shift() &&
 6746             VM_Version::supports_avx512bw());
 6747   match(Set dst ( LShiftVB src shift));
 6748   match(Set dst ( RShiftVB src shift));
 6749   match(Set dst (URShiftVB src shift));
 6750   effect(TEMP dst, TEMP vtmp);
 6751   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6752   ins_encode %{
 6753     assert(UseAVX > 2, "required");
 6754 
 6755     int opcode = this->ideal_Opcode();
 6756     int vlen_enc = vector_length_encoding(this);
 6757     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6758   %}
 6759   ins_pipe( pipe_slow );
 6760 %}
 6761 
 6762 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6763   predicate(Matcher::vector_length(n) == 64 &&
 6764             n->as_ShiftV()->is_var_shift() &&
 6765             VM_Version::supports_avx512bw());
 6766   match(Set dst ( LShiftVB src shift));
 6767   match(Set dst ( RShiftVB src shift));
 6768   match(Set dst (URShiftVB src shift));
 6769   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6770   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6771   ins_encode %{
 6772     assert(UseAVX > 2, "required");
 6773 
 6774     int opcode = this->ideal_Opcode();
 6775     int vlen_enc = Assembler::AVX_256bit;
 6776     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6777     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6778     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6779     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6780     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6781   %}
 6782   ins_pipe( pipe_slow );
 6783 %}
 6784 
 6785 // Short variable shift
 6786 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6787   predicate(Matcher::vector_length(n) <= 8 &&
 6788             n->as_ShiftV()->is_var_shift() &&
 6789             !VM_Version::supports_avx512bw());
 6790   match(Set dst ( LShiftVS src shift));
 6791   match(Set dst ( RShiftVS src shift));
 6792   match(Set dst (URShiftVS src shift));
 6793   effect(TEMP dst, TEMP vtmp);
 6794   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 6795   ins_encode %{
 6796     assert(UseAVX >= 2, "required");
 6797 
 6798     int opcode = this->ideal_Opcode();
 6799     bool sign = (opcode != Op_URShiftVS);
 6800     int vlen_enc = Assembler::AVX_256bit;
 6801     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 6802     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 6803     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6804     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6805     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6806     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6807   %}
 6808   ins_pipe( pipe_slow );
 6809 %}
 6810 
 6811 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6812   predicate(Matcher::vector_length(n) == 16 &&
 6813             n->as_ShiftV()->is_var_shift() &&
 6814             !VM_Version::supports_avx512bw());
 6815   match(Set dst ( LShiftVS src shift));
 6816   match(Set dst ( RShiftVS src shift));
 6817   match(Set dst (URShiftVS src shift));
 6818   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6819   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 6820   ins_encode %{
 6821     assert(UseAVX >= 2, "required");
 6822 
 6823     int opcode = this->ideal_Opcode();
 6824     bool sign = (opcode != Op_URShiftVS);
 6825     int vlen_enc = Assembler::AVX_256bit;
 6826     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6827     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6828     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6829     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6830     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6831 
 6832     // Shift upper half, with result in dst using vtmp1 as TEMP
 6833     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6834     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6835     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6836     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6837     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6838     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6839 
 6840     // Merge lower and upper half result into dst
 6841     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6842     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6843   %}
 6844   ins_pipe( pipe_slow );
 6845 %}
 6846 
 6847 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6848   predicate(n->as_ShiftV()->is_var_shift() &&
 6849             VM_Version::supports_avx512bw());
 6850   match(Set dst ( LShiftVS src shift));
 6851   match(Set dst ( RShiftVS src shift));
 6852   match(Set dst (URShiftVS src shift));
 6853   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6854   ins_encode %{
 6855     assert(UseAVX > 2, "required");
 6856 
 6857     int opcode = this->ideal_Opcode();
 6858     int vlen_enc = vector_length_encoding(this);
 6859     if (!VM_Version::supports_avx512vl()) {
 6860       vlen_enc = Assembler::AVX_512bit;
 6861     }
 6862     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6863   %}
 6864   ins_pipe( pipe_slow );
 6865 %}
 6866 
// Integer variable shift
 6868 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6869   predicate(n->as_ShiftV()->is_var_shift());
 6870   match(Set dst ( LShiftVI src shift));
 6871   match(Set dst ( RShiftVI src shift));
 6872   match(Set dst (URShiftVI src shift));
 6873   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6874   ins_encode %{
 6875     assert(UseAVX >= 2, "required");
 6876 
 6877     int opcode = this->ideal_Opcode();
 6878     int vlen_enc = vector_length_encoding(this);
 6879     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6880   %}
 6881   ins_pipe( pipe_slow );
 6882 %}
 6883 
// Long variable shift
 6885 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6886   predicate(n->as_ShiftV()->is_var_shift());
 6887   match(Set dst ( LShiftVL src shift));
 6888   match(Set dst (URShiftVL src shift));
 6889   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6890   ins_encode %{
 6891     assert(UseAVX >= 2, "required");
 6892 
 6893     int opcode = this->ideal_Opcode();
 6894     int vlen_enc = vector_length_encoding(this);
 6895     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6896   %}
 6897   ins_pipe( pipe_slow );
 6898 %}
 6899 
// Long variable arithmetic right shift
 6901 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6902   predicate(Matcher::vector_length(n) <= 4 &&
 6903             n->as_ShiftV()->is_var_shift() &&
 6904             UseAVX == 2);
 6905   match(Set dst (RShiftVL src shift));
 6906   effect(TEMP dst, TEMP vtmp);
 6907   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6908   ins_encode %{
 6909     int opcode = this->ideal_Opcode();
 6910     int vlen_enc = vector_length_encoding(this);
 6911     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6912                  $vtmp$$XMMRegister);
 6913   %}
 6914   ins_pipe( pipe_slow );
 6915 %}
 6916 
 6917 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6918   predicate(n->as_ShiftV()->is_var_shift() &&
 6919             UseAVX > 2);
 6920   match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6922   ins_encode %{
 6923     int opcode = this->ideal_Opcode();
 6924     int vlen_enc = vector_length_encoding(this);
 6925     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6926   %}
 6927   ins_pipe( pipe_slow );
 6928 %}
 6929 
 6930 // --------------------------------- AND --------------------------------------
 6931 
 6932 instruct vand(vec dst, vec src) %{
 6933   predicate(UseAVX == 0);
 6934   match(Set dst (AndV dst src));
 6935   format %{ "pand    $dst,$src\t! and vectors" %}
 6936   ins_encode %{
 6937     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6938   %}
 6939   ins_pipe( pipe_slow );
 6940 %}
 6941 
 6942 instruct vand_reg(vec dst, vec src1, vec src2) %{
 6943   predicate(UseAVX > 0);
 6944   match(Set dst (AndV src1 src2));
 6945   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 6946   ins_encode %{
 6947     int vlen_enc = vector_length_encoding(this);
 6948     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6949   %}
 6950   ins_pipe( pipe_slow );
 6951 %}
 6952 
 6953 instruct vand_mem(vec dst, vec src, memory mem) %{
 6954   predicate((UseAVX > 0) &&
 6955             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6956   match(Set dst (AndV src (LoadVector mem)));
 6957   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 6958   ins_encode %{
 6959     int vlen_enc = vector_length_encoding(this);
 6960     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6961   %}
 6962   ins_pipe( pipe_slow );
 6963 %}
 6964 
 6965 // --------------------------------- OR ---------------------------------------
 6966 
 6967 instruct vor(vec dst, vec src) %{
 6968   predicate(UseAVX == 0);
 6969   match(Set dst (OrV dst src));
 6970   format %{ "por     $dst,$src\t! or vectors" %}
 6971   ins_encode %{
 6972     __ por($dst$$XMMRegister, $src$$XMMRegister);
 6973   %}
 6974   ins_pipe( pipe_slow );
 6975 %}
 6976 
 6977 instruct vor_reg(vec dst, vec src1, vec src2) %{
 6978   predicate(UseAVX > 0);
 6979   match(Set dst (OrV src1 src2));
 6980   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 6981   ins_encode %{
 6982     int vlen_enc = vector_length_encoding(this);
 6983     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6984   %}
 6985   ins_pipe( pipe_slow );
 6986 %}
 6987 
 6988 instruct vor_mem(vec dst, vec src, memory mem) %{
 6989   predicate((UseAVX > 0) &&
 6990             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6991   match(Set dst (OrV src (LoadVector mem)));
 6992   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 6993   ins_encode %{
 6994     int vlen_enc = vector_length_encoding(this);
 6995     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6996   %}
 6997   ins_pipe( pipe_slow );
 6998 %}
 6999 
 7000 // --------------------------------- XOR --------------------------------------
 7001 
 7002 instruct vxor(vec dst, vec src) %{
 7003   predicate(UseAVX == 0);
 7004   match(Set dst (XorV dst src));
 7005   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7006   ins_encode %{
 7007     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7008   %}
 7009   ins_pipe( pipe_slow );
 7010 %}
 7011 
 7012 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7013   predicate(UseAVX > 0);
 7014   match(Set dst (XorV src1 src2));
 7015   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7016   ins_encode %{
 7017     int vlen_enc = vector_length_encoding(this);
 7018     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7019   %}
 7020   ins_pipe( pipe_slow );
 7021 %}
 7022 
 7023 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7024   predicate((UseAVX > 0) &&
 7025             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7026   match(Set dst (XorV src (LoadVector mem)));
 7027   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7028   ins_encode %{
 7029     int vlen_enc = vector_length_encoding(this);
 7030     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7031   %}
 7032   ins_pipe( pipe_slow );
 7033 %}
 7034 
 7035 // --------------------------------- VectorCast --------------------------------------
 7036 
 7037 instruct vcastBtoX(vec dst, vec src) %{
 7038   match(Set dst (VectorCastB2X src));
 7039   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7040   ins_encode %{
 7041     assert(UseAVX > 0, "required");
 7042 
 7043     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7044     int vlen_enc = vector_length_encoding(this);
 7045     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7046   %}
 7047   ins_pipe( pipe_slow );
 7048 %}
 7049 
 7050 instruct castStoX(vec dst, vec src) %{
 7051   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7052             Matcher::vector_length(n->in(1)) <= 8 && // src
 7053             Matcher::vector_element_basic_type(n) == T_BYTE);
 7054   match(Set dst (VectorCastS2X src));
 7055   format %{ "vector_cast_s2x $dst,$src" %}
 7056   ins_encode %{
 7057     assert(UseAVX > 0, "required");
 7058 
 7059     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7060     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7061   %}
 7062   ins_pipe( pipe_slow );
 7063 %}
 7064 
 7065 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7066   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7067             Matcher::vector_length(n->in(1)) == 16 && // src
 7068             Matcher::vector_element_basic_type(n) == T_BYTE);
 7069   effect(TEMP dst, TEMP vtmp);
 7070   match(Set dst (VectorCastS2X src));
 7071   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7072   ins_encode %{
 7073     assert(UseAVX > 0, "required");
 7074 
 7075     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7076     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7077     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7078     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7079   %}
 7080   ins_pipe( pipe_slow );
 7081 %}
 7082 
 7083 instruct vcastStoX_evex(vec dst, vec src) %{
 7084   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7085             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7086   match(Set dst (VectorCastS2X src));
 7087   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7088   ins_encode %{
 7089     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7090     int src_vlen_enc = vector_length_encoding(this, $src);
 7091     int vlen_enc = vector_length_encoding(this);
 7092     switch (to_elem_bt) {
 7093       case T_BYTE:
 7094         if (!VM_Version::supports_avx512vl()) {
 7095           vlen_enc = Assembler::AVX_512bit;
 7096         }
 7097         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7098         break;
 7099       case T_INT:
 7100         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7101         break;
 7102       case T_FLOAT:
 7103         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7104         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7105         break;
 7106       case T_LONG:
 7107         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7108         break;
 7109       case T_DOUBLE: {
 7110         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7111         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7112         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7113         break;
 7114       }
 7115       default:
 7116         ShouldNotReachHere();
 7117     }
 7118   %}
 7119   ins_pipe( pipe_slow );
 7120 %}
 7121 
 7122 instruct castItoX(vec dst, vec src) %{
 7123   predicate(UseAVX <= 2 &&
 7124             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7125             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7126   match(Set dst (VectorCastI2X src));
 7127   format %{ "vector_cast_i2x $dst,$src" %}
 7128   ins_encode %{
 7129     assert(UseAVX > 0, "required");
 7130 
 7131     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7132     int vlen_enc = vector_length_encoding(this, $src);
 7133 
 7134     if (to_elem_bt == T_BYTE) {
 7135       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7136       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7137       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7138     } else {
 7139       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7140       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7141       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7142     }
 7143   %}
 7144   ins_pipe( pipe_slow );
 7145 %}
 7146 
 7147 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7148   predicate(UseAVX <= 2 &&
 7149             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7150             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7151   match(Set dst (VectorCastI2X src));
 7152   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7153   effect(TEMP dst, TEMP vtmp);
 7154   ins_encode %{
 7155     assert(UseAVX > 0, "required");
 7156 
 7157     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7158     int vlen_enc = vector_length_encoding(this, $src);
 7159 
 7160     if (to_elem_bt == T_BYTE) {
 7161       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7162       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7163       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7164       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7165     } else {
 7166       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7167       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7168       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7169       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7170     }
 7171   %}
 7172   ins_pipe( pipe_slow );
 7173 %}
 7174 
 7175 instruct vcastItoX_evex(vec dst, vec src) %{
 7176   predicate(UseAVX > 2 ||
 7177             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7178   match(Set dst (VectorCastI2X src));
 7179   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7180   ins_encode %{
 7181     assert(UseAVX > 0, "required");
 7182 
 7183     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7184     int src_vlen_enc = vector_length_encoding(this, $src);
 7185     int dst_vlen_enc = vector_length_encoding(this);
 7186     switch (dst_elem_bt) {
 7187       case T_BYTE:
 7188         if (!VM_Version::supports_avx512vl()) {
 7189           src_vlen_enc = Assembler::AVX_512bit;
 7190         }
 7191         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7192         break;
 7193       case T_SHORT:
 7194         if (!VM_Version::supports_avx512vl()) {
 7195           src_vlen_enc = Assembler::AVX_512bit;
 7196         }
 7197         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7198         break;
 7199       case T_FLOAT:
 7200         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7201         break;
 7202       case T_LONG:
 7203         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7204         break;
 7205       case T_DOUBLE:
 7206         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7207         break;
 7208       default:
 7209         ShouldNotReachHere();
 7210     }
 7211   %}
 7212   ins_pipe( pipe_slow );
 7213 %}
 7214 
 7215 instruct vcastLtoBS(vec dst, vec src) %{
 7216   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7217             UseAVX <= 2);
 7218   match(Set dst (VectorCastL2X src));
 7219   format %{ "vector_cast_l2x  $dst,$src" %}
 7220   ins_encode %{
 7221     assert(UseAVX > 0, "required");
 7222 
 7223     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7224     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7225     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7226                                                       : ExternalAddress(vector_int_to_short_mask());
 7227     if (vlen <= 16) {
 7228       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7229       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7230       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7231     } else {
 7232       assert(vlen <= 32, "required");
 7233       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7234       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7235       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7236       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7237     }
 7238     if (to_elem_bt == T_BYTE) {
 7239       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7240     }
 7241   %}
 7242   ins_pipe( pipe_slow );
 7243 %}
 7244 
 7245 instruct vcastLtoX_evex(vec dst, vec src) %{
 7246   predicate(UseAVX > 2 ||
 7247             (Matcher::vector_element_basic_type(n) == T_INT ||
 7248              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7249              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7250   match(Set dst (VectorCastL2X src));
 7251   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7252   ins_encode %{
 7253     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7254     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7255     int vlen_enc = vector_length_encoding(this, $src);
 7256     switch (to_elem_bt) {
 7257       case T_BYTE:
 7258         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7259           vlen_enc = Assembler::AVX_512bit;
 7260         }
 7261         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7262         break;
 7263       case T_SHORT:
 7264         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7265           vlen_enc = Assembler::AVX_512bit;
 7266         }
 7267         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7268         break;
 7269       case T_INT:
 7270         if (vlen == 8) {
 7271           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7272             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7273           }
 7274         } else if (vlen == 16) {
 7275           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7276         } else if (vlen == 32) {
 7277           if (UseAVX > 2) {
 7278             if (!VM_Version::supports_avx512vl()) {
 7279               vlen_enc = Assembler::AVX_512bit;
 7280             }
 7281             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7282           } else {
 7283             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7284             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7285           }
 7286         } else { // vlen == 64
 7287           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7288         }
 7289         break;
 7290       case T_FLOAT:
 7291         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7292         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7293         break;
 7294       case T_DOUBLE:
 7295         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7296         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7297         break;
 7298 
 7299       default: assert(false, "%s", type2name(to_elem_bt));
 7300     }
 7301   %}
 7302   ins_pipe( pipe_slow );
 7303 %}
 7304 
 7305 instruct vcastFtoD_reg(vec dst, vec src) %{
 7306   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7307   match(Set dst (VectorCastF2X src));
 7308   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7309   ins_encode %{
 7310     int vlen_enc = vector_length_encoding(this);
 7311     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7312   %}
 7313   ins_pipe( pipe_slow );
 7314 %}
 7315 
 7316 
 7317 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7318   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7319             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7320   match(Set dst (VectorCastF2X src));
 7321   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7322   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7323   ins_encode %{
 7324     int vlen_enc = vector_length_encoding(this, $src);
 7325     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register to load addresses wider
    // than 32 bits in register-indirect addressing mode, since stub constants live in the
    // code cache and ReservedCodeCacheSize is currently capped at 2G. Targets are free to
    // raise this limit, but a code cache larger than 2G looks unreasonable in practice; on
    // the flip side, with the given cap we save a temporary register allocation, which in
    // the limiting case can prevent spilling in blocks with high register pressure.
 7333     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7334                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7335                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7336   %}
 7337   ins_pipe( pipe_slow );
 7338 %}
 7339 
 7340 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7341   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7342             is_integral_type(Matcher::vector_element_basic_type(n)));
 7343   match(Set dst (VectorCastF2X src));
 7344   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7345   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7346   ins_encode %{
 7347     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7348     if (to_elem_bt == T_LONG) {
 7349       int vlen_enc = vector_length_encoding(this);
 7350       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7351                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7352                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7353     } else {
 7354       int vlen_enc = vector_length_encoding(this, $src);
 7355       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7356                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7357                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7358     }
 7359   %}
 7360   ins_pipe( pipe_slow );
 7361 %}
 7362 
 7363 instruct vcastDtoF_reg(vec dst, vec src) %{
 7364   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7365   match(Set dst (VectorCastD2X src));
 7366   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7367   ins_encode %{
 7368     int vlen_enc = vector_length_encoding(this, $src);
 7369     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7370   %}
 7371   ins_pipe( pipe_slow );
 7372 %}
 7373 
 7374 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7375   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7376             is_integral_type(Matcher::vector_element_basic_type(n)));
 7377   match(Set dst (VectorCastD2X src));
 7378   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7379   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7380   ins_encode %{
 7381     int vlen_enc = vector_length_encoding(this, $src);
 7382     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7383     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7384                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7385                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7386   %}
 7387   ins_pipe( pipe_slow );
 7388 %}
 7389 
 7390 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7391   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7392             is_integral_type(Matcher::vector_element_basic_type(n)));
 7393   match(Set dst (VectorCastD2X src));
 7394   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7395   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7396   ins_encode %{
 7397     int vlen_enc = vector_length_encoding(this, $src);
 7398     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7399     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7400                               ExternalAddress(vector_float_signflip());
 7401     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7402                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7403   %}
 7404   ins_pipe( pipe_slow );
 7405 %}
 7406 
 7407 instruct vucast(vec dst, vec src) %{
 7408   match(Set dst (VectorUCastB2X src));
 7409   match(Set dst (VectorUCastS2X src));
 7410   match(Set dst (VectorUCastI2X src));
 7411   format %{ "vector_ucast $dst,$src\t!" %}
 7412   ins_encode %{
 7413     assert(UseAVX > 0, "required");
 7414 
 7415     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7416     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7417     int vlen_enc = vector_length_encoding(this);
 7418     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7419   %}
 7420   ins_pipe( pipe_slow );
 7421 %}
 7422 
 7423 #ifdef _LP64
 7424 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7425   predicate(!VM_Version::supports_avx512vl() &&
 7426             Matcher::vector_length_in_bytes(n) < 64 &&
 7427             Matcher::vector_element_basic_type(n) == T_INT);
 7428   match(Set dst (RoundVF src));
 7429   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7430   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7431   ins_encode %{
 7432     int vlen_enc = vector_length_encoding(this);
 7433     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7434     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7435                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7436                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7437   %}
 7438   ins_pipe( pipe_slow );
 7439 %}
 7440 
 7441 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7442   predicate((VM_Version::supports_avx512vl() ||
 7443              Matcher::vector_length_in_bytes(n) == 64) &&
 7444              Matcher::vector_element_basic_type(n) == T_INT);
 7445   match(Set dst (RoundVF src));
 7446   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7447   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7448   ins_encode %{
 7449     int vlen_enc = vector_length_encoding(this);
 7450     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7451     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7452                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7453                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7454   %}
 7455   ins_pipe( pipe_slow );
 7456 %}
 7457 
 7458 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7459   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7460   match(Set dst (RoundVD src));
 7461   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7462   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7463   ins_encode %{
 7464     int vlen_enc = vector_length_encoding(this);
 7465     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7466     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7467                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7468                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7469   %}
 7470   ins_pipe( pipe_slow );
 7471 %}
 7472 
 7473 #endif // _LP64
 7474 
 7475 // --------------------------------- VectorMaskCmp --------------------------------------
 7476 
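// The vector result of a VectorMaskCmp has every bit of a lane set when the comparison
// is true for that lane and cleared otherwise; on AVX-512, when the node's type is a
// vectmask, the result is produced directly in a k-register instead. The rules below
// are selected on that distinction.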
 7477 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7478   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7479             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7480             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7481             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7482   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7483   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7484   ins_encode %{
 7485     int vlen_enc = vector_length_encoding(this, $src1);
 7486     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7487     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7488       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7489     } else {
 7490       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7491     }
 7492   %}
 7493   ins_pipe( pipe_slow );
 7494 %}
 7495 
 7496 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7497   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7498             n->bottom_type()->isa_vectmask() == nullptr &&
 7499             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7500   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7501   effect(TEMP ktmp);
 7502   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7503   ins_encode %{
 7504     int vlen_enc = Assembler::AVX_512bit;
 7505     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7506     KRegister mask = k0; // The comparison itself is not being masked.
 7507     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7508       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7509       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7510     } else {
 7511       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7512       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7513     }
 7514   %}
 7515   ins_pipe( pipe_slow );
 7516 %}
 7517 
 7518 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7519   predicate(n->bottom_type()->isa_vectmask() &&
 7520             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7521   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7522   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7523   ins_encode %{
 7524     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7525     int vlen_enc = vector_length_encoding(this, $src1);
 7526     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7527     KRegister mask = k0; // The comparison itself is not being masked.
 7528     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7529       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7530     } else {
 7531       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7532     }
 7533   %}
 7534   ins_pipe( pipe_slow );
 7535 %}
 7536 
 7537 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7538   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7539             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7540             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7541             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7542             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7543             (n->in(2)->get_int() == BoolTest::eq ||
 7544              n->in(2)->get_int() == BoolTest::lt ||
 7545              n->in(2)->get_int() == BoolTest::gt)); // cond
 7546   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7547   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7548   ins_encode %{
 7549     int vlen_enc = vector_length_encoding(this, $src1);
 7550     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7551     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7552     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7553   %}
 7554   ins_pipe( pipe_slow );
 7555 %}
 7556 
 7557 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7558   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7559             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7560             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7561             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7562             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7563             (n->in(2)->get_int() == BoolTest::ne ||
 7564              n->in(2)->get_int() == BoolTest::le ||
 7565              n->in(2)->get_int() == BoolTest::ge)); // cond
 7566   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7567   effect(TEMP dst, TEMP xtmp);
 7568   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7569   ins_encode %{
 7570     int vlen_enc = vector_length_encoding(this, $src1);
 7571     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7572     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7573     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7574   %}
 7575   ins_pipe( pipe_slow );
 7576 %}
 7577 
 7578 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7579   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7580             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7581             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7582             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7583             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7584   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7585   effect(TEMP dst, TEMP xtmp);
 7586   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7587   ins_encode %{
 7588     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7589     int vlen_enc = vector_length_encoding(this, $src1);
 7590     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7591     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7592 
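          // Flip the sign bit of both operands with the broadcast constant below;
          // this turns the unsigned comparison into an equivalent signed one, for
          // which a packed compare instruction exists.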
 7593     if (vlen_enc == Assembler::AVX_128bit) {
 7594       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7595     } else {
 7596       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7597     }
 7598     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7599     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7600     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7601   %}
 7602   ins_pipe( pipe_slow );
 7603 %}
 7604 
 7605 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7606   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7607              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7608              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7609   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7610   effect(TEMP ktmp);
 7611   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7612   ins_encode %{
 7613     assert(UseAVX > 2, "required");
 7614 
 7615     int vlen_enc = vector_length_encoding(this, $src1);
 7616     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7617     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7618     KRegister mask = k0; // The comparison itself is not being masked.
 7619     bool merge = false;
 7620     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7621 
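          // The AVX-512 compare writes a mask register; expand it back into a
          // boolean vector by loading the all-bits-set pattern under that mask
          // (merge == false zeroes the unselected lanes).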
 7622     switch (src1_elem_bt) {
 7623       case T_INT: {
 7624         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7625         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7626         break;
 7627       }
 7628       case T_LONG: {
 7629         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7630         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7631         break;
 7632       }
 7633       default: assert(false, "%s", type2name(src1_elem_bt));
 7634     }
 7635   %}
 7636   ins_pipe( pipe_slow );
 7637 %}
 7638 
 7639 
 7640 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7641   predicate(n->bottom_type()->isa_vectmask() &&
 7642             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7643   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7644   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7645   ins_encode %{
 7646     assert(UseAVX > 2, "required");
 7647     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7648 
 7649     int vlen_enc = vector_length_encoding(this, $src1);
 7650     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7651     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7652     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7653 
 7654     // The comparison itself is not being masked, so k0 is used.
 7655     switch (src1_elem_bt) {
 7656       case T_BYTE: {
 7657         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7658         break;
 7659       }
 7660       case T_SHORT: {
 7661         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7662         break;
 7663       }
 7664       case T_INT: {
 7665         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7666         break;
 7667       }
 7668       case T_LONG: {
 7669         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7670         break;
 7671       }
 7672       default: assert(false, "%s", type2name(src1_elem_bt));
 7673     }
 7674   %}
 7675   ins_pipe( pipe_slow );
 7676 %}
 7677 
 7678 // Extract
 7679 
 7680 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7681   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7682   match(Set dst (ExtractI src idx));
 7683   match(Set dst (ExtractS src idx));
 7684 #ifdef _LP64
 7685   match(Set dst (ExtractB src idx));
 7686 #endif
 7687   format %{ "extractI $dst,$src,$idx\t!" %}
 7688   ins_encode %{
 7689     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7690 
 7691     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7692     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7693   %}
 7694   ins_pipe( pipe_slow );
 7695 %}
 7696 
 7697 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7698   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7699             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7700   match(Set dst (ExtractI src idx));
 7701   match(Set dst (ExtractS src idx));
 7702 #ifdef _LP64
 7703   match(Set dst (ExtractB src idx));
 7704 #endif
 7705   effect(TEMP vtmp);
 7706   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7707   ins_encode %{
 7708     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7709 
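          // Isolate the 128-bit lane holding the element, then extract the
          // element from that lane.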
 7710     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7711     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7712     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7713   %}
 7714   ins_pipe( pipe_slow );
 7715 %}
 7716 
 7717 #ifdef _LP64
 7718 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7719   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7720   match(Set dst (ExtractL src idx));
 7721   format %{ "extractL $dst,$src,$idx\t!" %}
 7722   ins_encode %{
 7723     assert(UseSSE >= 4, "required");
 7724     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7725 
 7726     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7727   %}
 7728   ins_pipe( pipe_slow );
 7729 %}
 7730 
 7731 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7732   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7733             Matcher::vector_length(n->in(1)) == 8);  // src
 7734   match(Set dst (ExtractL src idx));
 7735   effect(TEMP vtmp);
 7736   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7737   ins_encode %{
 7738     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7739 
 7740     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7741     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7742   %}
 7743   ins_pipe( pipe_slow );
 7744 %}
 7745 #endif
 7746 
 7747 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7748   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7749   match(Set dst (ExtractF src idx));
 7750   effect(TEMP dst, TEMP vtmp);
 7751   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7752   ins_encode %{
 7753     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7754 
 7755     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7756   %}
 7757   ins_pipe( pipe_slow );
 7758 %}
 7759 
 7760 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7761   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7762             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7763   match(Set dst (ExtractF src idx));
 7764   effect(TEMP vtmp);
 7765   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7766   ins_encode %{
 7767     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7768 
 7769     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7770     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7771   %}
 7772   ins_pipe( pipe_slow );
 7773 %}
 7774 
 7775 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7776   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7777   match(Set dst (ExtractD src idx));
 7778   format %{ "extractD $dst,$src,$idx\t!" %}
 7779   ins_encode %{
 7780     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7781 
 7782     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7783   %}
 7784   ins_pipe( pipe_slow );
 7785 %}
 7786 
 7787 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7788   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7789             Matcher::vector_length(n->in(1)) == 8);  // src
 7790   match(Set dst (ExtractD src idx));
 7791   effect(TEMP vtmp);
 7792   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7793   ins_encode %{
 7794     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7795 
 7796     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7797     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7798   %}
 7799   ins_pipe( pipe_slow );
 7800 %}
 7801 
 7802 // --------------------------------- Vector Blend --------------------------------------
 7803 
 7804 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7805   predicate(UseAVX == 0);
 7806   match(Set dst (VectorBlend (Binary dst src) mask));
 7807   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7808   effect(TEMP tmp);
 7809   ins_encode %{
 7810     assert(UseSSE >= 4, "required");
 7811 
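          // pblendvb implicitly uses xmm0 as the blend mask, so copy $mask into
          // $tmp (bound to xmm0) unless it is already there.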
 7812     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7813       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7814     }
 7815     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7816   %}
 7817   ins_pipe( pipe_slow );
 7818 %}
 7819 
 7820 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7821   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 7822             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7823             Matcher::vector_length_in_bytes(n) <= 32 &&
 7824             is_integral_type(Matcher::vector_element_basic_type(n)));
 7825   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7826   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7827   ins_encode %{
 7828     int vlen_enc = vector_length_encoding(this);
 7829     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7830   %}
 7831   ins_pipe( pipe_slow );
 7832 %}
 7833 
 7834 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7835   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 7836             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7837             Matcher::vector_length_in_bytes(n) <= 32 &&
 7838             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7839   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7840   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7841   ins_encode %{
 7842     int vlen_enc = vector_length_encoding(this);
 7843     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7844   %}
 7845   ins_pipe( pipe_slow );
 7846 %}
 7847 
 7848 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 7849   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 7850             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7851             Matcher::vector_length_in_bytes(n) <= 32);
 7852   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7853   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 7854   effect(TEMP vtmp, TEMP dst);
 7855   ins_encode %{
 7856     int vlen_enc = vector_length_encoding(this);
 7857     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7858     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7859     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 7860   %}
 7861   ins_pipe( pipe_slow );
 7862 %}
 7863 
 7864 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7865   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7866             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 7867   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7868   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 7869   effect(TEMP ktmp);
 7870   ins_encode %{
 7871     int vlen_enc = Assembler::AVX_512bit;
 7872     BasicType elem_bt = Matcher::vector_element_basic_type(this);
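          // Turn the byte-wise vector mask into a mask register by comparing it
          // against the all-bits-set pattern, then use it for a merging blend.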
 7873     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7874     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7875   %}
 7876   ins_pipe( pipe_slow );
 7877 %}
 7878 
 7879 
 7880 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7881   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7882             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7883              VM_Version::supports_avx512bw()));
 7884   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7885   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 7886   ins_encode %{
 7887     int vlen_enc = vector_length_encoding(this);
 7888     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7889     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7890   %}
 7891   ins_pipe( pipe_slow );
 7892 %}
 7893 
 7894 // --------------------------------- ABS --------------------------------------
 7895 // a = |a|
 7896 instruct vabsB_reg(vec dst, vec src) %{
 7897   match(Set dst (AbsVB  src));
 7898   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7899   ins_encode %{
 7900     uint vlen = Matcher::vector_length(this);
 7901     if (vlen <= 16) {
 7902       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7903     } else {
 7904       int vlen_enc = vector_length_encoding(this);
 7905       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7906     }
 7907   %}
 7908   ins_pipe( pipe_slow );
 7909 %}
 7910 
 7911 instruct vabsS_reg(vec dst, vec src) %{
 7912   match(Set dst (AbsVS  src));
 7913   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7914   ins_encode %{
 7915     uint vlen = Matcher::vector_length(this);
 7916     if (vlen <= 8) {
 7917       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7918     } else {
 7919       int vlen_enc = vector_length_encoding(this);
 7920       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7921     }
 7922   %}
 7923   ins_pipe( pipe_slow );
 7924 %}
 7925 
 7926 instruct vabsI_reg(vec dst, vec src) %{
 7927   match(Set dst (AbsVI  src));
 7928   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7929   ins_encode %{
 7930     uint vlen = Matcher::vector_length(this);
 7931     if (vlen <= 4) {
 7932       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7933     } else {
 7934       int vlen_enc = vector_length_encoding(this);
 7935       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7936     }
 7937   %}
 7938   ins_pipe( pipe_slow );
 7939 %}
 7940 
 7941 instruct vabsL_reg(vec dst, vec src) %{
 7942   match(Set dst (AbsVL  src));
 7943   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7944   ins_encode %{
 7945     assert(UseAVX > 2, "required");
 7946     int vlen_enc = vector_length_encoding(this);
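          // evpabsq on 128/256-bit vectors needs AVX512VL; without it, fall back
          // to the 512-bit encoding.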
 7947     if (!VM_Version::supports_avx512vl()) {
 7948       vlen_enc = Assembler::AVX_512bit;
 7949     }
 7950     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7951   %}
 7952   ins_pipe( pipe_slow );
 7953 %}
 7954 
 7955 // --------------------------------- ABSNEG --------------------------------------
 7956 
 7957 instruct vabsnegF(vec dst, vec src) %{
 7958   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 7959   match(Set dst (AbsVF src));
 7960   match(Set dst (NegVF src));
 7961   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 7962   ins_cost(150);
 7963   ins_encode %{
 7964     int opcode = this->ideal_Opcode();
 7965     int vlen = Matcher::vector_length(this);
 7966     if (vlen == 2) {
 7967       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 7968     } else {
 7969       assert(vlen == 8 || vlen == 16, "required");
 7970       int vlen_enc = vector_length_encoding(this);
 7971       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7972     }
 7973   %}
 7974   ins_pipe( pipe_slow );
 7975 %}
 7976 
 7977 instruct vabsneg4F(vec dst) %{
 7978   predicate(Matcher::vector_length(n) == 4);
 7979   match(Set dst (AbsVF dst));
 7980   match(Set dst (NegVF dst));
 7981   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 7982   ins_cost(150);
 7983   ins_encode %{
 7984     int opcode = this->ideal_Opcode();
 7985     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 7986   %}
 7987   ins_pipe( pipe_slow );
 7988 %}
 7989 
 7990 instruct vabsnegD(vec dst, vec src) %{
 7991   match(Set dst (AbsVD  src));
 7992   match(Set dst (NegVD  src));
 7993   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 7994   ins_encode %{
 7995     int opcode = this->ideal_Opcode();
 7996     uint vlen = Matcher::vector_length(this);
 7997     if (vlen == 2) {
 7998       assert(UseSSE >= 2, "required");
 7999       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8000     } else {
 8001       int vlen_enc = vector_length_encoding(this);
 8002       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8003     }
 8004   %}
 8005   ins_pipe( pipe_slow );
 8006 %}
 8007 
 8008 //------------------------------------- VectorTest --------------------------------------------
 8009 
 8010 #ifdef _LP64
 8011 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8012   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8013   match(Set cr (VectorTest src1 src2));
 8014   effect(TEMP vtmp);
 8015   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8016   ins_encode %{
 8017     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8018     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8019     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8020   %}
 8021   ins_pipe( pipe_slow );
 8022 %}
 8023 
 8024 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8025   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8026   match(Set cr (VectorTest src1 src2));
 8027   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8028   ins_encode %{
 8029     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8030     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8031     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8032   %}
 8033   ins_pipe( pipe_slow );
 8034 %}
 8035 
 8036 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8037   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8038              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8039             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8040   match(Set cr (VectorTest src1 src2));
 8041   effect(TEMP tmp);
 8042   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8043   ins_encode %{
 8044     uint masklen = Matcher::vector_length(this, $src1);
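          // Move the mask into a GPR, drop bits beyond the vector length, and
          // compare against the all-lanes-set value; the flags then encode the
          // all-true result.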
 8045     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8046     __ andl($tmp$$Register, (1 << masklen) - 1);
 8047     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8048   %}
 8049   ins_pipe( pipe_slow );
 8050 %}
 8051 
 8052 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8053   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8054              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8055             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8056   match(Set cr (VectorTest src1 src2));
 8057   effect(TEMP tmp);
 8058   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8059   ins_encode %{
 8060     uint masklen = Matcher::vector_length(this, $src1);
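          // Drop bits beyond the vector length; ZF then indicates whether any
          // lane is set.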
 8061     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8062     __ andl($tmp$$Register, (1 << masklen) - 1);
 8063   %}
 8064   ins_pipe( pipe_slow );
 8065 %}
 8066 
 8067 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8068   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8069             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8070   match(Set cr (VectorTest src1 src2));
 8071   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8072   ins_encode %{
 8073     uint masklen = Matcher::vector_length(this, $src1);
 8074     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8075   %}
 8076   ins_pipe( pipe_slow );
 8077 %}
 8078 #endif
 8079 
 8080 //------------------------------------- LoadMask --------------------------------------------
 8081 
 8082 instruct loadMask(legVec dst, legVec src) %{
 8083   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8084   match(Set dst (VectorLoadMask src));
 8085   effect(TEMP dst);
 8086   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8087   ins_encode %{
 8088     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8089     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8090     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8091   %}
 8092   ins_pipe( pipe_slow );
 8093 %}
 8094 
 8095 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8096   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8097   match(Set dst (VectorLoadMask src));
 8098   effect(TEMP xtmp);
 8099   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8100   ins_encode %{
 8101     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8102                         true, Assembler::AVX_512bit);
 8103   %}
 8104   ins_pipe( pipe_slow );
 8105 %}
 8106 
 8107 instruct loadMask_evex(kReg dst, vec src, vec xtmp) %{
 8108   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8109   match(Set dst (VectorLoadMask src));
 8110   effect(TEMP xtmp);
 8111   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8112   ins_encode %{
 8113     int vlen_enc = vector_length_encoding(in(1));
 8114     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8115                         false, vlen_enc);
 8116   %}
 8117   ins_pipe( pipe_slow );
 8118 %}
 8119 
 8120 //------------------------------------- StoreMask --------------------------------------------
 8121 
 8122 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8123   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8124   match(Set dst (VectorStoreMask src size));
 8125   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8126   ins_encode %{
 8127     int vlen = Matcher::vector_length(this);
 8128     if (vlen <= 16 && UseAVX <= 2) {
 8129       assert(UseSSE >= 3, "required");
 8130       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8131     } else {
 8132       assert(UseAVX > 0, "required");
 8133       int src_vlen_enc = vector_length_encoding(this, $src);
 8134       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8135     }
 8136   %}
 8137   ins_pipe( pipe_slow );
 8138 %}
 8139 
 8140 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8141   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8142   match(Set dst (VectorStoreMask src size));
 8143   effect(TEMP_DEF dst, TEMP xtmp);
 8144   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8145   ins_encode %{
 8146     int vlen_enc = Assembler::AVX_128bit;
 8147     int vlen = Matcher::vector_length(this);
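          // Narrow each 16-bit mask lane to a byte and normalize all-ones lanes to 1.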
 8148     if (vlen <= 8) {
 8149       assert(UseSSE >= 3, "required");
 8150       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8151       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8152       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8153     } else {
 8154       assert(UseAVX > 0, "required");
 8155       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8156       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8157       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8158     }
 8159   %}
 8160   ins_pipe( pipe_slow );
 8161 %}
 8162 
 8163 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8164   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8165   match(Set dst (VectorStoreMask src size));
 8166   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8167   effect(TEMP_DEF dst, TEMP xtmp);
 8168   ins_encode %{
 8169     int vlen_enc = Assembler::AVX_128bit;
 8170     int vlen = Matcher::vector_length(this);
 8171     if (vlen <= 4) {
 8172       assert(UseSSE >= 3, "required");
 8173       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8174       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8175       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8176       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8177     } else {
 8178       assert(UseAVX > 0, "required");
 8179       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8180       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8181       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8182       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8183       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8184     }
 8185   %}
 8186   ins_pipe( pipe_slow );
 8187 %}
 8188 
 8189 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8190   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8191   match(Set dst (VectorStoreMask src size));
 8192   effect(TEMP_DEF dst, TEMP xtmp);
 8193   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8194   ins_encode %{
 8195     assert(UseSSE >= 3, "required");
 8196     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8197     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8198     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8199     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8200     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8201   %}
 8202   ins_pipe( pipe_slow );
 8203 %}
 8204 
 8205 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8206   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8207   match(Set dst (VectorStoreMask src size));
 8208   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8209   effect(TEMP_DEF dst, TEMP vtmp);
 8210   ins_encode %{
 8211     int vlen_enc = Assembler::AVX_128bit;
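          // Compress the four 64-bit mask lanes into dwords in the low 128 bits,
          // then pack down to bytes and normalize to 0/1.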
 8212     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8213     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8214     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8215     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8216     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8217     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8218     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8219   %}
 8220   ins_pipe( pipe_slow );
 8221 %}
 8222 
 8223 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8224   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8225   match(Set dst (VectorStoreMask src size));
 8226   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8227   ins_encode %{
 8228     int src_vlen_enc = vector_length_encoding(this, $src);
 8229     int dst_vlen_enc = vector_length_encoding(this);
 8230     if (!VM_Version::supports_avx512vl()) {
 8231       src_vlen_enc = Assembler::AVX_512bit;
 8232     }
 8233     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8234     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8235   %}
 8236   ins_pipe( pipe_slow );
 8237 %}
 8238 
 8239 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8240   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8241   match(Set dst (VectorStoreMask src size));
 8242   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8243   ins_encode %{
 8244     int src_vlen_enc = vector_length_encoding(this, $src);
 8245     int dst_vlen_enc = vector_length_encoding(this);
 8246     if (!VM_Version::supports_avx512vl()) {
 8247       src_vlen_enc = Assembler::AVX_512bit;
 8248     }
 8249     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8250     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8251   %}
 8252   ins_pipe( pipe_slow );
 8253 %}
 8254 
 8255 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8256   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8257   match(Set dst (VectorStoreMask mask size));
 8258   effect(TEMP_DEF dst);
 8259   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8260   ins_encode %{
 8261     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8262     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8263                  false, Assembler::AVX_512bit, noreg);
 8264     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8265   %}
 8266   ins_pipe( pipe_slow );
 8267 %}
 8268 
 8269 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8270   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8271   match(Set dst (VectorStoreMask mask size));
 8272   effect(TEMP_DEF dst);
 8273   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8274   ins_encode %{
 8275     int dst_vlen_enc = vector_length_encoding(this);
 8276     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8277     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8278   %}
 8279   ins_pipe( pipe_slow );
 8280 %}
 8281 
 8282 instruct vmaskcast_evex(kReg dst) %{
 8283   match(Set dst (VectorMaskCast dst));
 8284   ins_cost(0);
 8285   format %{ "vector_mask_cast $dst" %}
 8286   ins_encode %{
 8287     // empty
 8288   %}
 8289   ins_pipe(empty);
 8290 %}
 8291 
 8292 instruct vmaskcast(vec dst) %{
 8293   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8294   match(Set dst (VectorMaskCast dst));
 8295   ins_cost(0);
 8296   format %{ "vector_mask_cast $dst" %}
 8297   ins_encode %{
 8298     // empty
 8299   %}
 8300   ins_pipe(empty);
 8301 %}
 8302 
 8303 instruct vmaskcast_avx(vec dst, vec src) %{
 8304   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8305   match(Set dst (VectorMaskCast src));
 8306   format %{ "vector_mask_cast $dst, $src" %}
 8307   ins_encode %{
 8308     int vlen = Matcher::vector_length(this);
 8309     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8310     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8311     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8312   %}
 8313   ins_pipe(pipe_slow);
 8314 %}
 8315 
 8316 //-------------------------------- Load Iota Indices ----------------------------------
 8317 
 8318 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8319   match(Set dst (VectorLoadConst src));
 8320   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8321   ins_encode %{
 8322      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8323      BasicType bt = Matcher::vector_element_basic_type(this);
 8324      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8325   %}
 8326   ins_pipe( pipe_slow );
 8327 %}
 8328 
 8329 #ifdef _LP64
 8330 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8331   match(Set dst (PopulateIndex src1 src2));
 8332   effect(TEMP dst, TEMP vtmp);
 8333   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8334   ins_encode %{
 8335      assert($src2$$constant == 1, "required");
 8336      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8337      int vlen_enc = vector_length_encoding(this);
 8338      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8339      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8340      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8341      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8342   %}
 8343   ins_pipe( pipe_slow );
 8344 %}
 8345 
 8346 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8347   match(Set dst (PopulateIndex src1 src2));
 8348   effect(TEMP dst, TEMP vtmp);
 8349   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8350   ins_encode %{
 8351      assert($src2$$constant == 1, "required");
 8352      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8353      int vlen_enc = vector_length_encoding(this);
 8354      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8355      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8356      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8357      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8358   %}
 8359   ins_pipe( pipe_slow );
 8360 %}
 8361 #endif
 8362 //-------------------------------- Rearrange ----------------------------------
 8363 
 8364 // LoadShuffle/Rearrange for Byte
 8365 
 8366 instruct loadShuffleB(vec dst) %{
 8367   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8368   match(Set dst (VectorLoadShuffle dst));
 8369   format %{ "vector_load_shuffle $dst, $dst" %}
 8370   ins_encode %{
 8371     // empty
 8372   %}
 8373   ins_pipe( pipe_slow );
 8374 %}
 8375 
 8376 instruct rearrangeB(vec dst, vec shuffle) %{
 8377   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8378             Matcher::vector_length(n) < 32);
 8379   match(Set dst (VectorRearrange dst shuffle));
 8380   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8381   ins_encode %{
 8382     assert(UseSSE >= 4, "required");
 8383     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8384   %}
 8385   ins_pipe( pipe_slow );
 8386 %}
 8387 
 8388 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8389   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8390             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8391   match(Set dst (VectorRearrange src shuffle));
 8392   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8393   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8394   ins_encode %{
 8395     assert(UseAVX >= 2, "required");
 8396     // Swap src into vtmp1
 8397     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8398     // Shuffle swapped src to get entries from other 128 bit lane
 8399     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8400     // Shuffle original src to get entries from self 128 bit lane
 8401     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8402     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8403     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8404     // Perform the blend
 8405     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8406   %}
 8407   ins_pipe( pipe_slow );
 8408 %}
 8409 
 8410 
 8411 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8412   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8413             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8414   match(Set dst (VectorRearrange src shuffle));
 8415   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8416   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8417   ins_encode %{
 8418     int vlen_enc = vector_length_encoding(this);
 8419     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8420                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8421                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8422   %}
 8423   ins_pipe( pipe_slow );
 8424 %}
 8425 
 8426 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8427   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8428             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8429   match(Set dst (VectorRearrange src shuffle));
 8430   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8431   ins_encode %{
 8432     int vlen_enc = vector_length_encoding(this);
 8433     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8434   %}
 8435   ins_pipe( pipe_slow );
 8436 %}
 8437 
 8438 // LoadShuffle/Rearrange for Short
 8439 
 8440 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8441   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8442             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8443   match(Set dst (VectorLoadShuffle src));
 8444   effect(TEMP dst, TEMP vtmp);
 8445   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8446   ins_encode %{
 8447     // Create a byte shuffle mask from short shuffle mask
 8448     // only byte shuffle instruction available on these platforms
 8449     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8450     if (UseAVX == 0) {
 8451       assert(vlen_in_bytes <= 16, "required");
 8452       // Multiply each shuffle by two to get byte index
 8453       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8454       __ psllw($vtmp$$XMMRegister, 1);
 8455 
 8456       // Duplicate to create 2 copies of byte index
 8457       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8458       __ psllw($dst$$XMMRegister, 8);
 8459       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8460 
 8461       // Add one to get alternate byte index
 8462       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8463       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8464     } else {
 8465       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8466       int vlen_enc = vector_length_encoding(this);
 8467       // Multiply each shuffle by two to get byte index
 8468       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8469       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8470 
 8471       // Duplicate to create 2 copies of byte index
 8472       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8473       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8474 
 8475       // Add one to get alternate byte index
 8476       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8477     }
 8478   %}
 8479   ins_pipe( pipe_slow );
 8480 %}
 8481 
 8482 instruct rearrangeS(vec dst, vec shuffle) %{
 8483   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8484             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8485   match(Set dst (VectorRearrange dst shuffle));
 8486   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8487   ins_encode %{
 8488     assert(UseSSE >= 4, "required");
 8489     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8490   %}
 8491   ins_pipe( pipe_slow );
 8492 %}
 8493 
 8494 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8495   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8496             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8497   match(Set dst (VectorRearrange src shuffle));
 8498   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8499   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8500   ins_encode %{
 8501     assert(UseAVX >= 2, "required");
 8502     // Swap src into vtmp1
 8503     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8504     // Shuffle swapped src to get entries from other 128 bit lane
 8505     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8506     // Shuffle original src to get entries from self 128 bit lane
 8507     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8508     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8509     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8510     // Perform the blend
 8511     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8512   %}
 8513   ins_pipe( pipe_slow );
 8514 %}
 8515 
 8516 instruct loadShuffleS_evex(vec dst, vec src) %{
 8517   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8518             VM_Version::supports_avx512bw());
 8519   match(Set dst (VectorLoadShuffle src));
 8520   format %{ "vector_load_shuffle $dst, $src" %}
 8521   ins_encode %{
 8522     int vlen_enc = vector_length_encoding(this);
 8523     if (!VM_Version::supports_avx512vl()) {
 8524       vlen_enc = Assembler::AVX_512bit;
 8525     }
 8526     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8527   %}
 8528   ins_pipe( pipe_slow );
 8529 %}
 8530 
 8531 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8532   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8533             VM_Version::supports_avx512bw());
 8534   match(Set dst (VectorRearrange src shuffle));
 8535   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8536   ins_encode %{
 8537     int vlen_enc = vector_length_encoding(this);
 8538     if (!VM_Version::supports_avx512vl()) {
 8539       vlen_enc = Assembler::AVX_512bit;
 8540     }
 8541     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8542   %}
 8543   ins_pipe( pipe_slow );
 8544 %}
 8545 
 8546 // LoadShuffle/Rearrange for Integer and Float
 8547 
 8548 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8549   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8550             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8551   match(Set dst (VectorLoadShuffle src));
 8552   effect(TEMP dst, TEMP vtmp);
 8553   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8554   ins_encode %{
 8555     assert(UseSSE >= 4, "required");
 8556 
 8557     // Create a byte shuffle mask from int shuffle mask
 8558     // only byte shuffle instruction available on these platforms
 8559 
 8560     // Duplicate and multiply each shuffle by 4
 8561     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8562     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8563     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8564     __ psllw($vtmp$$XMMRegister, 2);
 8565 
 8566     // Duplicate again to create 4 copies of byte index
 8567     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8568     __ psllw($dst$$XMMRegister, 8);
 8569     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8570 
 8571     // Add 3,2,1,0 to get alternate byte index
 8572     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8573     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8574   %}
 8575   ins_pipe( pipe_slow );
 8576 %}
 8577 
 8578 instruct rearrangeI(vec dst, vec shuffle) %{
 8579   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8580             UseAVX == 0);
 8581   match(Set dst (VectorRearrange dst shuffle));
 8582   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8583   ins_encode %{
 8584     assert(UseSSE >= 4, "required");
 8585     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8586   %}
 8587   ins_pipe( pipe_slow );
 8588 %}
 8589 
 8590 instruct loadShuffleI_avx(vec dst, vec src) %{
 8591   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8592             UseAVX > 0);
 8593   match(Set dst (VectorLoadShuffle src));
 8594   format %{ "vector_load_shuffle $dst, $src" %}
 8595   ins_encode %{
 8596     int vlen_enc = vector_length_encoding(this);
 8597     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8598   %}
 8599   ins_pipe( pipe_slow );
 8600 %}
 8601 
 8602 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8603   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8604             UseAVX > 0);
 8605   match(Set dst (VectorRearrange src shuffle));
 8606   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8607   ins_encode %{
 8608     int vlen_enc = vector_length_encoding(this);
 8609     BasicType bt = Matcher::vector_element_basic_type(this);
 8610     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8611   %}
 8612   ins_pipe( pipe_slow );
 8613 %}
 8614 
 8615 // LoadShuffle/Rearrange for Long and Double
 8616 
 8617 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8618   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8619             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8620   match(Set dst (VectorLoadShuffle src));
 8621   effect(TEMP dst, TEMP vtmp);
 8622   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8623   ins_encode %{
 8624     assert(UseAVX >= 2, "required");
 8625 
 8626     int vlen_enc = vector_length_encoding(this);
 8627     // Create a double word shuffle mask from long shuffle mask
 8628     // only double word shuffle instruction available on these platforms
 8629 
 8630     // Multiply each shuffle by two to get double word index
 8631     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8632     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8633 
 8634     // Duplicate each double word shuffle
 8635     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8636     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8637 
 8638     // Add one to get alternate double word index
 8639     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8640   %}
 8641   ins_pipe( pipe_slow );
 8642 %}
 8643 
 8644 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8645   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8646             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8647   match(Set dst (VectorRearrange src shuffle));
 8648   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8649   ins_encode %{
 8650     assert(UseAVX >= 2, "required");
 8651 
 8652     int vlen_enc = vector_length_encoding(this);
 8653     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8654   %}
 8655   ins_pipe( pipe_slow );
 8656 %}
 8657 
 8658 instruct loadShuffleL_evex(vec dst, vec src) %{
 8659   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8660             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8661   match(Set dst (VectorLoadShuffle src));
 8662   format %{ "vector_load_shuffle $dst, $src" %}
 8663   ins_encode %{
 8664     assert(UseAVX > 2, "required");
 8665 
 8666     int vlen_enc = vector_length_encoding(this);
 8667     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8668   %}
 8669   ins_pipe( pipe_slow );
 8670 %}
 8671 
 8672 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8673   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8674             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8675   match(Set dst (VectorRearrange src shuffle));
 8676   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8677   ins_encode %{
 8678     assert(UseAVX > 2, "required");
 8679 
 8680     int vlen_enc = vector_length_encoding(this);
 8681     if (vlen_enc == Assembler::AVX_128bit) {
 8682       vlen_enc = Assembler::AVX_256bit;
 8683     }
 8684     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8685   %}
 8686   ins_pipe( pipe_slow );
 8687 %}
 8688 
 8689 // --------------------------------- FMA --------------------------------------
 8690 // a * b + c
 8691 
 8692 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8693   match(Set c (FmaVF  c (Binary a b)));
 8694   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8695   ins_cost(150);
 8696   ins_encode %{
 8697     assert(UseFMA, "not enabled");
 8698     int vlen_enc = vector_length_encoding(this);
 8699     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8700   %}
 8701   ins_pipe( pipe_slow );
 8702 %}
 8703 
 8704 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8705   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8706   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8707   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8708   ins_cost(150);
 8709   ins_encode %{
 8710     assert(UseFMA, "not enabled");
 8711     int vlen_enc = vector_length_encoding(this);
 8712     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8713   %}
 8714   ins_pipe( pipe_slow );
 8715 %}
 8716 
 8717 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8718   match(Set c (FmaVD  c (Binary a b)));
 8719   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8720   ins_cost(150);
 8721   ins_encode %{
 8722     assert(UseFMA, "not enabled");
 8723     int vlen_enc = vector_length_encoding(this);
 8724     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8725   %}
 8726   ins_pipe( pipe_slow );
 8727 %}
 8728 
 8729 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8730   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8731   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8732   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8733   ins_cost(150);
 8734   ins_encode %{
 8735     assert(UseFMA, "not enabled");
 8736     int vlen_enc = vector_length_encoding(this);
 8737     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8738   %}
 8739   ins_pipe( pipe_slow );
 8740 %}
 8741 
 8742 // --------------------------------- Vector Multiply Add --------------------------------------
 8743 
 8744 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8745   predicate(UseAVX == 0);
 8746   match(Set dst (MulAddVS2VI dst src1));
 8747   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8748   ins_encode %{
 8749     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8750   %}
 8751   ins_pipe( pipe_slow );
 8752 %}
 8753 
 8754 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8755   predicate(UseAVX > 0);
 8756   match(Set dst (MulAddVS2VI src1 src2));
 8757   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8758   ins_encode %{
 8759     int vlen_enc = vector_length_encoding(this);
 8760     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8761   %}
 8762   ins_pipe( pipe_slow );
 8763 %}
 8764 
 8765 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8766 
 8767 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8768   predicate(VM_Version::supports_avx512_vnni());
 8769   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8770   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8771   ins_encode %{
 8772     assert(UseAVX > 2, "required");
 8773     int vlen_enc = vector_length_encoding(this);
 8774     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8775   %}
 8776   ins_pipe( pipe_slow );
 8777   ins_cost(10);
 8778 %}
 8779 
 8780 // --------------------------------- PopCount --------------------------------------
 8781 
 8782 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8783   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8784   match(Set dst (PopCountVI src));
 8785   match(Set dst (PopCountVL src));
 8786   format %{ "vector_popcount_integral $dst, $src" %}
 8787   ins_encode %{
 8788     int opcode = this->ideal_Opcode();
 8789     int vlen_enc = vector_length_encoding(this, $src);
 8790     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8791     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794 %}
 8795 
 8796 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8797   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8798   match(Set dst (PopCountVI src mask));
 8799   match(Set dst (PopCountVL src mask));
 8800   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8801   ins_encode %{
 8802     int vlen_enc = vector_length_encoding(this, $src);
 8803     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8804     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8805     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8806   %}
 8807   ins_pipe( pipe_slow );
 8808 %}
 8809 
 8810 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8811   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8812   match(Set dst (PopCountVI src));
 8813   match(Set dst (PopCountVL src));
 8814   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8815   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8816   ins_encode %{
 8817     int opcode = this->ideal_Opcode();
 8818     int vlen_enc = vector_length_encoding(this, $src);
 8819     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8820     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8821                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8822   %}
 8823   ins_pipe( pipe_slow );
 8824 %}
 8825 
 8826 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8827 
 8828 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8829   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8830                                               Matcher::vector_length_in_bytes(n->in(1))));
 8831   match(Set dst (CountTrailingZerosV src));
 8832   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8833   ins_cost(400);
 8834   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 8835   ins_encode %{
 8836     int vlen_enc = vector_length_encoding(this, $src);
 8837     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8838     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8839                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8840   %}
 8841   ins_pipe( pipe_slow );
 8842 %}
 8843 
 8844 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8845   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8846             VM_Version::supports_avx512cd() &&
 8847             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8848   match(Set dst (CountTrailingZerosV src));
 8849   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8850   ins_cost(400);
 8851   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8852   ins_encode %{
 8853     int vlen_enc = vector_length_encoding(this, $src);
 8854     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8855     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8856                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8857   %}
 8858   ins_pipe( pipe_slow );
 8859 %}
 8860 
 8861 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8862   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8863   match(Set dst (CountTrailingZerosV src));
 8864   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8865   ins_cost(400);
 8866   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8867   ins_encode %{
 8868     int vlen_enc = vector_length_encoding(this, $src);
 8869     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8870     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8871                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8872                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8873   %}
 8874   ins_pipe( pipe_slow );
 8875 %}
 8876 
 8877 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8878   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8879   match(Set dst (CountTrailingZerosV src));
 8880   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8881   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8882   ins_encode %{
 8883     int vlen_enc = vector_length_encoding(this, $src);
 8884     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8885     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8886                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8887   %}
 8888   ins_pipe( pipe_slow );
 8889 %}
 8890 
 8891 
 8892 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 8893 
 8894 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8895   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8896   effect(TEMP dst);
 8897   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8898   ins_encode %{
 8899     int vector_len = vector_length_encoding(this);
 8900     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8901   %}
 8902   ins_pipe( pipe_slow );
 8903 %}
 8904 
 8905 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8906   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8907   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8908   effect(TEMP dst);
 8909   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8910   ins_encode %{
 8911     int vector_len = vector_length_encoding(this);
 8912     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8913   %}
 8914   ins_pipe( pipe_slow );
 8915 %}
 8916 
 8917 // --------------------------------- Rotation Operations ----------------------------------
 8918 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8919   match(Set dst (RotateLeftV src shift));
 8920   match(Set dst (RotateRightV src shift));
 8921   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8922   ins_encode %{
 8923     int opcode      = this->ideal_Opcode();
 8924     int vector_len  = vector_length_encoding(this);
 8925     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8926     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8927   %}
 8928   ins_pipe( pipe_slow );
 8929 %}
 8930 
 8931 instruct vprorate(vec dst, vec src, vec shift) %{
 8932   match(Set dst (RotateLeftV src shift));
 8933   match(Set dst (RotateRightV src shift));
 8934   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8935   ins_encode %{
 8936     int opcode      = this->ideal_Opcode();
 8937     int vector_len  = vector_length_encoding(this);
 8938     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8939     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8940   %}
 8941   ins_pipe( pipe_slow );
 8942 %}
 8943 
 8944 // ---------------------------------- Masked Operations ------------------------------------
 8945 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 8946   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 8947   match(Set dst (LoadVectorMasked mem mask));
 8948   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8949   ins_encode %{
 8950     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8951     int vlen_enc = vector_length_encoding(this);
 8952     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 8953   %}
 8954   ins_pipe( pipe_slow );
 8955 %}
 8956 
 8957 
 8958 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 8959   predicate(n->in(3)->bottom_type()->isa_vectmask());
 8960   match(Set dst (LoadVectorMasked mem mask));
 8961   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8962   ins_encode %{
 8963     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 8964     int vector_len = vector_length_encoding(this);
 8965     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 8966   %}
 8967   ins_pipe( pipe_slow );
 8968 %}
 8969 
 8970 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 8971   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8972   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8973   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8974   ins_encode %{
 8975     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8976     int vlen_enc = vector_length_encoding(src_node);
 8977     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 8978     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8979   %}
 8980   ins_pipe( pipe_slow );
 8981 %}
 8982 
 8983 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 8984   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8985   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8986   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8987   ins_encode %{
 8988     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8989     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 8990     int vlen_enc = vector_length_encoding(src_node);
 8991     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 8992   %}
 8993   ins_pipe( pipe_slow );
 8994 %}
 8995 
 8996 #ifdef _LP64
 8997 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 8998   match(Set addr (VerifyVectorAlignment addr mask));
 8999   effect(KILL cr);
 9000   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9001   ins_encode %{
 9002     Label Lskip;
 9003     // check if masked bits of addr are zero
 9004     __ testq($addr$$Register, $mask$$constant);
 9005     __ jccb(Assembler::equal, Lskip);
 9006     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9007     __ bind(Lskip);
 9008   %}
 9009   ins_pipe(pipe_slow);
 9010 %}
 9011 
 9012 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9013   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9014   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9015   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9016   ins_encode %{
 9017     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9018     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9019 
 9020     Label DONE;
 9021     int vlen_enc = vector_length_encoding(this, $src1);
 9022     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9023 
 9024     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9025     __ mov64($dst$$Register, -1L);
 9026     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9027     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9028     __ jccb(Assembler::carrySet, DONE);
 9029     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9030     __ notq($dst$$Register);
 9031     __ tzcntq($dst$$Register, $dst$$Register);
 9032     __ bind(DONE);
 9033   %}
 9034   ins_pipe( pipe_slow );
 9035 %}
 9036 
 9037 
 9038 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9039   match(Set dst (VectorMaskGen len));
 9040   effect(TEMP temp, KILL cr);
 9041   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9042   ins_encode %{
 9043     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9044   %}
 9045   ins_pipe( pipe_slow );
 9046 %}
 9047 
 9048 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9049   match(Set dst (VectorMaskGen len));
 9050   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9051   effect(TEMP temp);
 9052   ins_encode %{
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9054     __ kmovql($dst$$KRegister, $temp$$Register);
 9055   %}
 9056   ins_pipe( pipe_slow );
 9057 %}
 9058 
 9059 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9060   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9061   match(Set dst (VectorMaskToLong mask));
 9062   effect(TEMP dst, KILL cr);
 9063   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9064   ins_encode %{
 9065     int opcode = this->ideal_Opcode();
 9066     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9067     int mask_len = Matcher::vector_length(this, $mask);
 9068     int mask_size = mask_len * type2aelembytes(mbt);
 9069     int vlen_enc = vector_length_encoding(this, $mask);
 9070     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9071                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9072   %}
 9073   ins_pipe( pipe_slow );
 9074 %}
 9075 
 9076 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9077   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9078   match(Set dst (VectorMaskToLong mask));
 9079   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9080   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9081   ins_encode %{
 9082     int opcode = this->ideal_Opcode();
 9083     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9084     int mask_len = Matcher::vector_length(this, $mask);
 9085     int vlen_enc = vector_length_encoding(this, $mask);
 9086     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9087                              $dst$$Register, mask_len, mbt, vlen_enc);
 9088   %}
 9089   ins_pipe( pipe_slow );
 9090 %}
 9091 
 9092 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9093   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9094   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9095   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9096   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9097   ins_encode %{
 9098     int opcode = this->ideal_Opcode();
 9099     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9100     int mask_len = Matcher::vector_length(this, $mask);
 9101     int vlen_enc = vector_length_encoding(this, $mask);
 9102     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9103                              $dst$$Register, mask_len, mbt, vlen_enc);
 9104   %}
 9105   ins_pipe( pipe_slow );
 9106 %}
 9107 
 9108 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9109   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9110   match(Set dst (VectorMaskTrueCount mask));
 9111   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9112   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9113   ins_encode %{
 9114     int opcode = this->ideal_Opcode();
 9115     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9116     int mask_len = Matcher::vector_length(this, $mask);
 9117     int mask_size = mask_len * type2aelembytes(mbt);
 9118     int vlen_enc = vector_length_encoding(this, $mask);
 9119     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9120                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9121   %}
 9122   ins_pipe( pipe_slow );
 9123 %}
 9124 
 9125 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9126   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9127   match(Set dst (VectorMaskTrueCount mask));
 9128   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9129   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9130   ins_encode %{
 9131     int opcode = this->ideal_Opcode();
 9132     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9133     int mask_len = Matcher::vector_length(this, $mask);
 9134     int vlen_enc = vector_length_encoding(this, $mask);
 9135     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9136                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9137   %}
 9138   ins_pipe( pipe_slow );
 9139 %}
 9140 
 9141 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9142   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9143   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9144   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9145   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9146   ins_encode %{
 9147     int opcode = this->ideal_Opcode();
 9148     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9149     int mask_len = Matcher::vector_length(this, $mask);
 9150     int vlen_enc = vector_length_encoding(this, $mask);
 9151     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9152                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9153   %}
 9154   ins_pipe( pipe_slow );
 9155 %}
 9156 
 9157 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9158   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9159   match(Set dst (VectorMaskFirstTrue mask));
 9160   match(Set dst (VectorMaskLastTrue mask));
 9161   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9162   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9163   ins_encode %{
 9164     int opcode = this->ideal_Opcode();
 9165     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9166     int mask_len = Matcher::vector_length(this, $mask);
 9167     int mask_size = mask_len * type2aelembytes(mbt);
 9168     int vlen_enc = vector_length_encoding(this, $mask);
 9169     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9170                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9171   %}
 9172   ins_pipe( pipe_slow );
 9173 %}
 9174 
 9175 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9176   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9177   match(Set dst (VectorMaskFirstTrue mask));
 9178   match(Set dst (VectorMaskLastTrue mask));
 9179   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9180   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9181   ins_encode %{
 9182     int opcode = this->ideal_Opcode();
 9183     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9184     int mask_len = Matcher::vector_length(this, $mask);
 9185     int vlen_enc = vector_length_encoding(this, $mask);
 9186     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9187                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9188   %}
 9189   ins_pipe( pipe_slow );
 9190 %}
 9191 
 9192 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9193   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9194   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9195   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9196   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9197   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9198   ins_encode %{
 9199     int opcode = this->ideal_Opcode();
 9200     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9201     int mask_len = Matcher::vector_length(this, $mask);
 9202     int vlen_enc = vector_length_encoding(this, $mask);
 9203     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9204                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9205   %}
 9206   ins_pipe( pipe_slow );
 9207 %}
 9208 
 9209 // --------------------------------- Compress/Expand Operations ---------------------------
 9210 #ifdef _LP64
 9211 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9212   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9213   match(Set dst (CompressV src mask));
 9214   match(Set dst (ExpandV src mask));
 9215   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9216   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9217   ins_encode %{
 9218     int opcode = this->ideal_Opcode();
 9219     int vlen_enc = vector_length_encoding(this);
 9220     BasicType bt  = Matcher::vector_element_basic_type(this);
 9221     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9222                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9223   %}
 9224   ins_pipe( pipe_slow );
 9225 %}
 9226 #endif
 9227 
 9228 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9229   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9230   match(Set dst (CompressV src mask));
 9231   match(Set dst (ExpandV src mask));
 9232   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9233   ins_encode %{
 9234     int opcode = this->ideal_Opcode();
 9235     int vector_len = vector_length_encoding(this);
 9236     BasicType bt  = Matcher::vector_element_basic_type(this);
 9237     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9238   %}
 9239   ins_pipe( pipe_slow );
 9240 %}
 9241 
 9242 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9243   match(Set dst (CompressM mask));
 9244   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9245   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9246   ins_encode %{
    assert(this->in(1)->bottom_type()->isa_vectmask(), "expected a vector mask");
 9248     int mask_len = Matcher::vector_length(this);
 9249     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9250   %}
 9251   ins_pipe( pipe_slow );
 9252 %}
 9253 
 9254 #endif // _LP64
 9255 
 9256 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9257 
 9258 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9259   predicate(!VM_Version::supports_gfni());
 9260   match(Set dst (ReverseV src));
 9261   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9262   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9263   ins_encode %{
 9264     int vec_enc = vector_length_encoding(this);
 9265     BasicType bt = Matcher::vector_element_basic_type(this);
 9266     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9267                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9268   %}
 9269   ins_pipe( pipe_slow );
 9270 %}
 9271 
 9272 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9273   predicate(VM_Version::supports_gfni());
 9274   match(Set dst (ReverseV src));
 9275   effect(TEMP dst, TEMP xtmp);
 9276   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9277   ins_encode %{
 9278     int vec_enc = vector_length_encoding(this);
 9279     BasicType bt  = Matcher::vector_element_basic_type(this);
 9280     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9281     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9282                                $xtmp$$XMMRegister);
 9283   %}
 9284   ins_pipe( pipe_slow );
 9285 %}
 9286 
 9287 instruct vreverse_byte_reg(vec dst, vec src) %{
 9288   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9289   match(Set dst (ReverseBytesV src));
 9290   effect(TEMP dst);
 9291   format %{ "vector_reverse_byte $dst, $src" %}
 9292   ins_encode %{
 9293     int vec_enc = vector_length_encoding(this);
 9294     BasicType bt = Matcher::vector_element_basic_type(this);
 9295     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9296   %}
 9297   ins_pipe( pipe_slow );
 9298 %}
 9299 
 9300 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9301   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9302   match(Set dst (ReverseBytesV src));
 9303   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9304   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9305   ins_encode %{
 9306     int vec_enc = vector_length_encoding(this);
 9307     BasicType bt = Matcher::vector_element_basic_type(this);
 9308     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9309                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9310   %}
 9311   ins_pipe( pipe_slow );
 9312 %}
 9313 
 9314 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9315 
 9316 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9317   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9318                                               Matcher::vector_length_in_bytes(n->in(1))));
 9319   match(Set dst (CountLeadingZerosV src));
 9320   format %{ "vector_count_leading_zeros $dst, $src" %}
 9321   ins_encode %{
 9322      int vlen_enc = vector_length_encoding(this, $src);
 9323      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9324      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9325                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9326   %}
 9327   ins_pipe( pipe_slow );
 9328 %}
 9329 
 9330 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9331   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9332                                               Matcher::vector_length_in_bytes(n->in(1))));
 9333   match(Set dst (CountLeadingZerosV src mask));
 9334   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9335   ins_encode %{
 9336     int vlen_enc = vector_length_encoding(this, $src);
 9337     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9338     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9339     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9340                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9341   %}
 9342   ins_pipe( pipe_slow );
 9343 %}
 9344 
 9345 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9346   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9347             VM_Version::supports_avx512cd() &&
 9348             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9349   match(Set dst (CountLeadingZerosV src));
 9350   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9351   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9352   ins_encode %{
 9353     int vlen_enc = vector_length_encoding(this, $src);
 9354     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9355     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9356                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9357   %}
 9358   ins_pipe( pipe_slow );
 9359 %}
 9360 
 9361 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9362   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9363   match(Set dst (CountLeadingZerosV src));
 9364   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9365   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9366   ins_encode %{
 9367     int vlen_enc = vector_length_encoding(this, $src);
 9368     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9369     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9370                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9371                                        $rtmp$$Register, true, vlen_enc);
 9372   %}
 9373   ins_pipe( pipe_slow );
 9374 %}
 9375 
 9376 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9377   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9378             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9379   match(Set dst (CountLeadingZerosV src));
 9380   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9381   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9382   ins_encode %{
 9383     int vlen_enc = vector_length_encoding(this, $src);
 9384     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9385     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9386                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9387   %}
 9388   ins_pipe( pipe_slow );
 9389 %}
 9390 
 9391 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9392   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9393             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9394   match(Set dst (CountLeadingZerosV src));
 9395   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9396   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9397   ins_encode %{
 9398     int vlen_enc = vector_length_encoding(this, $src);
 9399     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9400     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9401                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9402   %}
 9403   ins_pipe( pipe_slow );
 9404 %}
 9405 
 9406 // ---------------------------------- Vector Masked Operations ------------------------------------
 9407 
 9408 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9409   match(Set dst (AddVB (Binary dst src2) mask));
 9410   match(Set dst (AddVS (Binary dst src2) mask));
 9411   match(Set dst (AddVI (Binary dst src2) mask));
 9412   match(Set dst (AddVL (Binary dst src2) mask));
 9413   match(Set dst (AddVF (Binary dst src2) mask));
 9414   match(Set dst (AddVD (Binary dst src2) mask));
 9415   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9416   ins_encode %{
 9417     int vlen_enc = vector_length_encoding(this);
 9418     BasicType bt = Matcher::vector_element_basic_type(this);
 9419     int opc = this->ideal_Opcode();
 9420     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9421                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9422   %}
 9423   ins_pipe( pipe_slow );
 9424 %}
 9425 
 9426 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9427   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9428   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9429   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9430   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9431   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9432   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9433   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9434   ins_encode %{
 9435     int vlen_enc = vector_length_encoding(this);
 9436     BasicType bt = Matcher::vector_element_basic_type(this);
 9437     int opc = this->ideal_Opcode();
 9438     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9439                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9440   %}
 9441   ins_pipe( pipe_slow );
 9442 %}
 9443 
 9444 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9445   match(Set dst (XorV (Binary dst src2) mask));
 9446   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9447   ins_encode %{
 9448     int vlen_enc = vector_length_encoding(this);
 9449     BasicType bt = Matcher::vector_element_basic_type(this);
 9450     int opc = this->ideal_Opcode();
 9451     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9452                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9453   %}
 9454   ins_pipe( pipe_slow );
 9455 %}
 9456 
 9457 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9458   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9459   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9460   ins_encode %{
 9461     int vlen_enc = vector_length_encoding(this);
 9462     BasicType bt = Matcher::vector_element_basic_type(this);
 9463     int opc = this->ideal_Opcode();
 9464     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9465                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9466   %}
 9467   ins_pipe( pipe_slow );
 9468 %}
 9469 
 9470 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9471   match(Set dst (OrV (Binary dst src2) mask));
 9472   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9473   ins_encode %{
 9474     int vlen_enc = vector_length_encoding(this);
 9475     BasicType bt = Matcher::vector_element_basic_type(this);
 9476     int opc = this->ideal_Opcode();
 9477     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9478                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9479   %}
 9480   ins_pipe( pipe_slow );
 9481 %}
 9482 
 9483 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9484   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9485   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9486   ins_encode %{
 9487     int vlen_enc = vector_length_encoding(this);
 9488     BasicType bt = Matcher::vector_element_basic_type(this);
 9489     int opc = this->ideal_Opcode();
 9490     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9491                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9492   %}
 9493   ins_pipe( pipe_slow );
 9494 %}
 9495 
 9496 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9497   match(Set dst (AndV (Binary dst src2) mask));
 9498   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9499   ins_encode %{
 9500     int vlen_enc = vector_length_encoding(this);
 9501     BasicType bt = Matcher::vector_element_basic_type(this);
 9502     int opc = this->ideal_Opcode();
 9503     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9504                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9505   %}
 9506   ins_pipe( pipe_slow );
 9507 %}
 9508 
 9509 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9510   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9511   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9512   ins_encode %{
 9513     int vlen_enc = vector_length_encoding(this);
 9514     BasicType bt = Matcher::vector_element_basic_type(this);
 9515     int opc = this->ideal_Opcode();
 9516     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9517                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9518   %}
 9519   ins_pipe( pipe_slow );
 9520 %}
 9521 
 9522 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9523   match(Set dst (SubVB (Binary dst src2) mask));
 9524   match(Set dst (SubVS (Binary dst src2) mask));
 9525   match(Set dst (SubVI (Binary dst src2) mask));
 9526   match(Set dst (SubVL (Binary dst src2) mask));
 9527   match(Set dst (SubVF (Binary dst src2) mask));
 9528   match(Set dst (SubVD (Binary dst src2) mask));
 9529   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9530   ins_encode %{
 9531     int vlen_enc = vector_length_encoding(this);
 9532     BasicType bt = Matcher::vector_element_basic_type(this);
 9533     int opc = this->ideal_Opcode();
 9534     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9535                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9536   %}
 9537   ins_pipe( pipe_slow );
 9538 %}
 9539 
 9540 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9541   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9542   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9543   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9544   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9545   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9546   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9547   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9548   ins_encode %{
 9549     int vlen_enc = vector_length_encoding(this);
 9550     BasicType bt = Matcher::vector_element_basic_type(this);
 9551     int opc = this->ideal_Opcode();
 9552     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9553                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9554   %}
 9555   ins_pipe( pipe_slow );
 9556 %}
 9557 
 9558 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9559   match(Set dst (MulVS (Binary dst src2) mask));
 9560   match(Set dst (MulVI (Binary dst src2) mask));
 9561   match(Set dst (MulVL (Binary dst src2) mask));
 9562   match(Set dst (MulVF (Binary dst src2) mask));
 9563   match(Set dst (MulVD (Binary dst src2) mask));
 9564   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9565   ins_encode %{
 9566     int vlen_enc = vector_length_encoding(this);
 9567     BasicType bt = Matcher::vector_element_basic_type(this);
 9568     int opc = this->ideal_Opcode();
 9569     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9570                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9571   %}
 9572   ins_pipe( pipe_slow );
 9573 %}
 9574 
 9575 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9576   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9577   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9578   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9579   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9580   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9581   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9582   ins_encode %{
 9583     int vlen_enc = vector_length_encoding(this);
 9584     BasicType bt = Matcher::vector_element_basic_type(this);
 9585     int opc = this->ideal_Opcode();
 9586     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9587                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9588   %}
 9589   ins_pipe( pipe_slow );
 9590 %}
 9591 
 9592 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9593   match(Set dst (SqrtVF dst mask));
 9594   match(Set dst (SqrtVD dst mask));
 9595   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9596   ins_encode %{
 9597     int vlen_enc = vector_length_encoding(this);
 9598     BasicType bt = Matcher::vector_element_basic_type(this);
 9599     int opc = this->ideal_Opcode();
 9600     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9601                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9602   %}
 9603   ins_pipe( pipe_slow );
 9604 %}
 9605 
 9606 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9607   match(Set dst (DivVF (Binary dst src2) mask));
 9608   match(Set dst (DivVD (Binary dst src2) mask));
 9609   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9610   ins_encode %{
 9611     int vlen_enc = vector_length_encoding(this);
 9612     BasicType bt = Matcher::vector_element_basic_type(this);
 9613     int opc = this->ideal_Opcode();
 9614     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9615                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9616   %}
 9617   ins_pipe( pipe_slow );
 9618 %}
 9619 
 9620 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9621   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9622   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9623   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9624   ins_encode %{
 9625     int vlen_enc = vector_length_encoding(this);
 9626     BasicType bt = Matcher::vector_element_basic_type(this);
 9627     int opc = this->ideal_Opcode();
 9628     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9629                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9630   %}
 9631   ins_pipe( pipe_slow );
 9632 %}
 9633 
 9634 
 9635 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9636   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9637   match(Set dst (RotateRightV (Binary dst shift) mask));
 9638   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9639   ins_encode %{
 9640     int vlen_enc = vector_length_encoding(this);
 9641     BasicType bt = Matcher::vector_element_basic_type(this);
 9642     int opc = this->ideal_Opcode();
 9643     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9644                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9645   %}
 9646   ins_pipe( pipe_slow );
 9647 %}
 9648 
 9649 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9650   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9651   match(Set dst (RotateRightV (Binary dst src2) mask));
 9652   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9653   ins_encode %{
 9654     int vlen_enc = vector_length_encoding(this);
 9655     BasicType bt = Matcher::vector_element_basic_type(this);
 9656     int opc = this->ideal_Opcode();
 9657     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9658                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9659   %}
 9660   ins_pipe( pipe_slow );
 9661 %}
 9662 
 9663 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9664   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9665   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9666   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9667   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9668   ins_encode %{
 9669     int vlen_enc = vector_length_encoding(this);
 9670     BasicType bt = Matcher::vector_element_basic_type(this);
 9671     int opc = this->ideal_Opcode();
 9672     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9673                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9674   %}
 9675   ins_pipe( pipe_slow );
 9676 %}
 9677 
 9678 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9679   predicate(!n->as_ShiftV()->is_var_shift());
 9680   match(Set dst (LShiftVS (Binary dst src2) mask));
 9681   match(Set dst (LShiftVI (Binary dst src2) mask));
 9682   match(Set dst (LShiftVL (Binary dst src2) mask));
 9683   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9684   ins_encode %{
 9685     int vlen_enc = vector_length_encoding(this);
 9686     BasicType bt = Matcher::vector_element_basic_type(this);
 9687     int opc = this->ideal_Opcode();
 9688     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9689                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9690   %}
 9691   ins_pipe( pipe_slow );
 9692 %}
 9693 
 9694 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9695   predicate(n->as_ShiftV()->is_var_shift());
 9696   match(Set dst (LShiftVS (Binary dst src2) mask));
 9697   match(Set dst (LShiftVI (Binary dst src2) mask));
 9698   match(Set dst (LShiftVL (Binary dst src2) mask));
 9699   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9700   ins_encode %{
 9701     int vlen_enc = vector_length_encoding(this);
 9702     BasicType bt = Matcher::vector_element_basic_type(this);
 9703     int opc = this->ideal_Opcode();
 9704     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9705                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9706   %}
 9707   ins_pipe( pipe_slow );
 9708 %}
 9709 
 9710 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9711   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9712   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9713   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9714   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9715   ins_encode %{
 9716     int vlen_enc = vector_length_encoding(this);
 9717     BasicType bt = Matcher::vector_element_basic_type(this);
 9718     int opc = this->ideal_Opcode();
 9719     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9720                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9721   %}
 9722   ins_pipe( pipe_slow );
 9723 %}
 9724 
 9725 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9726   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9727   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9728   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9729   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9730   ins_encode %{
 9731     int vlen_enc = vector_length_encoding(this);
 9732     BasicType bt = Matcher::vector_element_basic_type(this);
 9733     int opc = this->ideal_Opcode();
 9734     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9735                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9736   %}
 9737   ins_pipe( pipe_slow );
 9738 %}
 9739 
 9740 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9741   predicate(!n->as_ShiftV()->is_var_shift());
 9742   match(Set dst (RShiftVS (Binary dst src2) mask));
 9743   match(Set dst (RShiftVI (Binary dst src2) mask));
 9744   match(Set dst (RShiftVL (Binary dst src2) mask));
 9745   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9746   ins_encode %{
 9747     int vlen_enc = vector_length_encoding(this);
 9748     BasicType bt = Matcher::vector_element_basic_type(this);
 9749     int opc = this->ideal_Opcode();
 9750     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9751                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9752   %}
 9753   ins_pipe( pipe_slow );
 9754 %}
 9755 
 9756 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9757   predicate(n->as_ShiftV()->is_var_shift());
 9758   match(Set dst (RShiftVS (Binary dst src2) mask));
 9759   match(Set dst (RShiftVI (Binary dst src2) mask));
 9760   match(Set dst (RShiftVL (Binary dst src2) mask));
 9761   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9762   ins_encode %{
 9763     int vlen_enc = vector_length_encoding(this);
 9764     BasicType bt = Matcher::vector_element_basic_type(this);
 9765     int opc = this->ideal_Opcode();
 9766     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9767                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9768   %}
 9769   ins_pipe( pipe_slow );
 9770 %}
 9771 
 9772 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9773   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9774   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9775   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9776   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9777   ins_encode %{
 9778     int vlen_enc = vector_length_encoding(this);
 9779     BasicType bt = Matcher::vector_element_basic_type(this);
 9780     int opc = this->ideal_Opcode();
 9781     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9782                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9783   %}
 9784   ins_pipe( pipe_slow );
 9785 %}
 9786 
 9787 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9788   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9789   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9790   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9791   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9792   ins_encode %{
 9793     int vlen_enc = vector_length_encoding(this);
 9794     BasicType bt = Matcher::vector_element_basic_type(this);
 9795     int opc = this->ideal_Opcode();
 9796     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9797                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9798   %}
 9799   ins_pipe( pipe_slow );
 9800 %}
 9801 
 9802 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9803   predicate(!n->as_ShiftV()->is_var_shift());
 9804   match(Set dst (URShiftVS (Binary dst src2) mask));
 9805   match(Set dst (URShiftVI (Binary dst src2) mask));
 9806   match(Set dst (URShiftVL (Binary dst src2) mask));
 9807   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9808   ins_encode %{
 9809     int vlen_enc = vector_length_encoding(this);
 9810     BasicType bt = Matcher::vector_element_basic_type(this);
 9811     int opc = this->ideal_Opcode();
 9812     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9813                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9814   %}
 9815   ins_pipe( pipe_slow );
 9816 %}
 9817 
 9818 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9819   predicate(n->as_ShiftV()->is_var_shift());
 9820   match(Set dst (URShiftVS (Binary dst src2) mask));
 9821   match(Set dst (URShiftVI (Binary dst src2) mask));
 9822   match(Set dst (URShiftVL (Binary dst src2) mask));
 9823   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9824   ins_encode %{
 9825     int vlen_enc = vector_length_encoding(this);
 9826     BasicType bt = Matcher::vector_element_basic_type(this);
 9827     int opc = this->ideal_Opcode();
 9828     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9829                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9830   %}
 9831   ins_pipe( pipe_slow );
 9832 %}
 9833 
 9834 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9835   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9836   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9837   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9838   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9839   ins_encode %{
 9840     int vlen_enc = vector_length_encoding(this);
 9841     BasicType bt = Matcher::vector_element_basic_type(this);
 9842     int opc = this->ideal_Opcode();
 9843     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9844                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9845   %}
 9846   ins_pipe( pipe_slow );
 9847 %}
 9848 
 9849 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
 9850   match(Set dst (MaxV (Binary dst src2) mask));
 9851   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9852   ins_encode %{
 9853     int vlen_enc = vector_length_encoding(this);
 9854     BasicType bt = Matcher::vector_element_basic_type(this);
 9855     int opc = this->ideal_Opcode();
 9856     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9857                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9858   %}
 9859   ins_pipe( pipe_slow );
 9860 %}
 9861 
 9862 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
 9863   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
 9864   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9865   ins_encode %{
 9866     int vlen_enc = vector_length_encoding(this);
 9867     BasicType bt = Matcher::vector_element_basic_type(this);
 9868     int opc = this->ideal_Opcode();
 9869     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9870                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9871   %}
 9872   ins_pipe( pipe_slow );
 9873 %}
 9874 
 9875 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
 9876   match(Set dst (MinV (Binary dst src2) mask));
 9877   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9878   ins_encode %{
 9879     int vlen_enc = vector_length_encoding(this);
 9880     BasicType bt = Matcher::vector_element_basic_type(this);
 9881     int opc = this->ideal_Opcode();
 9882     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9883                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9884   %}
 9885   ins_pipe( pipe_slow );
 9886 %}
 9887 
 9888 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
 9889   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
 9890   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9891   ins_encode %{
 9892     int vlen_enc = vector_length_encoding(this);
 9893     BasicType bt = Matcher::vector_element_basic_type(this);
 9894     int opc = this->ideal_Opcode();
 9895     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9896                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9897   %}
 9898   ins_pipe( pipe_slow );
 9899 %}
 9900 
 9901 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
 9902   match(Set dst (VectorRearrange (Binary dst src2) mask));
 9903   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
 9904   ins_encode %{
 9905     int vlen_enc = vector_length_encoding(this);
 9906     BasicType bt = Matcher::vector_element_basic_type(this);
 9907     int opc = this->ideal_Opcode();
 9908     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9909                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
 9910   %}
 9911   ins_pipe( pipe_slow );
 9912 %}
 9913 
 9914 instruct vabs_masked(vec dst, kReg mask) %{
 9915   match(Set dst (AbsVB dst mask));
 9916   match(Set dst (AbsVS dst mask));
 9917   match(Set dst (AbsVI dst mask));
 9918   match(Set dst (AbsVL dst mask));
 9919   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
 9920   ins_encode %{
 9921     int vlen_enc = vector_length_encoding(this);
 9922     BasicType bt = Matcher::vector_element_basic_type(this);
 9923     int opc = this->ideal_Opcode();
 9924     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9925                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9926   %}
 9927   ins_pipe( pipe_slow );
 9928 %}
 9929 
 9930 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
 9931   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
 9932   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
 9933   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9934   ins_encode %{
 9935     assert(UseFMA, "Needs FMA instruction support.");
 9936     int vlen_enc = vector_length_encoding(this);
 9937     BasicType bt = Matcher::vector_element_basic_type(this);
 9938     int opc = this->ideal_Opcode();
 9939     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9940                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
 9941   %}
 9942   ins_pipe( pipe_slow );
 9943 %}
 9944 
 9945 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
 9946   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
 9947   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
 9948   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9949   ins_encode %{
 9950     assert(UseFMA, "Needs FMA instruction support.");
 9951     int vlen_enc = vector_length_encoding(this);
 9952     BasicType bt = Matcher::vector_element_basic_type(this);
 9953     int opc = this->ideal_Opcode();
 9954     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9955                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
 9956   %}
 9957   ins_pipe( pipe_slow );
 9958 %}
 9959 
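      // Masked vector compare producing a predicate (k) register. The BoolTest
      // condition carried by the node is translated into an EVEX comparison
      // predicate per element type; for integral types it also selects a signed
      // or unsigned compare.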
 9960 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
 9961   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
 9962   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
 9963   ins_encode %{
 9964     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 9965     int vlen_enc = vector_length_encoding(this, $src1);
 9966     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9967 
 9968     // Select the comparison instruction based on the element type of src1.
 9969     switch (src1_elem_bt) {
 9970       case T_BYTE: {
 9971         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9972         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9973         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9974         break;
 9975       }
 9976       case T_SHORT: {
 9977         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9978         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9979         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9980         break;
 9981       }
 9982       case T_INT: {
 9983         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9984         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9985         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9986         break;
 9987       }
 9988       case T_LONG: {
 9989         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9990         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9991         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9992         break;
 9993       }
 9994       case T_FLOAT: {
 9995         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9996         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9997         break;
 9998       }
 9999       case T_DOUBLE: {
10000         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10001         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10002         break;
10003       }
10004       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10005     }
10006   %}
10007   ins_pipe( pipe_slow );
10008 %}
10009 
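      // MaskAll broadcasts a scalar boolean condition into every lane of a
      // predicate (k) register mask.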
10010 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10011   predicate(Matcher::vector_length(n) <= 32);
10012   match(Set dst (MaskAll src));
10013   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10014   ins_encode %{
10015     int mask_len = Matcher::vector_length(this);
10016     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10017   %}
10018   ins_pipe( pipe_slow );
10019 %}
10020 
10021 #ifdef _LP64
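      // Mask negation: XorVMask with MaskAll(-1), i.e. an all-ones mask, is a
      // mask-wide NOT. The sub-8-lane form below uses the k/GPR temporaries to
      // complement only the mask bits that are in range.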
10022 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10023   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10024   match(Set dst (XorVMask src (MaskAll cnt)));
10025   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10026   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
10027   ins_encode %{
10028     uint masklen = Matcher::vector_length(this);
10029     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10030   %}
10031   ins_pipe( pipe_slow );
10032 %}
10033 
10034 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10035   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10036             (Matcher::vector_length(n) == 16) ||
10037             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10038   match(Set dst (XorVMask src (MaskAll cnt)));
10039   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10040   ins_encode %{
10041     uint masklen = Matcher::vector_length(this);
10042     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10043   %}
10044   ins_pipe( pipe_slow );
10045 %}
10046 
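      // VectorLongToMask: materialize a vector mask from the bits of a long. When
      // the mask is carried as a boolean vector (isa_vectmask() == nullptr) the bits
      // are expanded into vector lanes; with EVEX predicate masks a kmov suffices
      // (see long_to_mask_evex below).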
10047 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10048   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10049   match(Set dst (VectorLongToMask src));
10050   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10051   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10052   ins_encode %{
10053     int mask_len = Matcher::vector_length(this);
10054     int vec_enc  = vector_length_encoding(mask_len);
10055     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10056                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10057   %}
10058   ins_pipe( pipe_slow );
10059 %}
10060 
10062 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10063   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10064   match(Set dst (VectorLongToMask src));
10065   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10066   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
10067   ins_encode %{
10068     int mask_len = Matcher::vector_length(this);
10069     assert(mask_len <= 32, "invalid mask length");
10070     int vec_enc  = vector_length_encoding(mask_len);
10071     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10072                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10073   %}
10074   ins_pipe( pipe_slow );
10075 %}
10076 
10077 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10078   predicate(n->bottom_type()->isa_vectmask());
10079   match(Set dst (VectorLongToMask src));
10080   format %{ "long_to_mask_evex $dst, $src\t!" %}
10081   ins_encode %{
10082     __ kmov($dst$$KRegister, $src$$Register);
10083   %}
10084   ins_pipe( pipe_slow );
10085 %}
10086 #endif
10087 
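      // Bitwise AND/OR/XOR of predicate (k) registers. Mask lengths below 16 are
      // widened to 16 when AVX512DQ is unavailable, since the byte-granular
      // k-register instructions require DQ.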
10088 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10089   match(Set dst (AndVMask src1 src2));
10090   match(Set dst (OrVMask src1 src2));
10091   match(Set dst (XorVMask src1 src2));
10092   effect(TEMP kscratch);
10093   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10094   ins_encode %{
10095     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10096     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10097     assert(Type::cmp(mask1->bottom_type(), mask2->bottom_type()) == 0, "mask operands must have the same type");
10098     uint masklen = Matcher::vector_length(this);
10099     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10100     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10101   %}
10102   ins_pipe( pipe_slow );
10103 %}
10104 
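      // Masked ternary logic: $func is the 8-bit truth table that vpternlog
      // evaluates bitwise over its three inputs, merging into $dst under $mask.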
10105 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10106   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10107   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10108   ins_encode %{
10109     int vlen_enc = vector_length_encoding(this);
10110     BasicType bt = Matcher::vector_element_basic_type(this);
10111     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10112                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10113   %}
10114   ins_pipe( pipe_slow );
10115 %}
10116 
10117 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10118   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10119   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10120   ins_encode %{
10121     int vlen_enc = vector_length_encoding(this);
10122     BasicType bt = Matcher::vector_element_basic_type(this);
10123     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10124                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10125   %}
10126   ins_pipe( pipe_slow );
10127 %}
10128 
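      // CastVV only retypes a vector or mask value at compile time, so the
      // instructions below encode nothing and carry zero size and cost.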
10129 instruct castMM(kReg dst)
10130 %{
10131   match(Set dst (CastVV dst));
10132 
10133   size(0);
10134   format %{ "# castVV of $dst" %}
10135   ins_encode(/* empty encoding */);
10136   ins_cost(0);
10137   ins_pipe(empty);
10138 %}
10139 
10140 instruct castVV(vec dst)
10141 %{
10142   match(Set dst (CastVV dst));
10143 
10144   size(0);
10145   format %{ "# castVV of $dst" %}
10146   ins_encode(/* empty encoding */);
10147   ins_cost(0);
10148   ins_pipe(empty);
10149 %}
10150 
10151 instruct castVVLeg(legVec dst)
10152 %{
10153   match(Set dst (CastVV dst));
10154 
10155   size(0);
10156   format %{ "# castVV of $dst" %}
10157   ins_encode(/* empty encoding */);
10158   ins_cost(0);
10159   ins_pipe(empty);
10160 %}
10161 
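      // IsInfinite via vfpclass: immediate 0x18 selects the +Infinity and -Infinity
      // classes; the resulting one-bit k-register is moved into the integer result.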
10162 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10163 %{
10164   match(Set dst (IsInfiniteF src));
10165   effect(TEMP ktmp, KILL cr);
10166   format %{ "float_class_check $dst, $src" %}
10167   ins_encode %{
10168     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10169     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10170   %}
10171   ins_pipe(pipe_slow);
10172 %}
10173 
10174 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10175 %{
10176   match(Set dst (IsInfiniteD src));
10177   effect(TEMP ktmp, KILL cr);
10178   format %{ "double_class_check $dst, $src" %}
10179   ins_encode %{
10180     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10181     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10182   %}
10183   ins_pipe(pipe_slow);
10184 %}