//
// Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
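//
// A sketch of the syntax (the RBP entry is hypothetical, included only to
// illustrate NS/SOE; it is not defined in this block):
//      reg_def XMM0(SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()); // save at calls
//      reg_def RBP (NS,  SOE, Op_RegP, 5, rbp->as_VMReg());  // save on entry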

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-(p).
// Word a in each register holds a Float; words ab hold a Double.
// The full registers are used by SSE4.2 intrinsics, array copy stubs and
// superword operations (see the UseSSE42Intrinsics, UseXMMForArrayCopy and
// UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   no registers are preserved across function calls;
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls;
//              XMM0-XMM3 might hold parameters
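//
// Note: despite the native ABIs above, every XMM slot below is declared
// SOC/SOC; the working assumption is that compiled Java code treats all
// XMM registers as volatile, leaving native-ABI callee-saving to stubs
// and native wrappers.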

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
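// The XMM0 block above describes one register as 16 consecutive 32-bit
// VMReg slots (suffixes b-p), covering the full 512-bit ZMM width; the
// same 16-slot pattern repeats for every XMM register below.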

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64
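// Note: the RFLAGS encodings above (16 on LP64, 8 on 32-bit) presumably
// just place the flags register after the platform's general registers;
// the ideal-register-type slot is 0 since flags are not spilled with
// ordinary Load/Store operations.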

// AVX3 (AVX-512) mask registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
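// Each mask register is modeled as two 32-bit VMReg slots (Kn plus Kn_H
// for the high half), i.e. a 64-bit value, wide enough for the 64-bit
// masks produced by AVX-512BW byte operations.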


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);
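// Note that k0 does not appear in the definitions and classes here; in
// EVEX encodings a mask specifier of k0 means "no masking", so it is not
// usable as an allocatable write mask.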

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
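// A reg_class_dynamic selects between its two underlying classes based on
// the trailing %{ ... %} predicate: the EVEX class (XMM0-XMM31) when it
// holds, otherwise the legacy class (XMM0-XMM15). The "_vl" variants also
// require AVX-512VL, the extension that allows XMM16-XMM31 to be encoded
// at sub-512-bit vector lengths.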

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
// Class for restricted 512bit vector registers (legacy set, XMM0-XMM15 only)
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
  static int emit_exception_handler(C2_MacroAssembler* masm);
 1191   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
    return 15 + 3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
  switch (bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   address base = __ start_a_stub(size_exception_handler());
 1314   if (base == nullptr) {
 1315     ciEnv::current()->record_failure("CodeCache is full");
 1316     return 0;  // CodeBuffer::expand failed
 1317   }
 1318   int offset = __ offset();
 1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1321   __ end_a_stub();
 1322   return offset;
 1323 }
 1324 
 1325 // Emit deopt handler code.
 1326 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1327 
 1328   // Note that the code buffer's insts_mark is always relative to insts.
 1329   // That's why we must use the macroassembler to generate a handler.
 1330   address base = __ start_a_stub(size_deopt_handler());
 1331   if (base == nullptr) {
 1332     ciEnv::current()->record_failure("CodeCache is full");
 1333     return 0;  // CodeBuffer::expand failed
 1334   }
 1335   int offset = __ offset();
 1336 
 1337 #ifdef _LP64
 1338   address the_pc = (address) __ pc();
 1339   Label next;
  // Push "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1342 
 1343   // push address of "next"
 1344   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1345   __ bind(next);
 1346   // adjust it so it matches "the_pc"
 1347   __ subptr(Address(rsp, 0), __ offset() - offset);
 1348 #else
 1349   InternalAddress here(__ pc());
 1350   __ pushptr(here.addr(), noreg);
 1351 #endif
 1352 
 1353   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1354   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1355   __ end_a_stub();
 1356   return offset;
 1357 }
 1358 
 1359 static Assembler::Width widthForType(BasicType bt) {
 1360   if (bt == T_BYTE) {
 1361     return Assembler::B;
 1362   } else if (bt == T_SHORT) {
 1363     return Assembler::W;
 1364   } else if (bt == T_INT) {
 1365     return Assembler::D;
 1366   } else {
 1367     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1368     return Assembler::Q;
 1369   }
 1370 }
 1371 
 1372 //=============================================================================
 1373 
 1374   // Float masks come from different places depending on platform.
 1375 #ifdef _LP64
 1376   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1377   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1378   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1379   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1380 #else
 1381   static address float_signmask()  { return (address)float_signmask_pool; }
 1382   static address float_signflip()  { return (address)float_signflip_pool; }
 1383   static address double_signmask() { return (address)double_signmask_pool; }
 1384   static address double_signflip() { return (address)double_signflip_pool; }
 1385 #endif
 1386   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1387   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1388   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1389   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1390   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1391   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1392   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1393   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1394   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1395   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1396   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1397   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1398   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1399   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1400   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1401 
 1402 //=============================================================================
 1403 bool Matcher::match_rule_supported(int opcode) {
 1404   if (!has_match_rule(opcode)) {
 1405     return false; // no match rule present
 1406   }
 1407   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1408   switch (opcode) {
 1409     case Op_AbsVL:
 1410     case Op_StoreVectorScatter:
 1411       if (UseAVX < 3) {
 1412         return false;
 1413       }
 1414       break;
 1415     case Op_PopCountI:
 1416     case Op_PopCountL:
 1417       if (!UsePopCountInstruction) {
 1418         return false;
 1419       }
 1420       break;
    case Op_PopCountVI:
 1426     case Op_CompressV:
 1427     case Op_ExpandV:
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
      if (!VM_Version::supports_avx512dq()) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
    case Op_StrIndexOf:
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
      if (!VM_Version::supports_on_spin_wait()) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
      if (!VM_Version::supports_avx()) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572     case Op_LoadVectorGatherMasked:
 1573       if (UseAVX < 2) {
 1574         return false;
 1575       }
 1576       break;
 1577     case Op_FmaF:
 1578     case Op_FmaD:
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
      if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
        return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
      if (UseAVX < 3 || !is_LP64 || !VM_Version::supports_avx512vl()) {
        return false;
      }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_SqrtF:
 1664       if (UseSSE < 1) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtD:
 1669 #ifdef _LP64
 1670       if (UseSSE < 2) {
 1671         return false;
 1672       }
 1673 #else
 1674       // x86_32.ad has a special match rule for SqrtD.
 1675       // Together with common x86 rules, this handles all UseSSE cases.
 1676 #endif
 1677       break;
 1678     case Op_ConvF2HF:
 1679     case Op_ConvHF2F:
 1680       if (!VM_Version::supports_float16()) {
 1681         return false;
 1682       }
 1683       break;
 1684     case Op_VectorCastF2HF:
 1685     case Op_VectorCastHF2F:
 1686       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1687         return false;
 1688       }
 1689       break;
 1690   }
 1691   return true;  // Match rules are supported by default.
 1692 }
 1693 
 1694 //------------------------------------------------------------------------
 1695 
 1696 static inline bool is_pop_count_instr_target(BasicType bt) {
 1697   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1698          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1699 }
 1700 
 1701 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1702   return match_rule_supported_vector(opcode, vlen, bt);
 1703 }
 1704 
// Identify extra cases for which we provide match rules for vector nodes and
// other intrinsics, guarded by vector length (vlen) and element type (bt).
 1707 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1708   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1709   if (!match_rule_supported(opcode)) {
 1710     return false;
 1711   }
 1712   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1713   //   * SSE2 supports 128bit vectors for all types;
 1714   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1715   //   * AVX2 supports 256bit vectors for all types;
 1716   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1717   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1718   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1719   // And MaxVectorSize is taken into account as well.
 1720   if (!vector_size_supported(bt, vlen)) {
 1721     return false;
 1722   }
 1723   // Special cases which require vector length follow:
 1724   //   * implementation limitations
 1725   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1726   //   * 128bit vroundpd instruction is present only in AVX1
 1727   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1728   switch (opcode) {
 1729     case Op_AbsVF:
 1730     case Op_NegVF:
      if ((vlen == 16) && !VM_Version::supports_avx512dq()) {
 1732         return false; // 512bit vandps and vxorps are not available
 1733       }
 1734       break;
 1735     case Op_AbsVD:
 1736     case Op_NegVD:
      if ((vlen == 8) && !VM_Version::supports_avx512dq()) {
 1738         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1739       }
 1740       break;
 1741     case Op_RotateRightV:
 1742     case Op_RotateLeftV:
 1743       if (bt != T_INT && bt != T_LONG) {
 1744         return false;
 1745       } // fallthrough
 1746     case Op_MacroLogicV:
 1747       if (!VM_Version::supports_evex() ||
 1748           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1749         return false;
 1750       }
 1751       break;
 1752     case Op_ClearArray:
 1753     case Op_VectorMaskGen:
 1754     case Op_VectorCmpMasked:
 1755       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1756         return false;
 1757       }
 1758       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1759         return false;
 1760       }
 1761       break;
 1762     case Op_LoadVectorMasked:
 1763     case Op_StoreVectorMasked:
 1764       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1765         return false;
 1766       }
 1767       break;
 1768     case Op_UMinV:
 1769     case Op_UMaxV:
 1770       if (UseAVX == 0) {
 1771         return false;
 1772       }
 1773       break;
 1774     case Op_MaxV:
 1775     case Op_MinV:
 1776       if (UseSSE < 4 && is_integral_type(bt)) {
 1777         return false;
 1778       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        // 512 bit Float/Double intrinsics need AVX512DQ
        if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) {
          return false;
        }
      }
 1788       break;
 1789     case Op_CallLeafVector:
 1790       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1791         return false;
 1792       }
 1793       break;
 1794     case Op_AddReductionVI:
 1795       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1796         return false;
 1797       }
 1798       // fallthrough
 1799     case Op_AndReductionV:
 1800     case Op_OrReductionV:
 1801     case Op_XorReductionV:
 1802       if (is_subword_type(bt) && (UseSSE < 4)) {
 1803         return false;
 1804       }
 1805 #ifndef _LP64
 1806       if (bt == T_BYTE || bt == T_LONG) {
 1807         return false;
 1808       }
 1809 #endif
 1810       break;
 1811 #ifndef _LP64
 1812     case Op_VectorInsert:
 1813       if (bt == T_LONG || bt == T_DOUBLE) {
 1814         return false;
 1815       }
 1816       break;
 1817 #endif
 1818     case Op_MinReductionV:
 1819     case Op_MaxReductionV:
 1820       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1821         return false;
 1822       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1823         return false;
 1824       }
 1825       // Float/Double intrinsics enabled for AVX family.
 1826       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1827         return false;
 1828       }
 1829       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1830         return false;
 1831       }
 1832 #ifndef _LP64
 1833       if (bt == T_BYTE || bt == T_LONG) {
 1834         return false;
 1835       }
 1836 #endif
 1837       break;
 1838     case Op_VectorTest:
 1839       if (UseSSE < 4) {
 1840         return false; // Implementation limitation
 1841       } else if (size_in_bits < 32) {
 1842         return false; // Implementation limitation
 1843       }
 1844       break;
 1845     case Op_VectorLoadShuffle:
 1846     case Op_VectorRearrange:
      if (vlen == 2) {
 1848         return false; // Implementation limitation due to how shuffle is loaded
 1849       } else if (size_in_bits == 256 && UseAVX < 2) {
 1850         return false; // Implementation limitation
 1851       }
 1852       break;
 1853     case Op_VectorLoadMask:
 1854     case Op_VectorMaskCast:
 1855       if (size_in_bits == 256 && UseAVX < 2) {
 1856         return false; // Implementation limitation
 1857       }
 1858       // fallthrough
 1859     case Op_VectorStoreMask:
 1860       if (vlen == 2) {
 1861         return false; // Implementation limitation
 1862       }
 1863       break;
 1864     case Op_PopulateIndex:
 1865       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1866         return false;
 1867       }
 1868       break;
 1869     case Op_VectorCastB2X:
 1870     case Op_VectorCastS2X:
 1871     case Op_VectorCastI2X:
 1872       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1873         return false;
 1874       }
 1875       break;
 1876     case Op_VectorCastL2X:
 1877       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1878         return false;
 1879       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1880         return false;
 1881       }
 1882       break;
 1883     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1887         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1888         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1889           return false;
 1890         }
 1891       }
 1892       // fallthrough
 1893     case Op_VectorCastD2X:
 1894       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1895         return false;
 1896       }
 1897       break;
 1898     case Op_VectorCastF2HF:
 1899     case Op_VectorCastHF2F:
 1900       if (!VM_Version::supports_f16c() &&
 1901          ((!VM_Version::supports_evex() ||
 1902          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1903         return false;
 1904       }
 1905       break;
 1906     case Op_RoundVD:
 1907       if (!VM_Version::supports_avx512dq()) {
 1908         return false;
 1909       }
 1910       break;
 1911     case Op_MulReductionVI:
 1912       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1913         return false;
 1914       }
 1915       break;
 1916     case Op_LoadVectorGatherMasked:
 1917       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1918         return false;
 1919       }
 1920       if (is_subword_type(bt) &&
 1921          (!is_LP64                                                ||
 1922          (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1923          (size_in_bits < 64)                                      ||
 1924          (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1925         return false;
 1926       }
 1927       break;
 1928     case Op_StoreVectorScatterMasked:
 1929     case Op_StoreVectorScatter:
 1930       if (is_subword_type(bt)) {
 1931         return false;
 1932       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1933         return false;
 1934       }
 1935       // fallthrough
 1936     case Op_LoadVectorGather:
 1937       if (!is_subword_type(bt) && size_in_bits == 64) {
 1938         return false;
 1939       }
 1940       if (is_subword_type(bt) && size_in_bits < 64) {
 1941         return false;
 1942       }
 1943       break;
 1944     case Op_SaturatingAddV:
 1945     case Op_SaturatingSubV:
 1946       if (UseAVX < 1) {
 1947         return false; // Implementation limitation
 1948       }
 1949       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1950         return false;
 1951       }
 1952       break;
 1953     case Op_SelectFromTwoVector:
      if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
        return false;
      }
      if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
        return false;
      }
      if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
        return false;
      }
      if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
        return false;
      }
      break;
 1967     case Op_MaskAll:
 1968       if (!VM_Version::supports_evex()) {
 1969         return false;
 1970       }
 1971       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1972         return false;
 1973       }
 1974       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1975         return false;
 1976       }
 1977       break;
 1978     case Op_VectorMaskCmp:
 1979       if (vlen < 2 || size_in_bits < 32) {
 1980         return false;
 1981       }
 1982       break;
 1983     case Op_CompressM:
 1984       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1985         return false;
 1986       }
 1987       break;
 1988     case Op_CompressV:
 1989     case Op_ExpandV:
 1990       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1991         return false;
 1992       }
 1993       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 1994         return false;
 1995       }
      if (size_in_bits < 128) {
        return false;
      }
      break;
 1999     case Op_VectorLongToMask:
 2000       if (UseAVX < 1 || !is_LP64) {
 2001         return false;
 2002       }
 2003       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 2004         return false;
 2005       }
 2006       break;
 2007     case Op_SignumVD:
 2008     case Op_SignumVF:
 2009       if (UseAVX < 1) {
 2010         return false;
 2011       }
 2012       break;
 2013     case Op_PopCountVI:
 2014     case Op_PopCountVL: {
 2015         if (!is_pop_count_instr_target(bt) &&
 2016             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 2017           return false;
 2018         }
 2019       }
 2020       break;
 2021     case Op_ReverseV:
 2022     case Op_ReverseBytesV:
    case Op_CountTrailingZerosV:
    case Op_CountLeadingZerosV:
 2029       if (UseAVX < 2) {
 2030         return false;
 2031       }
 2032       break;
 2033   }
  return true;  // Match rules are supported by default.
 2035 }
 2036 
 2037 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a
  // pattern based on the IR opcode. Most unary/binary/ternary masked operations
  // share the IR nodes of their non-masked counterparts, with the mask edge
  // being the differentiator. This routine therefore checks strictly for masked
  // operation patterns: it returns false for every opcode except the ones whose
  // masked instruction patterns are defined in this file.
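  // For example, a masked vector add arrives here as Op_AddVI with a mask edge
  // attached; Op_AddVI is accepted in the switch below, while any opcode not
  // listed falls through to the default case and returns false.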
 2044   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2045     return false;
 2046   }
 2047 
 2048   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2049   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2050   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2051     return false;
 2052   }
  switch (opcode) {
 2054     // Unary masked operations
 2055     case Op_AbsVB:
 2056     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      } // fallthrough
 2060     case Op_AbsVI:
 2061     case Op_AbsVL:
 2062       return true;
 2063 
 2064     // Ternary masked operations
 2065     case Op_FmaVF:
 2066     case Op_FmaVD:
 2067       return true;
 2068 
 2069     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2071         return false;
 2072       }
 2073       return true;
 2074 
 2075     // Binary masked operations
 2076     case Op_AddVB:
 2077     case Op_AddVS:
 2078     case Op_SubVB:
 2079     case Op_SubVS:
 2080     case Op_MulVS:
 2081     case Op_LShiftVS:
 2082     case Op_RShiftVS:
 2083     case Op_URShiftVS:
 2084       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2085       if (!VM_Version::supports_avx512bw()) {
 2086         return false;  // Implementation limitation
 2087       }
 2088       return true;
 2089 
 2090     case Op_MulVL:
 2091       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2092       if (!VM_Version::supports_avx512dq()) {
 2093         return false;  // Implementation limitation
 2094       }
 2095       return true;
 2096 
 2097     case Op_AndV:
 2098     case Op_OrV:
 2099     case Op_XorV:
 2100     case Op_RotateRightV:
 2101     case Op_RotateLeftV:
 2102       if (bt != T_INT && bt != T_LONG) {
 2103         return false; // Implementation limitation
 2104       }
 2105       return true;
 2106 
 2107     case Op_VectorLoadMask:
 2108       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2109       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2110         return false;
 2111       }
 2112       return true;
 2113 
 2114     case Op_AddVI:
 2115     case Op_AddVL:
 2116     case Op_AddVF:
 2117     case Op_AddVD:
 2118     case Op_SubVI:
 2119     case Op_SubVL:
 2120     case Op_SubVF:
 2121     case Op_SubVD:
 2122     case Op_MulVI:
 2123     case Op_MulVF:
 2124     case Op_MulVD:
 2125     case Op_DivVF:
 2126     case Op_DivVD:
 2127     case Op_SqrtVF:
 2128     case Op_SqrtVD:
 2129     case Op_LShiftVI:
 2130     case Op_LShiftVL:
 2131     case Op_RShiftVI:
 2132     case Op_RShiftVL:
 2133     case Op_URShiftVI:
 2134     case Op_URShiftVL:
 2135     case Op_LoadVectorMasked:
 2136     case Op_StoreVectorMasked:
 2137     case Op_LoadVectorGatherMasked:
 2138     case Op_StoreVectorScatterMasked:
 2139       return true;
 2140 
 2141     case Op_UMinV:
 2142     case Op_UMaxV:
 2143       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2144         return false;
 2145       } // fallthrough
 2146     case Op_MaxV:
 2147     case Op_MinV:
 2148       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2149         return false; // Implementation limitation
 2150       }
 2151       if (is_floating_point_type(bt)) {
 2152         return false; // Implementation limitation
 2153       }
 2154       return true;
 2155     case Op_SaturatingAddV:
 2156     case Op_SaturatingSubV:
 2157       if (!is_subword_type(bt)) {
 2158         return false;
 2159       }
 2160       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2161         return false; // Implementation limitation
 2162       }
 2163       return true;
 2164 
 2165     case Op_VectorMaskCmp:
 2166       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2167         return false; // Implementation limitation
 2168       }
 2169       return true;
 2170 
 2171     case Op_VectorRearrange:
 2172       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2173         return false; // Implementation limitation
 2174       }
 2175       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2176         return false; // Implementation limitation
 2177       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2178         return false; // Implementation limitation
 2179       }
 2180       return true;
 2181 
 2182     // Binary Logical operations
 2183     case Op_AndVMask:
 2184     case Op_OrVMask:
 2185     case Op_XorVMask:
 2186       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2187         return false; // Implementation limitation
 2188       }
 2189       return true;
 2190 
 2191     case Op_PopCountVI:
 2192     case Op_PopCountVL:
 2193       if (!is_pop_count_instr_target(bt)) {
 2194         return false;
 2195       }
 2196       return true;
 2197 
 2198     case Op_MaskAll:
 2199       return true;
 2200 
 2201     case Op_CountLeadingZerosV:
 2202       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2203         return true;
 2204       }
 2205     default:
 2206       return false;
 2207   }
 2208 }
 2209 
 2210 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2211   return false;
 2212 }
 2213 
 2214 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2215   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2216   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2217   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2218       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2219     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2220     return new legVecZOper();
 2221   }
 2222   if (legacy) {
 2223     switch (ideal_reg) {
 2224       case Op_VecS: return new legVecSOper();
 2225       case Op_VecD: return new legVecDOper();
 2226       case Op_VecX: return new legVecXOper();
 2227       case Op_VecY: return new legVecYOper();
 2228       case Op_VecZ: return new legVecZOper();
 2229     }
 2230   } else {
 2231     switch (ideal_reg) {
 2232       case Op_VecS: return new vecSOper();
 2233       case Op_VecD: return new vecDOper();
 2234       case Op_VecX: return new vecXOper();
 2235       case Op_VecY: return new vecYOper();
 2236       case Op_VecZ: return new vecZOper();
 2237     }
 2238   }
 2239   ShouldNotReachHere();
 2240   return nullptr;
 2241 }
 2242 
 2243 bool Matcher::is_reg2reg_move(MachNode* m) {
 2244   switch (m->rule()) {
 2245     case MoveVec2Leg_rule:
 2246     case MoveLeg2Vec_rule:
 2247     case MoveF2VL_rule:
 2248     case MoveF2LEG_rule:
 2249     case MoveVL2F_rule:
 2250     case MoveLEG2F_rule:
 2251     case MoveD2VL_rule:
 2252     case MoveD2LEG_rule:
 2253     case MoveVL2D_rule:
 2254     case MoveLEG2D_rule:
 2255       return true;
 2256     default:
 2257       return false;
 2258   }
 2259 }
 2260 
 2261 bool Matcher::is_generic_vector(MachOper* opnd) {
 2262   switch (opnd->opcode()) {
 2263     case VEC:
 2264     case LEGVEC:
 2265       return true;
 2266     default:
 2267       return false;
 2268   }
 2269 }
 2270 
 2271 //------------------------------------------------------------------------
 2272 
 2273 const RegMask* Matcher::predicate_reg_mask(void) {
 2274   return &_VECTMASK_REG_mask;
 2275 }
 2276 
 2277 // Max vector size in bytes. 0 if not supported.
 2278 int Matcher::vector_width_in_bytes(BasicType bt) {
 2279   assert(is_java_primitive(bt), "only primitive type vectors");
 2280   if (UseSSE < 2) return 0;
 2281   // SSE2 supports 128bit vectors for all types.
 2282   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
 2284   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
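  // e.g. UseAVX == 2 -> (1 << 2) * 8 = 32 bytes; UseAVX == 3 -> 64 bytes; otherwise 16.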
 2285   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2286   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2287     size = (UseAVX > 2) ? 64 : 32;
 2288   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2289     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2290   // Use flag to limit vector size.
 2291   size = MIN2(size,(int)MaxVectorSize);
 2292   // Minimum 2 values in vector (or 4 for bytes).
 2293   switch (bt) {
 2294   case T_DOUBLE:
 2295   case T_LONG:
 2296     if (size < 16) return 0;
 2297     break;
 2298   case T_FLOAT:
 2299   case T_INT:
 2300     if (size < 8) return 0;
 2301     break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
 2314   default:
 2315     ShouldNotReachHere();
 2316   }
 2317   return size;
 2318 }
 2319 
 2320 // Limits on vector size (number of elements) loaded into vector.
 2321 int Matcher::max_vector_size(const BasicType bt) {
 2322   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2323 }
 2324 int Matcher::min_vector_size(const BasicType bt) {
 2325   int max_size = max_vector_size(bt);
  // Minimum number of elements: 4 for byte-sized types (4 bytes total), otherwise 2.
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element double vectors to support SVML double64 calls.
 2329   if (bt == T_DOUBLE) {
 2330     size = 1;
 2331   }
 2332   return MIN2(size,max_size);
 2333 }
 2334 
 2335 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2336   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2337   // by default on Cascade Lake
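  // (e.g. for T_INT this caps auto-vectorization at 32 / 4 = 8 elements).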
 2338   if (VM_Version::is_default_intel_cascade_lake()) {
 2339     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2340   }
 2341   return Matcher::max_vector_size(bt);
 2342 }
 2343 
 2344 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2345   return -1;
 2346 }
 2347 
 2348 // Vector ideal reg corresponding to specified size in bytes
 2349 uint Matcher::vector_ideal_reg(int size) {
 2350   assert(MaxVectorSize >= size, "");
 2351   switch(size) {
 2352     case  4: return Op_VecS;
 2353     case  8: return Op_VecD;
 2354     case 16: return Op_VecX;
 2355     case 32: return Op_VecY;
 2356     case 64: return Op_VecZ;
 2357   }
 2358   ShouldNotReachHere();
 2359   return 0;
 2360 }
 2361 
 2362 // Check for shift by small constant as well
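// e.g. (AddP base (LShiftX idx (ConI 2))) can be subsumed into a [base + idx*4]
// addressing mode, so the shift is cloned rather than computed into a register.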
 2363 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2364   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2365       shift->in(2)->get_int() <= 3 &&
 2366       // Are there other uses besides address expressions?
 2367       !matcher->is_visited(shift)) {
 2368     address_visited.set(shift->_idx); // Flag as address_visited
 2369     mstack.push(shift->in(2), Matcher::Visit);
 2370     Node *conv = shift->in(1);
 2371 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses
    // the ConvI2L operation for an array index on LP64
    // if the index value is positive.
 2375     if (conv->Opcode() == Op_ConvI2L &&
 2376         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2377         // Are there other uses besides address expressions?
 2378         !matcher->is_visited(conv)) {
 2379       address_visited.set(conv->_idx); // Flag as address_visited
 2380       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2381     } else
 2382 #endif
 2383       mstack.push(conv, Matcher::Pre_Visit);
 2384     return true;
 2385   }
 2386   return false;
 2387 }
 2388 
 2389 // This function identifies sub-graphs in which a 'load' node is
 2390 // input to two different nodes, and such that it can be matched
 2391 // with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2393 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2394 // refers to the same node.
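// The other BMI1 patterns handled below are:
//   blsr:   x & (x - 1), i.e. (AndL LoadL* (AddL LoadL* ConL(-1)))
//   blsmsk: x ^ (x - 1), i.e. (XorL LoadL* (AddL LoadL* ConL(-1)))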
 2395 //
 2396 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2397 // This is a temporary solution until we make DAGs expressible in ADL.
 2398 template<typename ConType>
 2399 class FusedPatternMatcher {
 2400   Node* _op1_node;
 2401   Node* _mop_node;
 2402   int _con_op;
 2403 
 2404   static int match_next(Node* n, int next_op, int next_op_idx) {
 2405     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2406       return -1;
 2407     }
 2408 
 2409     if (next_op_idx == -1) { // n is commutative, try rotations
 2410       if (n->in(1)->Opcode() == next_op) {
 2411         return 1;
 2412       } else if (n->in(2)->Opcode() == next_op) {
 2413         return 2;
 2414       }
 2415     } else {
 2416       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2417       if (n->in(next_op_idx)->Opcode() == next_op) {
 2418         return next_op_idx;
 2419       }
 2420     }
 2421     return -1;
 2422   }
 2423 
 2424  public:
 2425   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2426     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2427 
 2428   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2429              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2430              typename ConType::NativeType con_value) {
 2431     if (_op1_node->Opcode() != op1) {
 2432       return false;
 2433     }
 2434     if (_mop_node->outcnt() > 2) {
 2435       return false;
 2436     }
 2437     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2438     if (op1_op2_idx == -1) {
 2439       return false;
 2440     }
 2441     // Memory operation must be the other edge
 2442     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2443 
 2444     // Check that the mop node is really what we want
 2445     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2446       Node* op2_node = _op1_node->in(op1_op2_idx);
 2447       if (op2_node->outcnt() > 1) {
 2448         return false;
 2449       }
 2450       assert(op2_node->Opcode() == op2, "Should be");
 2451       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2452       if (op2_con_idx == -1) {
 2453         return false;
 2454       }
 2455       // Memory operation must be the other edge
 2456       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2457       // Check that the memory operation is the same node
 2458       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2459         // Now check the constant
 2460         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2461         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2462           return true;
 2463         }
 2464       }
 2465     }
 2466     return false;
 2467   }
 2468 };
 2469 
 2470 static bool is_bmi_pattern(Node* n, Node* m) {
 2471   assert(UseBMI1Instructions, "sanity");
 2472   if (n != nullptr && m != nullptr) {
 2473     if (m->Opcode() == Op_LoadI) {
 2474       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2475       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2476              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2477              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2478     } else if (m->Opcode() == Op_LoadL) {
 2479       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2480       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2481              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2482              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2483     }
 2484   }
 2485   return false;
 2486 }
 2487 
 2488 // Should the matcher clone input 'm' of node 'n'?
 2489 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2490   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2491   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2492     mstack.push(m, Visit);
 2493     return true;
 2494   }
 2495   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2496     mstack.push(m, Visit);           // m = ShiftCntV
 2497     return true;
 2498   }
 2499   if (is_encode_and_store_pattern(n, m)) {
 2500     mstack.push(m, Visit);
 2501     return true;
 2502   }
 2503   return false;
 2504 }
 2505 
 2506 // Should the Matcher clone shifts on addressing modes, expecting them
 2507 // to be subsumed into complex addressing expressions or compute them
 2508 // into registers?
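// e.g. an (AddP base (AddP base (LShiftX idx (ConI 2))) (ConI disp)) subtree can be
// subsumed into a single [base + idx*4 + disp] operand, so its pieces are cloned
// to each use instead of being computed into a register.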
 2509 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2510   Node *off = m->in(AddPNode::Offset);
 2511   if (off->is_Con()) {
 2512     address_visited.test_set(m->_idx); // Flag as address_visited
 2513     Node *adr = m->in(AddPNode::Address);
 2514 
 2515     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2516     // AtomicAdd is not an addressing expression.
 2517     // Cheap to find it by looking for screwy base.
 2518     if (adr->is_AddP() &&
 2519         !adr->in(AddPNode::Base)->is_top() &&
 2520         !adr->in(AddPNode::Offset)->is_Con() &&
 2521         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2522         // Are there other uses besides address expressions?
 2523         !is_visited(adr)) {
 2524       address_visited.set(adr->_idx); // Flag as address_visited
 2525       Node *shift = adr->in(AddPNode::Offset);
 2526       if (!clone_shift(shift, this, mstack, address_visited)) {
 2527         mstack.push(shift, Pre_Visit);
 2528       }
 2529       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2530       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2531     } else {
 2532       mstack.push(adr, Pre_Visit);
 2533     }
 2534 
 2535     // Clone X+offset as it also folds into most addressing expressions
 2536     mstack.push(off, Visit);
 2537     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2538     return true;
 2539   } else if (clone_shift(off, this, mstack, address_visited)) {
 2540     address_visited.test_set(m->_idx); // Flag as address_visited
 2541     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2542     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2543     return true;
 2544   }
 2545   return false;
 2546 }
 2547 
 2548 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2549   switch (bt) {
 2550     case BoolTest::eq:
 2551       return Assembler::eq;
 2552     case BoolTest::ne:
 2553       return Assembler::neq;
 2554     case BoolTest::le:
 2555     case BoolTest::ule:
 2556       return Assembler::le;
 2557     case BoolTest::ge:
 2558     case BoolTest::uge:
 2559       return Assembler::nlt;
 2560     case BoolTest::lt:
 2561     case BoolTest::ult:
 2562       return Assembler::lt;
 2563     case BoolTest::gt:
 2564     case BoolTest::ugt:
 2565       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2567   }
 2568 }
 2569 
 2570 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2571   switch (bt) {
 2572   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2573   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2574   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2575   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2576   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2577   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2578   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2579   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2580   }
 2581 }
 2582 
 2583 // Helper methods for MachSpillCopyNode::implementation().
 2584 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2585                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2586   assert(ireg == Op_VecS || // 32bit vector
 2587          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2588           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2589          "no non-adjacent vector moves" );
 2590   if (masm) {
 2591     switch (ireg) {
 2592     case Op_VecS: // copy whole register
 2593     case Op_VecD:
 2594     case Op_VecX:
 2595 #ifndef _LP64
 2596       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2597 #else
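      // Without AVX512VL (e.g. KNL) there is no 128-bit EVEX move, so copy lane 0
      // of the 512-bit register with vextractf32x4 instead of movdqu.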
 2598       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2599         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2600       } else {
 2601         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2602      }
 2603 #endif
 2604       break;
 2605     case Op_VecY:
 2606 #ifndef _LP64
 2607       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2608 #else
 2609       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2610         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2611       } else {
 2612         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2613      }
 2614 #endif
 2615       break;
 2616     case Op_VecZ:
 2617       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2618       break;
 2619     default:
 2620       ShouldNotReachHere();
 2621     }
 2622 #ifndef PRODUCT
 2623   } else {
 2624     switch (ireg) {
 2625     case Op_VecS:
 2626     case Op_VecD:
 2627     case Op_VecX:
 2628       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2629       break;
 2630     case Op_VecY:
 2631     case Op_VecZ:
 2632       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2633       break;
 2634     default:
 2635       ShouldNotReachHere();
 2636     }
 2637 #endif
 2638   }
 2639 }
 2640 
 2641 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2642                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2643   if (masm) {
 2644     if (is_load) {
 2645       switch (ireg) {
 2646       case Op_VecS:
 2647         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2648         break;
 2649       case Op_VecD:
 2650         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2651         break;
 2652       case Op_VecX:
 2653 #ifndef _LP64
 2654         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2655 #else
 2656         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2657           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2658         } else {
 2659           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2661         }
 2662 #endif
 2663         break;
 2664       case Op_VecY:
 2665 #ifndef _LP64
 2666         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2667 #else
 2668         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2669           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2670         } else {
 2671           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2673         }
 2674 #endif
 2675         break;
 2676       case Op_VecZ:
 2677         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2678         break;
 2679       default:
 2680         ShouldNotReachHere();
 2681       }
 2682     } else { // store
 2683       switch (ireg) {
 2684       case Op_VecS:
 2685         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2686         break;
 2687       case Op_VecD:
 2688         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2689         break;
 2690       case Op_VecX:
 2691 #ifndef _LP64
 2692         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2693 #else
 2694         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2695           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2696         }
 2697         else {
 2698           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2699         }
 2700 #endif
 2701         break;
 2702       case Op_VecY:
 2703 #ifndef _LP64
 2704         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2705 #else
 2706         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2707           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2708         }
 2709         else {
 2710           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2711         }
 2712 #endif
 2713         break;
 2714       case Op_VecZ:
 2715         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2716         break;
 2717       default:
 2718         ShouldNotReachHere();
 2719       }
 2720     }
 2721 #ifndef PRODUCT
 2722   } else {
 2723     if (is_load) {
 2724       switch (ireg) {
 2725       case Op_VecS:
 2726         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2727         break;
 2728       case Op_VecD:
 2729         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2730         break;
      case Op_VecX:
 2732         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2733         break;
 2734       case Op_VecY:
 2735       case Op_VecZ:
 2736         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2737         break;
 2738       default:
 2739         ShouldNotReachHere();
 2740       }
 2741     } else { // store
 2742       switch (ireg) {
 2743       case Op_VecS:
 2744         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2745         break;
 2746       case Op_VecD:
 2747         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2748         break;
      case Op_VecX:
 2750         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2751         break;
 2752       case Op_VecY:
 2753       case Op_VecZ:
 2754         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2755         break;
 2756       default:
 2757         ShouldNotReachHere();
 2758       }
 2759     }
 2760 #endif
 2761   }
 2762 }
 2763 
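// Build the constant-table value for a replicated vector immediate: an array of
// 'len' copies of 'con', stored with the element width of 'bt'.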
 2764 template <class T>
 2765 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2766   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2767   jvalue ele;
 2768   switch (bt) {
 2769     case T_BYTE:   ele.b = con; break;
 2770     case T_SHORT:  ele.s = con; break;
 2771     case T_INT:    ele.i = con; break;
 2772     case T_LONG:   ele.j = con; break;
 2773     case T_FLOAT:  ele.f = con; break;
 2774     case T_DOUBLE: ele.d = con; break;
 2775     default: ShouldNotReachHere();
 2776   }
 2777   for (int i = 0; i < len; i++) {
 2778     val->append(ele);
 2779   }
 2780   return val;
 2781 }
 2782 
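// Returns a 64-bit pattern with the sign (highest) bit of every 'bt'-sized lane
// set, e.g. T_SHORT -> 0x8000 in each of the four 16-bit lanes.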
 2783 static inline jlong high_bit_set(BasicType bt) {
 2784   switch (bt) {
 2785     case T_BYTE:  return 0x8080808080808080;
 2786     case T_SHORT: return 0x8000800080008000;
 2787     case T_INT:   return 0x8000000080000000;
 2788     case T_LONG:  return 0x8000000000000000;
 2789     default:
 2790       ShouldNotReachHere();
 2791       return 0;
 2792   }
 2793 }
 2794 
 2795 #ifndef PRODUCT
 2796   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2797     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2798   }
 2799 #endif
 2800 
 2801   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2802     __ nop(_count);
 2803   }
 2804 
 2805   uint MachNopNode::size(PhaseRegAlloc*) const {
 2806     return _count;
 2807   }
 2808 
 2809 #ifndef PRODUCT
 2810   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2811     st->print("# breakpoint");
 2812   }
 2813 #endif
 2814 
 2815   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2816     __ int3();
 2817   }
 2818 
 2819   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2820     return MachNode::size(ra_);
 2821   }
 2822 
 2823 %}
 2824 
 2825 encode %{
 2826 
 2827   enc_class call_epilog %{
 2828     if (VerifyStackAtCalls) {
 2829       // Check that stack depth is unchanged: find majik cookie on stack
 2830       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2831       Label L;
 2832       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2833       __ jccb(Assembler::equal, L);
 2834       // Die if stack mismatch
 2835       __ int3();
 2836       __ bind(L);
 2837     }
 2838     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
      // The last return value is not set by the callee but is used to pass IsInit information to compiled code.
      // Search for the corresponding projection, get the register and emit code that initializes it.
 2841       uint con = (tf()->range_cc()->cnt() - 1);
 2842       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2843         ProjNode* proj = fast_out(i)->as_Proj();
 2844         if (proj->_con == con) {
 2845           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2846           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2847           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2848           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2849           __ testq(rax, rax);
 2850           __ setb(Assembler::notZero, toReg);
 2851           __ movzbl(toReg, toReg);
 2852           if (reg->is_stack()) {
 2853             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2854             __ movq(Address(rsp, st_off), toReg);
 2855           }
 2856           break;
 2857         }
 2858       }
 2859       if (return_value_is_used()) {
        // An inline type is returned as fields in multiple registers.
        // rax either contains an oop (if the inline type is buffered) or a pointer
        // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
        // if the lowest bit is set, so that C2 can use the oop after null checking.
        // rax &= (rax & 1) - 1
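        // If the low bit is set, (rax & 1) - 1 == 0 and rax is zeroed; otherwise
        // it is -1 (all ones) and rax is left unchanged.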
 2865         __ movptr(rscratch1, rax);
 2866         __ andptr(rscratch1, 0x1);
 2867         __ subptr(rscratch1, 0x1);
 2868         __ andptr(rax, rscratch1);
 2869       }
 2870     }
 2871   %}
 2872 
 2873 %}
 2874 
// Operands for bound floating-point register arguments
 2876 operand rxmm0() %{
 2877   constraint(ALLOC_IN_RC(xmm0_reg));
 2878   match(VecX);
  format %{ %}
 2880   interface(REG_INTER);
 2881 %}
 2882 
 2883 //----------OPERANDS-----------------------------------------------------------
 2884 // Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
 2886 // instruction definitions.
 2887 
 2888 // Vectors
 2889 
 2890 // Dummy generic vector class. Should be used for all vector operands.
 2891 // Replaced with vec[SDXYZ] during post-selection pass.
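// (specialization happens in Matcher::pd_specialize_generic_vector_operand above).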
 2892 operand vec() %{
 2893   constraint(ALLOC_IN_RC(dynamic));
 2894   match(VecX);
 2895   match(VecY);
 2896   match(VecZ);
 2897   match(VecS);
 2898   match(VecD);
 2899 
 2900   format %{ %}
 2901   interface(REG_INTER);
 2902 %}
 2903 
 2904 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2905 // Replaced with legVec[SDXYZ] during post-selection cleanup.
// Note: the legacy register class is used to avoid extra runtime code
// generation via reg_class_dynamic, which is unneeded in the 32-bit VM.
 2908 operand legVec() %{
 2909   constraint(ALLOC_IN_RC(dynamic));
 2910   match(VecX);
 2911   match(VecY);
 2912   match(VecZ);
 2913   match(VecS);
 2914   match(VecD);
 2915 
 2916   format %{ %}
 2917   interface(REG_INTER);
 2918 %}
 2919 
 2920 // Replaces vec during post-selection cleanup. See above.
 2921 operand vecS() %{
 2922   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2923   match(VecS);
 2924 
 2925   format %{ %}
 2926   interface(REG_INTER);
 2927 %}
 2928 
 2929 // Replaces legVec during post-selection cleanup. See above.
 2930 operand legVecS() %{
 2931   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2932   match(VecS);
 2933 
 2934   format %{ %}
 2935   interface(REG_INTER);
 2936 %}
 2937 
 2938 // Replaces vec during post-selection cleanup. See above.
 2939 operand vecD() %{
 2940   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2941   match(VecD);
 2942 
 2943   format %{ %}
 2944   interface(REG_INTER);
 2945 %}
 2946 
 2947 // Replaces legVec during post-selection cleanup. See above.
 2948 operand legVecD() %{
 2949   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2950   match(VecD);
 2951 
 2952   format %{ %}
 2953   interface(REG_INTER);
 2954 %}
 2955 
 2956 // Replaces vec during post-selection cleanup. See above.
 2957 operand vecX() %{
 2958   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2959   match(VecX);
 2960 
 2961   format %{ %}
 2962   interface(REG_INTER);
 2963 %}
 2964 
 2965 // Replaces legVec during post-selection cleanup. See above.
 2966 operand legVecX() %{
 2967   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2968   match(VecX);
 2969 
 2970   format %{ %}
 2971   interface(REG_INTER);
 2972 %}
 2973 
 2974 // Replaces vec during post-selection cleanup. See above.
 2975 operand vecY() %{
 2976   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2977   match(VecY);
 2978 
 2979   format %{ %}
 2980   interface(REG_INTER);
 2981 %}
 2982 
 2983 // Replaces legVec during post-selection cleanup. See above.
 2984 operand legVecY() %{
 2985   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2986   match(VecY);
 2987 
 2988   format %{ %}
 2989   interface(REG_INTER);
 2990 %}
 2991 
 2992 // Replaces vec during post-selection cleanup. See above.
 2993 operand vecZ() %{
 2994   constraint(ALLOC_IN_RC(vectorz_reg));
 2995   match(VecZ);
 2996 
 2997   format %{ %}
 2998   interface(REG_INTER);
 2999 %}
 3000 
 3001 // Replaces legVec during post-selection cleanup. See above.
 3002 operand legVecZ() %{
 3003   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 3004   match(VecZ);
 3005 
 3006   format %{ %}
 3007   interface(REG_INTER);
 3008 %}
 3009 
 3010 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 3011 
 3012 // ============================================================================
 3013 
 3014 instruct ShouldNotReachHere() %{
 3015   match(Halt);
 3016   format %{ "stop\t# ShouldNotReachHere" %}
 3017   ins_encode %{
 3018     if (is_reachable()) {
 3019       __ stop(_halt_reason);
 3020     }
 3021   %}
 3022   ins_pipe(pipe_slow);
 3023 %}
 3024 
 3025 // ============================================================================
 3026 
 3027 instruct addF_reg(regF dst, regF src) %{
 3028   predicate((UseSSE>=1) && (UseAVX == 0));
 3029   match(Set dst (AddF dst src));
 3030 
 3031   format %{ "addss   $dst, $src" %}
 3032   ins_cost(150);
 3033   ins_encode %{
 3034     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3035   %}
 3036   ins_pipe(pipe_slow);
 3037 %}
 3038 
 3039 instruct addF_mem(regF dst, memory src) %{
 3040   predicate((UseSSE>=1) && (UseAVX == 0));
 3041   match(Set dst (AddF dst (LoadF src)));
 3042 
 3043   format %{ "addss   $dst, $src" %}
 3044   ins_cost(150);
 3045   ins_encode %{
 3046     __ addss($dst$$XMMRegister, $src$$Address);
 3047   %}
 3048   ins_pipe(pipe_slow);
 3049 %}
 3050 
 3051 instruct addF_imm(regF dst, immF con) %{
 3052   predicate((UseSSE>=1) && (UseAVX == 0));
 3053   match(Set dst (AddF dst con));
 3054   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3055   ins_cost(150);
 3056   ins_encode %{
 3057     __ addss($dst$$XMMRegister, $constantaddress($con));
 3058   %}
 3059   ins_pipe(pipe_slow);
 3060 %}
 3061 
 3062 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3063   predicate(UseAVX > 0);
 3064   match(Set dst (AddF src1 src2));
 3065 
 3066   format %{ "vaddss  $dst, $src1, $src2" %}
 3067   ins_cost(150);
 3068   ins_encode %{
 3069     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3070   %}
 3071   ins_pipe(pipe_slow);
 3072 %}
 3073 
 3074 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3075   predicate(UseAVX > 0);
 3076   match(Set dst (AddF src1 (LoadF src2)));
 3077 
 3078   format %{ "vaddss  $dst, $src1, $src2" %}
 3079   ins_cost(150);
 3080   ins_encode %{
 3081     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3082   %}
 3083   ins_pipe(pipe_slow);
 3084 %}
 3085 
 3086 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3087   predicate(UseAVX > 0);
 3088   match(Set dst (AddF src con));
 3089 
 3090   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3091   ins_cost(150);
 3092   ins_encode %{
 3093     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3094   %}
 3095   ins_pipe(pipe_slow);
 3096 %}
 3097 
 3098 instruct addD_reg(regD dst, regD src) %{
 3099   predicate((UseSSE>=2) && (UseAVX == 0));
 3100   match(Set dst (AddD dst src));
 3101 
 3102   format %{ "addsd   $dst, $src" %}
 3103   ins_cost(150);
 3104   ins_encode %{
 3105     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3106   %}
 3107   ins_pipe(pipe_slow);
 3108 %}
 3109 
 3110 instruct addD_mem(regD dst, memory src) %{
 3111   predicate((UseSSE>=2) && (UseAVX == 0));
 3112   match(Set dst (AddD dst (LoadD src)));
 3113 
 3114   format %{ "addsd   $dst, $src" %}
 3115   ins_cost(150);
 3116   ins_encode %{
 3117     __ addsd($dst$$XMMRegister, $src$$Address);
 3118   %}
 3119   ins_pipe(pipe_slow);
 3120 %}
 3121 
 3122 instruct addD_imm(regD dst, immD con) %{
 3123   predicate((UseSSE>=2) && (UseAVX == 0));
 3124   match(Set dst (AddD dst con));
 3125   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3126   ins_cost(150);
 3127   ins_encode %{
 3128     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3129   %}
 3130   ins_pipe(pipe_slow);
 3131 %}
 3132 
 3133 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3134   predicate(UseAVX > 0);
 3135   match(Set dst (AddD src1 src2));
 3136 
 3137   format %{ "vaddsd  $dst, $src1, $src2" %}
 3138   ins_cost(150);
 3139   ins_encode %{
 3140     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3141   %}
 3142   ins_pipe(pipe_slow);
 3143 %}
 3144 
 3145 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3146   predicate(UseAVX > 0);
 3147   match(Set dst (AddD src1 (LoadD src2)));
 3148 
 3149   format %{ "vaddsd  $dst, $src1, $src2" %}
 3150   ins_cost(150);
 3151   ins_encode %{
 3152     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3153   %}
 3154   ins_pipe(pipe_slow);
 3155 %}
 3156 
 3157 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3158   predicate(UseAVX > 0);
 3159   match(Set dst (AddD src con));
 3160 
 3161   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3162   ins_cost(150);
 3163   ins_encode %{
 3164     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3165   %}
 3166   ins_pipe(pipe_slow);
 3167 %}
 3168 
 3169 instruct subF_reg(regF dst, regF src) %{
 3170   predicate((UseSSE>=1) && (UseAVX == 0));
 3171   match(Set dst (SubF dst src));
 3172 
 3173   format %{ "subss   $dst, $src" %}
 3174   ins_cost(150);
 3175   ins_encode %{
 3176     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3177   %}
 3178   ins_pipe(pipe_slow);
 3179 %}
 3180 
 3181 instruct subF_mem(regF dst, memory src) %{
 3182   predicate((UseSSE>=1) && (UseAVX == 0));
 3183   match(Set dst (SubF dst (LoadF src)));
 3184 
 3185   format %{ "subss   $dst, $src" %}
 3186   ins_cost(150);
 3187   ins_encode %{
 3188     __ subss($dst$$XMMRegister, $src$$Address);
 3189   %}
 3190   ins_pipe(pipe_slow);
 3191 %}
 3192 
 3193 instruct subF_imm(regF dst, immF con) %{
 3194   predicate((UseSSE>=1) && (UseAVX == 0));
 3195   match(Set dst (SubF dst con));
 3196   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3197   ins_cost(150);
 3198   ins_encode %{
 3199     __ subss($dst$$XMMRegister, $constantaddress($con));
 3200   %}
 3201   ins_pipe(pipe_slow);
 3202 %}
 3203 
 3204 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3205   predicate(UseAVX > 0);
 3206   match(Set dst (SubF src1 src2));
 3207 
 3208   format %{ "vsubss  $dst, $src1, $src2" %}
 3209   ins_cost(150);
 3210   ins_encode %{
 3211     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3212   %}
 3213   ins_pipe(pipe_slow);
 3214 %}
 3215 
 3216 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3217   predicate(UseAVX > 0);
 3218   match(Set dst (SubF src1 (LoadF src2)));
 3219 
 3220   format %{ "vsubss  $dst, $src1, $src2" %}
 3221   ins_cost(150);
 3222   ins_encode %{
 3223     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3224   %}
 3225   ins_pipe(pipe_slow);
 3226 %}
 3227 
 3228 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3229   predicate(UseAVX > 0);
 3230   match(Set dst (SubF src con));
 3231 
 3232   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3233   ins_cost(150);
 3234   ins_encode %{
 3235     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3236   %}
 3237   ins_pipe(pipe_slow);
 3238 %}
 3239 
 3240 instruct subD_reg(regD dst, regD src) %{
 3241   predicate((UseSSE>=2) && (UseAVX == 0));
 3242   match(Set dst (SubD dst src));
 3243 
 3244   format %{ "subsd   $dst, $src" %}
 3245   ins_cost(150);
 3246   ins_encode %{
 3247     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3248   %}
 3249   ins_pipe(pipe_slow);
 3250 %}
 3251 
 3252 instruct subD_mem(regD dst, memory src) %{
 3253   predicate((UseSSE>=2) && (UseAVX == 0));
 3254   match(Set dst (SubD dst (LoadD src)));
 3255 
 3256   format %{ "subsd   $dst, $src" %}
 3257   ins_cost(150);
 3258   ins_encode %{
 3259     __ subsd($dst$$XMMRegister, $src$$Address);
 3260   %}
 3261   ins_pipe(pipe_slow);
 3262 %}
 3263 
 3264 instruct subD_imm(regD dst, immD con) %{
 3265   predicate((UseSSE>=2) && (UseAVX == 0));
 3266   match(Set dst (SubD dst con));
 3267   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3268   ins_cost(150);
 3269   ins_encode %{
 3270     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3271   %}
 3272   ins_pipe(pipe_slow);
 3273 %}
 3274 
 3275 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3276   predicate(UseAVX > 0);
 3277   match(Set dst (SubD src1 src2));
 3278 
 3279   format %{ "vsubsd  $dst, $src1, $src2" %}
 3280   ins_cost(150);
 3281   ins_encode %{
 3282     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3283   %}
 3284   ins_pipe(pipe_slow);
 3285 %}
 3286 
 3287 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3288   predicate(UseAVX > 0);
 3289   match(Set dst (SubD src1 (LoadD src2)));
 3290 
 3291   format %{ "vsubsd  $dst, $src1, $src2" %}
 3292   ins_cost(150);
 3293   ins_encode %{
 3294     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3295   %}
 3296   ins_pipe(pipe_slow);
 3297 %}
 3298 
 3299 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3300   predicate(UseAVX > 0);
 3301   match(Set dst (SubD src con));
 3302 
 3303   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3304   ins_cost(150);
 3305   ins_encode %{
 3306     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3307   %}
 3308   ins_pipe(pipe_slow);
 3309 %}
 3310 
 3311 instruct mulF_reg(regF dst, regF src) %{
 3312   predicate((UseSSE>=1) && (UseAVX == 0));
 3313   match(Set dst (MulF dst src));
 3314 
 3315   format %{ "mulss   $dst, $src" %}
 3316   ins_cost(150);
 3317   ins_encode %{
 3318     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3319   %}
 3320   ins_pipe(pipe_slow);
 3321 %}
 3322 
 3323 instruct mulF_mem(regF dst, memory src) %{
 3324   predicate((UseSSE>=1) && (UseAVX == 0));
 3325   match(Set dst (MulF dst (LoadF src)));
 3326 
 3327   format %{ "mulss   $dst, $src" %}
 3328   ins_cost(150);
 3329   ins_encode %{
 3330     __ mulss($dst$$XMMRegister, $src$$Address);
 3331   %}
 3332   ins_pipe(pipe_slow);
 3333 %}
 3334 
 3335 instruct mulF_imm(regF dst, immF con) %{
 3336   predicate((UseSSE>=1) && (UseAVX == 0));
 3337   match(Set dst (MulF dst con));
 3338   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3339   ins_cost(150);
 3340   ins_encode %{
 3341     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3342   %}
 3343   ins_pipe(pipe_slow);
 3344 %}
 3345 
 3346 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3347   predicate(UseAVX > 0);
 3348   match(Set dst (MulF src1 src2));
 3349 
 3350   format %{ "vmulss  $dst, $src1, $src2" %}
 3351   ins_cost(150);
 3352   ins_encode %{
 3353     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3354   %}
 3355   ins_pipe(pipe_slow);
 3356 %}
 3357 
 3358 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3359   predicate(UseAVX > 0);
 3360   match(Set dst (MulF src1 (LoadF src2)));
 3361 
 3362   format %{ "vmulss  $dst, $src1, $src2" %}
 3363   ins_cost(150);
 3364   ins_encode %{
 3365     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3366   %}
 3367   ins_pipe(pipe_slow);
 3368 %}
 3369 
 3370 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3371   predicate(UseAVX > 0);
 3372   match(Set dst (MulF src con));
 3373 
 3374   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3375   ins_cost(150);
 3376   ins_encode %{
 3377     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3378   %}
 3379   ins_pipe(pipe_slow);
 3380 %}
 3381 
 3382 instruct mulD_reg(regD dst, regD src) %{
 3383   predicate((UseSSE>=2) && (UseAVX == 0));
 3384   match(Set dst (MulD dst src));
 3385 
 3386   format %{ "mulsd   $dst, $src" %}
 3387   ins_cost(150);
 3388   ins_encode %{
 3389     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3390   %}
 3391   ins_pipe(pipe_slow);
 3392 %}
 3393 
 3394 instruct mulD_mem(regD dst, memory src) %{
 3395   predicate((UseSSE>=2) && (UseAVX == 0));
 3396   match(Set dst (MulD dst (LoadD src)));
 3397 
 3398   format %{ "mulsd   $dst, $src" %}
 3399   ins_cost(150);
 3400   ins_encode %{
 3401     __ mulsd($dst$$XMMRegister, $src$$Address);
 3402   %}
 3403   ins_pipe(pipe_slow);
 3404 %}
 3405 
 3406 instruct mulD_imm(regD dst, immD con) %{
 3407   predicate((UseSSE>=2) && (UseAVX == 0));
 3408   match(Set dst (MulD dst con));
 3409   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3410   ins_cost(150);
 3411   ins_encode %{
 3412     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3413   %}
 3414   ins_pipe(pipe_slow);
 3415 %}
 3416 
 3417 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3418   predicate(UseAVX > 0);
 3419   match(Set dst (MulD src1 src2));
 3420 
 3421   format %{ "vmulsd  $dst, $src1, $src2" %}
 3422   ins_cost(150);
 3423   ins_encode %{
 3424     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3425   %}
 3426   ins_pipe(pipe_slow);
 3427 %}
 3428 
 3429 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3430   predicate(UseAVX > 0);
 3431   match(Set dst (MulD src1 (LoadD src2)));
 3432 
 3433   format %{ "vmulsd  $dst, $src1, $src2" %}
 3434   ins_cost(150);
 3435   ins_encode %{
 3436     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3437   %}
 3438   ins_pipe(pipe_slow);
 3439 %}
 3440 
 3441 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3442   predicate(UseAVX > 0);
 3443   match(Set dst (MulD src con));
 3444 
 3445   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3446   ins_cost(150);
 3447   ins_encode %{
 3448     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3449   %}
 3450   ins_pipe(pipe_slow);
 3451 %}
 3452 
 3453 instruct divF_reg(regF dst, regF src) %{
 3454   predicate((UseSSE>=1) && (UseAVX == 0));
 3455   match(Set dst (DivF dst src));
 3456 
 3457   format %{ "divss   $dst, $src" %}
 3458   ins_cost(150);
 3459   ins_encode %{
 3460     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3461   %}
 3462   ins_pipe(pipe_slow);
 3463 %}
 3464 
 3465 instruct divF_mem(regF dst, memory src) %{
 3466   predicate((UseSSE>=1) && (UseAVX == 0));
 3467   match(Set dst (DivF dst (LoadF src)));
 3468 
 3469   format %{ "divss   $dst, $src" %}
 3470   ins_cost(150);
 3471   ins_encode %{
 3472     __ divss($dst$$XMMRegister, $src$$Address);
 3473   %}
 3474   ins_pipe(pipe_slow);
 3475 %}
 3476 
 3477 instruct divF_imm(regF dst, immF con) %{
 3478   predicate((UseSSE>=1) && (UseAVX == 0));
 3479   match(Set dst (DivF dst con));
 3480   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3481   ins_cost(150);
 3482   ins_encode %{
 3483     __ divss($dst$$XMMRegister, $constantaddress($con));
 3484   %}
 3485   ins_pipe(pipe_slow);
 3486 %}
 3487 
 3488 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3489   predicate(UseAVX > 0);
 3490   match(Set dst (DivF src1 src2));
 3491 
 3492   format %{ "vdivss  $dst, $src1, $src2" %}
 3493   ins_cost(150);
 3494   ins_encode %{
 3495     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3496   %}
 3497   ins_pipe(pipe_slow);
 3498 %}
 3499 
 3500 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3501   predicate(UseAVX > 0);
 3502   match(Set dst (DivF src1 (LoadF src2)));
 3503 
 3504   format %{ "vdivss  $dst, $src1, $src2" %}
 3505   ins_cost(150);
 3506   ins_encode %{
 3507     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3508   %}
 3509   ins_pipe(pipe_slow);
 3510 %}
 3511 
 3512 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3513   predicate(UseAVX > 0);
 3514   match(Set dst (DivF src con));
 3515 
 3516   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3517   ins_cost(150);
 3518   ins_encode %{
 3519     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3520   %}
 3521   ins_pipe(pipe_slow);
 3522 %}
 3523 
 3524 instruct divD_reg(regD dst, regD src) %{
 3525   predicate((UseSSE>=2) && (UseAVX == 0));
 3526   match(Set dst (DivD dst src));
 3527 
 3528   format %{ "divsd   $dst, $src" %}
 3529   ins_cost(150);
 3530   ins_encode %{
 3531     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3532   %}
 3533   ins_pipe(pipe_slow);
 3534 %}
 3535 
 3536 instruct divD_mem(regD dst, memory src) %{
 3537   predicate((UseSSE>=2) && (UseAVX == 0));
 3538   match(Set dst (DivD dst (LoadD src)));
 3539 
 3540   format %{ "divsd   $dst, $src" %}
 3541   ins_cost(150);
 3542   ins_encode %{
 3543     __ divsd($dst$$XMMRegister, $src$$Address);
 3544   %}
 3545   ins_pipe(pipe_slow);
 3546 %}
 3547 
 3548 instruct divD_imm(regD dst, immD con) %{
 3549   predicate((UseSSE>=2) && (UseAVX == 0));
 3550   match(Set dst (DivD dst con));
 3551   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3552   ins_cost(150);
 3553   ins_encode %{
 3554     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3555   %}
 3556   ins_pipe(pipe_slow);
 3557 %}
 3558 
 3559 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3560   predicate(UseAVX > 0);
 3561   match(Set dst (DivD src1 src2));
 3562 
 3563   format %{ "vdivsd  $dst, $src1, $src2" %}
 3564   ins_cost(150);
 3565   ins_encode %{
 3566     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3567   %}
 3568   ins_pipe(pipe_slow);
 3569 %}
 3570 
 3571 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3572   predicate(UseAVX > 0);
 3573   match(Set dst (DivD src1 (LoadD src2)));
 3574 
 3575   format %{ "vdivsd  $dst, $src1, $src2" %}
 3576   ins_cost(150);
 3577   ins_encode %{
 3578     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3579   %}
 3580   ins_pipe(pipe_slow);
 3581 %}
 3582 
 3583 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3584   predicate(UseAVX > 0);
 3585   match(Set dst (DivD src con));
 3586 
 3587   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3588   ins_cost(150);
 3589   ins_encode %{
 3590     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3591   %}
 3592   ins_pipe(pipe_slow);
 3593 %}
 3594 
 3595 instruct absF_reg(regF dst) %{
 3596   predicate((UseSSE>=1) && (UseAVX == 0));
 3597   match(Set dst (AbsF dst));
 3598   ins_cost(150);
 3599   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3600   ins_encode %{
 3601     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3602   %}
 3603   ins_pipe(pipe_slow);
 3604 %}
 3605 
 3606 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3607   predicate(UseAVX > 0);
 3608   match(Set dst (AbsF src));
 3609   ins_cost(150);
 3610   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3611   ins_encode %{
 3612     int vlen_enc = Assembler::AVX_128bit;
 3613     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3614               ExternalAddress(float_signmask()), vlen_enc);
 3615   %}
 3616   ins_pipe(pipe_slow);
 3617 %}
 3618 
 3619 instruct absD_reg(regD dst) %{
 3620   predicate((UseSSE>=2) && (UseAVX == 0));
 3621   match(Set dst (AbsD dst));
 3622   ins_cost(150);
 3623   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3624             "# abs double by sign masking" %}
 3625   ins_encode %{
 3626     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3627   %}
 3628   ins_pipe(pipe_slow);
 3629 %}
 3630 
 3631 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3632   predicate(UseAVX > 0);
 3633   match(Set dst (AbsD src));
 3634   ins_cost(150);
 3635   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3636             "# abs double by sign masking" %}
 3637   ins_encode %{
 3638     int vlen_enc = Assembler::AVX_128bit;
 3639     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3640               ExternalAddress(double_signmask()), vlen_enc);
 3641   %}
 3642   ins_pipe(pipe_slow);
 3643 %}
 3644 
 3645 instruct negF_reg(regF dst) %{
 3646   predicate((UseSSE>=1) && (UseAVX == 0));
 3647   match(Set dst (NegF dst));
 3648   ins_cost(150);
 3649   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3650   ins_encode %{
 3651     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3652   %}
 3653   ins_pipe(pipe_slow);
 3654 %}
 3655 
 3656 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3657   predicate(UseAVX > 0);
 3658   match(Set dst (NegF src));
 3659   ins_cost(150);
 3660   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3661   ins_encode %{
 3662     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3663                  ExternalAddress(float_signflip()));
 3664   %}
 3665   ins_pipe(pipe_slow);
 3666 %}
 3667 
 3668 instruct negD_reg(regD dst) %{
 3669   predicate((UseSSE>=2) && (UseAVX == 0));
 3670   match(Set dst (NegD dst));
 3671   ins_cost(150);
 3672   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3673             "# neg double by sign flipping" %}
 3674   ins_encode %{
 3675     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3676   %}
 3677   ins_pipe(pipe_slow);
 3678 %}
 3679 
 3680 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3681   predicate(UseAVX > 0);
 3682   match(Set dst (NegD src));
 3683   ins_cost(150);
 3684   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3685             "# neg double by sign flipping" %}
 3686   ins_encode %{
 3687     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3688                  ExternalAddress(double_signflip()));
 3689   %}
 3690   ins_pipe(pipe_slow);
 3691 %}
 3692 
// The sqrtss instruction needs its destination register to be pre-initialized for best performance.
// Therefore, only the rule where the input is pre-loaded into the dst register is defined below.
 3695 instruct sqrtF_reg(regF dst) %{
 3696   predicate(UseSSE>=1);
 3697   match(Set dst (SqrtF dst));
 3698   format %{ "sqrtss  $dst, $dst" %}
 3699   ins_encode %{
 3700     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3701   %}
 3702   ins_pipe(pipe_slow);
 3703 %}
 3704 
// The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
// Therefore, only the rule where the input is pre-loaded into the dst register is defined below.
 3707 instruct sqrtD_reg(regD dst) %{
 3708   predicate(UseSSE>=2);
 3709   match(Set dst (SqrtD dst));
 3710   format %{ "sqrtsd  $dst, $dst" %}
 3711   ins_encode %{
 3712     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3713   %}
 3714   ins_pipe(pipe_slow);
 3715 %}
 3716 
 3717 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3718   effect(TEMP tmp);
 3719   match(Set dst (ConvF2HF src));
 3720   ins_cost(125);
 3721   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3722   ins_encode %{
 3723     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3724   %}
 3725   ins_pipe( pipe_slow );
 3726 %}
 3727 
 3728 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3729   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3730   effect(TEMP ktmp, TEMP rtmp);
 3731   match(Set mem (StoreC mem (ConvF2HF src)));
 3732   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3733   ins_encode %{
 3734     __ movl($rtmp$$Register, 0x1);
 3735     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3736     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3737   %}
 3738   ins_pipe( pipe_slow );
 3739 %}
 3740 
 3741 instruct vconvF2HF(vec dst, vec src) %{
 3742   match(Set dst (VectorCastF2HF src));
 3743   format %{ "vector_conv_F2HF $dst $src" %}
 3744   ins_encode %{
 3745     int vlen_enc = vector_length_encoding(this, $src);
 3746     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3747   %}
 3748   ins_pipe( pipe_slow );
 3749 %}
 3750 
 3751 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3752   predicate(n->as_StoreVector()->memory_size() >= 16);
 3753   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3754   format %{ "vcvtps2ph $mem,$src" %}
 3755   ins_encode %{
 3756     int vlen_enc = vector_length_encoding(this, $src);
 3757     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3758   %}
 3759   ins_pipe( pipe_slow );
 3760 %}
 3761 
 3762 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3763   match(Set dst (ConvHF2F src));
 3764   format %{ "vcvtph2ps $dst,$src" %}
 3765   ins_encode %{
 3766     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3767   %}
 3768   ins_pipe( pipe_slow );
 3769 %}
 3770 
 3771 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3772   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3773   format %{ "vcvtph2ps $dst,$mem" %}
 3774   ins_encode %{
 3775     int vlen_enc = vector_length_encoding(this);
 3776     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3777   %}
 3778   ins_pipe( pipe_slow );
 3779 %}
 3780 
 3781 instruct vconvHF2F(vec dst, vec src) %{
 3782   match(Set dst (VectorCastHF2F src));
 3783   ins_cost(125);
 3784   format %{ "vector_conv_HF2F $dst,$src" %}
 3785   ins_encode %{
 3786     int vlen_enc = vector_length_encoding(this);
 3787     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3788   %}
 3789   ins_pipe( pipe_slow );
 3790 %}
 3791 
 3792 // ---------------------------------------- VectorReinterpret ------------------------------------
 3793 instruct reinterpret_mask(kReg dst) %{
 3794   predicate(n->bottom_type()->isa_vectmask() &&
 3795             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3796   match(Set dst (VectorReinterpret dst));
 3797   ins_cost(125);
 3798   format %{ "vector_reinterpret $dst\t!" %}
 3799   ins_encode %{
 3800     // empty
 3801   %}
 3802   ins_pipe( pipe_slow );
 3803 %}
 3804 
 3805 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3806   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3807             n->bottom_type()->isa_vectmask() &&
 3808             n->in(1)->bottom_type()->isa_vectmask() &&
 3809             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3810             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3811   match(Set dst (VectorReinterpret src));
 3812   effect(TEMP xtmp);
 3813   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3814   ins_encode %{
 3815      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3816      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3817      assert(src_sz == dst_sz , "src and dst size mismatch");
 3818      int vlen_enc = vector_length_encoding(src_sz);
 3819      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3820      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3821   %}
 3822   ins_pipe( pipe_slow );
 3823 %}
 3824 
 3825 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3826   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3827             n->bottom_type()->isa_vectmask() &&
 3828             n->in(1)->bottom_type()->isa_vectmask() &&
 3829             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3830              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3831             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst have equal size in bytes
3832   match(Set dst (VectorReinterpret src));
3833   effect(TEMP xtmp);
3834   format %{ "vector_mask_reinterpret_D2B $dst,$src\t!" %}
3835   ins_encode %{
3836     int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_INT);
3837     int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
3838     assert(src_sz == dst_sz, "src and dst size mismatch");
3839     int vlen_enc = vector_length_encoding(src_sz);
3840     __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3841     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3842   %}
 3843   ins_pipe( pipe_slow );
 3844 %}
 3845 
 3846 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3847   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3848             n->bottom_type()->isa_vectmask() &&
 3849             n->in(1)->bottom_type()->isa_vectmask() &&
 3850             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3851              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3852             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst have equal size in bytes
3853   match(Set dst (VectorReinterpret src));
3854   effect(TEMP xtmp);
3855   format %{ "vector_mask_reinterpret_Q2B $dst,$src\t!" %}
3856   ins_encode %{
3857     int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_LONG);
3858     int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
3859     assert(src_sz == dst_sz, "src and dst size mismatch");
3860     int vlen_enc = vector_length_encoding(src_sz);
3861     __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3862     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3863   %}
 3864   ins_pipe( pipe_slow );
 3865 %}
 3866 
 3867 instruct reinterpret(vec dst) %{
 3868   predicate(!n->bottom_type()->isa_vectmask() &&
 3869             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3870   match(Set dst (VectorReinterpret dst));
 3871   ins_cost(125);
 3872   format %{ "vector_reinterpret $dst\t!" %}
 3873   ins_encode %{
 3874     // empty
 3875   %}
 3876   ins_pipe( pipe_slow );
 3877 %}
 3878 
 3879 instruct reinterpret_expand(vec dst, vec src) %{
 3880   predicate(UseAVX == 0 &&
 3881             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3882   match(Set dst (VectorReinterpret src));
 3883   ins_cost(125);
 3884   effect(TEMP dst);
 3885   format %{ "vector_reinterpret_expand $dst,$src" %}
 3886   ins_encode %{
 3887     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3888     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3889 
 3890     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3891     if (src_vlen_in_bytes == 4) {
 3892       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3893     } else {
 3894       assert(src_vlen_in_bytes == 8, "");
 3895       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3896     }
 3897     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3898   %}
 3899   ins_pipe( pipe_slow );
 3900 %}
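
// Editorial note (not emitted code): the pand above zeroes the destination
// bytes beyond the source length, e.g. expanding an 8-byte vector into a
// 16-byte register keeps bytes 0-7 from $src and clears bytes 8-15.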
 3901 
 3902 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3903   predicate(UseAVX > 0 &&
 3904             !n->bottom_type()->isa_vectmask() &&
 3905             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3906             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3907   match(Set dst (VectorReinterpret src));
 3908   ins_cost(125);
 3909   format %{ "vector_reinterpret_expand $dst,$src" %}
 3910   ins_encode %{
 3911     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3912   %}
 3913   ins_pipe( pipe_slow );
 3914 %}
 3915 
 3916 
 3917 instruct vreinterpret_expand(legVec dst, vec src) %{
 3918   predicate(UseAVX > 0 &&
 3919             !n->bottom_type()->isa_vectmask() &&
 3920             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3921             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3922   match(Set dst (VectorReinterpret src));
 3923   ins_cost(125);
 3924   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3925   ins_encode %{
 3926     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3927       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3928       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3929       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3930       default: ShouldNotReachHere();
 3931     }
 3932   %}
 3933   ins_pipe( pipe_slow );
 3934 %}
 3935 
 3936 instruct reinterpret_shrink(vec dst, legVec src) %{
 3937   predicate(!n->bottom_type()->isa_vectmask() &&
 3938             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3939   match(Set dst (VectorReinterpret src));
 3940   ins_cost(125);
 3941   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3942   ins_encode %{
 3943     switch (Matcher::vector_length_in_bytes(this)) {
 3944       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3945       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3946       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3947       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3948       default: ShouldNotReachHere();
 3949     }
 3950   %}
 3951   ins_pipe( pipe_slow );
 3952 %}
 3953 
 3954 // ----------------------------------------------------------------------------------------------------
 3955 
 3956 #ifdef _LP64
 3957 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3958   match(Set dst (RoundDoubleMode src rmode));
 3959   format %{ "roundsd $dst,$src" %}
 3960   ins_cost(150);
 3961   ins_encode %{
 3962     assert(UseSSE >= 4, "required");
 3963     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3964       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3965     }
 3966     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3967   %}
 3968   ins_pipe(pipe_slow);
 3969 %}
 3970 
 3971 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3972   match(Set dst (RoundDoubleMode con rmode));
 3973   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3974   ins_cost(150);
 3975   ins_encode %{
 3976     assert(UseSSE >= 4, "required");
 3977     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3978   %}
 3979   ins_pipe(pipe_slow);
 3980 %}
 3981 
 3982 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3983   predicate(Matcher::vector_length(n) < 8);
 3984   match(Set dst (RoundDoubleModeV src rmode));
 3985   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3986   ins_encode %{
 3987     assert(UseAVX > 0, "required");
 3988     int vlen_enc = vector_length_encoding(this);
 3989     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3990   %}
 3991   ins_pipe( pipe_slow );
 3992 %}
 3993 
 3994 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3995   predicate(Matcher::vector_length(n) == 8);
 3996   match(Set dst (RoundDoubleModeV src rmode));
 3997   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3998   ins_encode %{
 3999     assert(UseAVX > 2, "required");
 4000     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 4001   %}
 4002   ins_pipe( pipe_slow );
 4003 %}
 4004 
 4005 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 4006   predicate(Matcher::vector_length(n) < 8);
 4007   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4008   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 4009   ins_encode %{
 4010     assert(UseAVX > 0, "required");
 4011     int vlen_enc = vector_length_encoding(this);
 4012     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 4013   %}
 4014   ins_pipe( pipe_slow );
 4015 %}
 4016 
 4017 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 4018   predicate(Matcher::vector_length(n) == 8);
 4019   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4020   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 4021   ins_encode %{
 4022     assert(UseAVX > 2, "required");
 4023     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 4024   %}
 4025   ins_pipe( pipe_slow );
 4026 %}
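
// Editorial note (not emitted code): the immU8 rmode operand in the
// RoundDoubleMode patterns above is passed through as the x86 rounding-control
// immediate, selecting round-to-nearest, round-down, round-up or truncate.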
 4027 #endif // _LP64
 4028 
 4029 instruct onspinwait() %{
 4030   match(OnSpinWait);
 4031   ins_cost(200);
 4032 
 4033   format %{
 4034     $$template
 4035     $$emit$$"pause\t! membar_onspinwait"
 4036   %}
 4037   ins_encode %{
 4038     __ pause();
 4039   %}
 4040   ins_pipe(pipe_slow);
 4041 %}
 4042 
 4043 // a * b + c
 4044 instruct fmaD_reg(regD a, regD b, regD c) %{
 4045   match(Set c (FmaD  c (Binary a b)));
 4046   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4047   ins_cost(150);
 4048   ins_encode %{
4049     assert(UseFMA, "Needs FMA instruction support.");
 4050     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4051   %}
 4052   ins_pipe( pipe_slow );
 4053 %}
 4054 
 4055 // a * b + c
 4056 instruct fmaF_reg(regF a, regF b, regF c) %{
 4057   match(Set c (FmaF  c (Binary a b)));
 4058   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4059   ins_cost(150);
 4060   ins_encode %{
4061     assert(UseFMA, "Needs FMA instruction support.");
 4062     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4063   %}
 4064   ins_pipe( pipe_slow );
 4065 %}
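
// Editorial note (not emitted code): both patterns map to fused multiply-add
// instructions, so a * b + c is computed with a single rounding step rather
// than a rounded multiply followed by a rounded add.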
 4066 
 4067 // ====================VECTOR INSTRUCTIONS=====================================
 4068 
 4069 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4070 instruct MoveVec2Leg(legVec dst, vec src) %{
 4071   match(Set dst src);
 4072   format %{ "" %}
 4073   ins_encode %{
 4074     ShouldNotReachHere();
 4075   %}
 4076   ins_pipe( fpu_reg_reg );
 4077 %}
 4078 
 4079 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4080   match(Set dst src);
 4081   format %{ "" %}
 4082   ins_encode %{
 4083     ShouldNotReachHere();
 4084   %}
 4085   ins_pipe( fpu_reg_reg );
 4086 %}
 4087 
 4088 // ============================================================================
 4089 
4090 // Generic operand pattern for vector loads.
 4091 instruct loadV(vec dst, memory mem) %{
 4092   match(Set dst (LoadVector mem));
 4093   ins_cost(125);
 4094   format %{ "load_vector $dst,$mem" %}
 4095   ins_encode %{
 4096     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4097   %}
 4098   ins_pipe( pipe_slow );
 4099 %}
 4100 
4101 // Generic operand pattern for vector stores.
 4102 instruct storeV(memory mem, vec src) %{
 4103   match(Set mem (StoreVector mem src));
 4104   ins_cost(145);
4105   format %{ "store_vector $mem,$src" %}
 4106   ins_encode %{
 4107     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4108       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4109       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4110       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4111       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4112       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4113       default: ShouldNotReachHere();
 4114     }
 4115   %}
 4116   ins_pipe( pipe_slow );
 4117 %}
 4118 
 4119 // ---------------------------------------- Gather ------------------------------------
 4120 
 4121 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4122 
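// Editorial sketch (not emitted code; base/mask_bit are illustrative names):
// lane-wise, the gather patterns in this section behave as
//   for (int i = 0; i < vlen; i++) {
//     if (mask_bit(i)) dst[i] = base[idx[i]];   // base = resolved $mem address
//   }
// with the unmasked variants materializing an all-ones mask first.
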
 4123 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4124   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4125             Matcher::vector_length_in_bytes(n) <= 32);
 4126   match(Set dst (LoadVectorGather mem idx));
 4127   effect(TEMP dst, TEMP tmp, TEMP mask);
 4128   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4129   ins_encode %{
 4130     int vlen_enc = vector_length_encoding(this);
 4131     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4132     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4133     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4134     __ lea($tmp$$Register, $mem$$Address);
 4135     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4136   %}
 4137   ins_pipe( pipe_slow );
 4138 %}
 4139 
 4140 
 4141 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4142   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4143             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4144   match(Set dst (LoadVectorGather mem idx));
 4145   effect(TEMP dst, TEMP tmp, TEMP ktmp);
4146   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4147   ins_encode %{
 4148     int vlen_enc = vector_length_encoding(this);
 4149     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4150     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4151     __ lea($tmp$$Register, $mem$$Address);
 4152     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4153   %}
 4154   ins_pipe( pipe_slow );
 4155 %}
 4156 
 4157 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4158   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4159             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4160   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4161   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
4162   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4163   ins_encode %{
 4164     assert(UseAVX > 2, "sanity");
 4165     int vlen_enc = vector_length_encoding(this);
 4166     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4167     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4168     // Note: Since the gather instruction partially updates the opmask register used
4169     // for predication, the mask operand is first copied to a temporary.
 4170     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4171     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4172     __ lea($tmp$$Register, $mem$$Address);
 4173     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4174   %}
 4175   ins_pipe( pipe_slow );
 4176 %}
 4177 
 4178 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4179   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4180   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4181   effect(TEMP tmp, TEMP rtmp);
 4182   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4183   ins_encode %{
 4184     int vlen_enc = vector_length_encoding(this);
 4185     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4186     __ lea($tmp$$Register, $mem$$Address);
 4187     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4188   %}
 4189   ins_pipe( pipe_slow );
 4190 %}
 4191 
 4192 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4193                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4194   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4195   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4196   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4197   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4198   ins_encode %{
 4199     int vlen_enc = vector_length_encoding(this);
 4200     int vector_len = Matcher::vector_length(this);
 4201     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4202     __ lea($tmp$$Register, $mem$$Address);
 4203     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4204     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4205                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4206   %}
 4207   ins_pipe( pipe_slow );
 4208 %}
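
// Editorial note (not emitted code): x86 gather instructions only support
// dword/qword elements, so the subword (byte/short) patterns here assemble the
// destination 8 bytes at a time from scalar loads via vgather8b_offset and
// vgather_subword.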
 4209 
 4210 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4211   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4212   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4213   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4214   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4215   ins_encode %{
 4216     int vlen_enc = vector_length_encoding(this);
 4217     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4218     __ lea($tmp$$Register, $mem$$Address);
 4219     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4220   %}
 4221   ins_pipe( pipe_slow );
 4222 %}
 4223 
 4224 
 4225 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4226                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4227   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4228   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4229   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4230   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4231   ins_encode %{
 4232     int vlen_enc = vector_length_encoding(this);
 4233     int vector_len = Matcher::vector_length(this);
 4234     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4235     __ lea($tmp$$Register, $mem$$Address);
 4236     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4237     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4238                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4239   %}
 4240   ins_pipe( pipe_slow );
 4241 %}
 4242 
 4243 
 4244 #ifdef _LP64
 4245 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4246   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4247   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4248   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4249   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4250   ins_encode %{
 4251     int vlen_enc = vector_length_encoding(this);
 4252     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4253     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4254     __ lea($tmp$$Register, $mem$$Address);
 4255     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4256     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4257   %}
 4258   ins_pipe( pipe_slow );
 4259 %}
 4260 
 4261 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4262                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4263   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4264   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4265   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4266   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4267   ins_encode %{
 4268     int vlen_enc = vector_length_encoding(this);
 4269     int vector_len = Matcher::vector_length(this);
 4270     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4271     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4272     __ lea($tmp$$Register, $mem$$Address);
 4273     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4274     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4275     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4276                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4277   %}
 4278   ins_pipe( pipe_slow );
 4279 %}
 4280 
 4281 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4282   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4283   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4284   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4285   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4286   ins_encode %{
 4287     int vlen_enc = vector_length_encoding(this);
 4288     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4289     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4290     __ lea($tmp$$Register, $mem$$Address);
 4291     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4292     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4293                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4294   %}
 4295   ins_pipe( pipe_slow );
 4296 %}
 4297 
 4298 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4299                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4300   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4301   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4302   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4303   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4304   ins_encode %{
 4305     int vlen_enc = vector_length_encoding(this);
 4306     int vector_len = Matcher::vector_length(this);
 4307     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4308     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4309     __ lea($tmp$$Register, $mem$$Address);
 4310     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4311     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4312     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4313                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4314   %}
 4315   ins_pipe( pipe_slow );
 4316 %}
 4317 
 4318 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4319   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4320   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4321   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4322   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4323   ins_encode %{
 4324     int vlen_enc = vector_length_encoding(this);
 4325     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4326     __ lea($tmp$$Register, $mem$$Address);
 4327     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4328     if (elem_bt == T_SHORT) {
 4329       __ movl($mask_idx$$Register, 0x55555555);
 4330       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4331     }
 4332     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4333     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4334   %}
 4335   ins_pipe( pipe_slow );
 4336 %}
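
// Editorial note (not emitted code): vpmovmskb yields one mask bit per byte;
// since a short-lane mask vector repeats each lane's value across both bytes,
// the pextl with 0x55555555 above keeps every other bit, compressing the mask
// to one bit per 16-bit element.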
 4337 
 4338 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4339                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4340   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4341   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4342   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4343   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4344   ins_encode %{
 4345     int vlen_enc = vector_length_encoding(this);
 4346     int vector_len = Matcher::vector_length(this);
 4347     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4348     __ lea($tmp$$Register, $mem$$Address);
 4349     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4350     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4351     if (elem_bt == T_SHORT) {
 4352       __ movl($mask_idx$$Register, 0x55555555);
 4353       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4354     }
 4355     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4356     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4357                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4358   %}
 4359   ins_pipe( pipe_slow );
 4360 %}
 4361 
 4362 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4363   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4364   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4365   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4366   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4367   ins_encode %{
 4368     int vlen_enc = vector_length_encoding(this);
 4369     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4370     __ lea($tmp$$Register, $mem$$Address);
 4371     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4372     if (elem_bt == T_SHORT) {
 4373       __ movl($mask_idx$$Register, 0x55555555);
 4374       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4375     }
 4376     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4377     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4378                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4379   %}
 4380   ins_pipe( pipe_slow );
 4381 %}
 4382 
 4383 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4384                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4385   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4386   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4387   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4388   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4389   ins_encode %{
 4390     int vlen_enc = vector_length_encoding(this);
 4391     int vector_len = Matcher::vector_length(this);
 4392     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4394     __ lea($tmp$$Register, $mem$$Address);
 4395     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4396     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4397     if (elem_bt == T_SHORT) {
 4398       __ movl($mask_idx$$Register, 0x55555555);
 4399       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4400     }
 4401     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4402     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4403                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4404   %}
 4405   ins_pipe( pipe_slow );
 4406 %}
 4407 #endif
 4408 
 4409 // ====================Scatter=======================================
 4410 
 4411 // Scatter INT, LONG, FLOAT, DOUBLE
 4412 
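// Editorial sketch (not emitted code; base/mask_bit are illustrative names):
// lane-wise, the scatter patterns below behave as
//   for (int i = 0; i < vlen; i++) {
//     if (mask_bit(i)) base[idx[i]] = src[i];   // base = resolved $mem address
//   }
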
 4413 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4414   predicate(UseAVX > 2);
 4415   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4416   effect(TEMP tmp, TEMP ktmp);
4417   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4418   ins_encode %{
 4419     int vlen_enc = vector_length_encoding(this, $src);
 4420     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4421 
 4422     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4423     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4424 
 4425     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4426     __ lea($tmp$$Register, $mem$$Address);
 4427     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4428   %}
 4429   ins_pipe( pipe_slow );
 4430 %}
 4431 
 4432 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4433   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4434   effect(TEMP tmp, TEMP ktmp);
4435   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t! using $ktmp and $tmp as TEMP" %}
 4436   ins_encode %{
 4437     int vlen_enc = vector_length_encoding(this, $src);
 4438     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4439     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4440     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
4441     // Note: Since the scatter instruction partially updates the opmask register used
4442     // for predication, the mask operand is first copied to a temporary.
 4443     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4444     __ lea($tmp$$Register, $mem$$Address);
 4445     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4446   %}
 4447   ins_pipe( pipe_slow );
 4448 %}
 4449 
 4450 // ====================REPLICATE=======================================
 4451 
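// Editorial sketch (not emitted code): every Replicate pattern in this section
// broadcasts a single scalar into all lanes, i.e. dst[i] = src for each i in
// [0, vlen).
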
 4452 // Replicate byte scalar to be vector
 4453 instruct vReplB_reg(vec dst, rRegI src) %{
 4454   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4455   match(Set dst (Replicate src));
 4456   format %{ "replicateB $dst,$src" %}
 4457   ins_encode %{
 4458     uint vlen = Matcher::vector_length(this);
 4459     if (UseAVX >= 2) {
 4460       int vlen_enc = vector_length_encoding(this);
4461       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
 4462         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4463         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4464       } else {
 4465         __ movdl($dst$$XMMRegister, $src$$Register);
 4466         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4467       }
 4468     } else {
4469       assert(UseAVX < 2, "");
 4470       __ movdl($dst$$XMMRegister, $src$$Register);
 4471       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4472       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4473       if (vlen >= 16) {
 4474         assert(vlen == 16, "");
 4475         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4476       }
 4477     }
 4478   %}
 4479   ins_pipe( pipe_slow );
 4480 %}
 4481 
 4482 instruct ReplB_mem(vec dst, memory mem) %{
 4483   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4484   match(Set dst (Replicate (LoadB mem)));
 4485   format %{ "replicateB $dst,$mem" %}
 4486   ins_encode %{
 4487     int vlen_enc = vector_length_encoding(this);
 4488     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4489   %}
 4490   ins_pipe( pipe_slow );
 4491 %}
 4492 
 4493 // ====================ReplicateS=======================================
 4494 
 4495 instruct vReplS_reg(vec dst, rRegI src) %{
 4496   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4497   match(Set dst (Replicate src));
 4498   format %{ "replicateS $dst,$src" %}
 4499   ins_encode %{
 4500     uint vlen = Matcher::vector_length(this);
 4501     int vlen_enc = vector_length_encoding(this);
 4502     if (UseAVX >= 2) {
4503       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
 4504         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4505         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4506       } else {
 4507         __ movdl($dst$$XMMRegister, $src$$Register);
 4508         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4509       }
 4510     } else {
 4511       assert(UseAVX < 2, "");
 4512       __ movdl($dst$$XMMRegister, $src$$Register);
 4513       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4514       if (vlen >= 8) {
 4515         assert(vlen == 8, "");
 4516         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4517       }
 4518     }
 4519   %}
 4520   ins_pipe( pipe_slow );
 4521 %}
 4522 
 4523 instruct ReplS_mem(vec dst, memory mem) %{
 4524   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4525   match(Set dst (Replicate (LoadS mem)));
 4526   format %{ "replicateS $dst,$mem" %}
 4527   ins_encode %{
 4528     int vlen_enc = vector_length_encoding(this);
 4529     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4530   %}
 4531   ins_pipe( pipe_slow );
 4532 %}
 4533 
 4534 // ====================ReplicateI=======================================
 4535 
 4536 instruct ReplI_reg(vec dst, rRegI src) %{
 4537   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4538   match(Set dst (Replicate src));
 4539   format %{ "replicateI $dst,$src" %}
 4540   ins_encode %{
 4541     uint vlen = Matcher::vector_length(this);
 4542     int vlen_enc = vector_length_encoding(this);
 4543     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4544       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4545     } else if (VM_Version::supports_avx2()) {
 4546       __ movdl($dst$$XMMRegister, $src$$Register);
 4547       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4548     } else {
 4549       __ movdl($dst$$XMMRegister, $src$$Register);
 4550       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4551     }
 4552   %}
 4553   ins_pipe( pipe_slow );
 4554 %}
 4555 
 4556 instruct ReplI_mem(vec dst, memory mem) %{
 4557   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4558   match(Set dst (Replicate (LoadI mem)));
 4559   format %{ "replicateI $dst,$mem" %}
 4560   ins_encode %{
 4561     int vlen_enc = vector_length_encoding(this);
 4562     if (VM_Version::supports_avx2()) {
 4563       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4564     } else if (VM_Version::supports_avx()) {
 4565       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4566     } else {
 4567       __ movdl($dst$$XMMRegister, $mem$$Address);
 4568       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4569     }
 4570   %}
 4571   ins_pipe( pipe_slow );
 4572 %}
 4573 
 4574 instruct ReplI_imm(vec dst, immI con) %{
 4575   predicate(Matcher::is_non_long_integral_vector(n));
 4576   match(Set dst (Replicate con));
 4577   format %{ "replicateI $dst,$con" %}
 4578   ins_encode %{
 4579     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4580         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4581             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4582                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4583     BasicType bt = Matcher::vector_element_basic_type(this);
 4584     int vlen = Matcher::vector_length_in_bytes(this);
 4585     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4586   %}
 4587   ins_pipe( pipe_slow );
 4588 %}
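
// Editorial example (not emitted code): for a 128-bit T_INT vector with
// $con == 7, the constant table entry holds the value pre-replicated and
// load_constant_vector fills $dst with {7, 7, 7, 7}.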
 4589 
 4590 // Replicate scalar zero to be vector
 4591 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4592   predicate(Matcher::is_non_long_integral_vector(n));
 4593   match(Set dst (Replicate zero));
 4594   format %{ "replicateI $dst,$zero" %}
 4595   ins_encode %{
 4596     int vlen_enc = vector_length_encoding(this);
 4597     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4598       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4599     } else {
 4600       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4601     }
 4602   %}
 4603   ins_pipe( fpu_reg_reg );
 4604 %}
 4605 
 4606 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4607   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4608   match(Set dst (Replicate con));
 4609   format %{ "vallones $dst" %}
 4610   ins_encode %{
 4611     int vector_len = vector_length_encoding(this);
 4612     __ vallones($dst$$XMMRegister, vector_len);
 4613   %}
 4614   ins_pipe( pipe_slow );
 4615 %}
 4616 
 4617 // ====================ReplicateL=======================================
 4618 
 4619 #ifdef _LP64
 4620 // Replicate long (8 byte) scalar to be vector
 4621 instruct ReplL_reg(vec dst, rRegL src) %{
 4622   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4623   match(Set dst (Replicate src));
 4624   format %{ "replicateL $dst,$src" %}
 4625   ins_encode %{
 4626     int vlen = Matcher::vector_length(this);
 4627     int vlen_enc = vector_length_encoding(this);
 4628     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4629       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4630     } else if (VM_Version::supports_avx2()) {
 4631       __ movdq($dst$$XMMRegister, $src$$Register);
 4632       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4633     } else {
 4634       __ movdq($dst$$XMMRegister, $src$$Register);
 4635       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4636     }
 4637   %}
 4638   ins_pipe( pipe_slow );
 4639 %}
 4640 #else // _LP64
 4641 // Replicate long (8 byte) scalar to be vector
 4642 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4643   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4644   match(Set dst (Replicate src));
 4645   effect(TEMP dst, USE src, TEMP tmp);
 4646   format %{ "replicateL $dst,$src" %}
 4647   ins_encode %{
 4648     uint vlen = Matcher::vector_length(this);
 4649     if (vlen == 2) {
 4650       __ movdl($dst$$XMMRegister, $src$$Register);
 4651       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4652       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4653       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4654     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4655       int vlen_enc = Assembler::AVX_256bit;
 4656       __ movdl($dst$$XMMRegister, $src$$Register);
 4657       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4658       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4659       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4660     } else {
 4661       __ movdl($dst$$XMMRegister, $src$$Register);
 4662       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4663       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4664       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4665       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4666     }
 4667   %}
 4668   ins_pipe( pipe_slow );
 4669 %}
 4670 
 4671 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4672   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4673   match(Set dst (Replicate src));
 4674   effect(TEMP dst, USE src, TEMP tmp);
 4675   format %{ "replicateL $dst,$src" %}
 4676   ins_encode %{
 4677     if (VM_Version::supports_avx512vl()) {
 4678       __ movdl($dst$$XMMRegister, $src$$Register);
 4679       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4680       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4681       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4682       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4683       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4684     } else {
 4685       int vlen_enc = Assembler::AVX_512bit;
 4686       __ movdl($dst$$XMMRegister, $src$$Register);
 4687       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4688       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4689       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4690     }
 4691   %}
 4692   ins_pipe( pipe_slow );
 4693 %}
 4694 #endif // _LP64
 4695 
 4696 instruct ReplL_mem(vec dst, memory mem) %{
 4697   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4698   match(Set dst (Replicate (LoadL mem)));
 4699   format %{ "replicateL $dst,$mem" %}
 4700   ins_encode %{
 4701     int vlen_enc = vector_length_encoding(this);
 4702     if (VM_Version::supports_avx2()) {
 4703       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4704     } else if (VM_Version::supports_sse3()) {
 4705       __ movddup($dst$$XMMRegister, $mem$$Address);
 4706     } else {
 4707       __ movq($dst$$XMMRegister, $mem$$Address);
 4708       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4709     }
 4710   %}
 4711   ins_pipe( pipe_slow );
 4712 %}
 4713 
 4714 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4715 instruct ReplL_imm(vec dst, immL con) %{
 4716   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4717   match(Set dst (Replicate con));
 4718   format %{ "replicateL $dst,$con" %}
 4719   ins_encode %{
 4720     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4721     int vlen = Matcher::vector_length_in_bytes(this);
 4722     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4723   %}
 4724   ins_pipe( pipe_slow );
 4725 %}
 4726 
 4727 instruct ReplL_zero(vec dst, immL0 zero) %{
 4728   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4729   match(Set dst (Replicate zero));
 4730   format %{ "replicateL $dst,$zero" %}
 4731   ins_encode %{
 4732     int vlen_enc = vector_length_encoding(this);
 4733     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4734       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4735     } else {
 4736       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4737     }
 4738   %}
 4739   ins_pipe( fpu_reg_reg );
 4740 %}
 4741 
 4742 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4743   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4744   match(Set dst (Replicate con));
 4745   format %{ "vallones $dst" %}
 4746   ins_encode %{
 4747     int vector_len = vector_length_encoding(this);
 4748     __ vallones($dst$$XMMRegister, vector_len);
 4749   %}
 4750   ins_pipe( pipe_slow );
 4751 %}
 4752 
 4753 // ====================ReplicateF=======================================
 4754 
 4755 instruct vReplF_reg(vec dst, vlRegF src) %{
 4756   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4757   match(Set dst (Replicate src));
 4758   format %{ "replicateF $dst,$src" %}
 4759   ins_encode %{
 4760     uint vlen = Matcher::vector_length(this);
 4761     int vlen_enc = vector_length_encoding(this);
 4762     if (vlen <= 4) {
 4763       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4764     } else if (VM_Version::supports_avx2()) {
 4765       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4766     } else {
 4767       assert(vlen == 8, "sanity");
 4768       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4769       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4770     }
 4771   %}
 4772   ins_pipe( pipe_slow );
 4773 %}
 4774 
 4775 instruct ReplF_reg(vec dst, vlRegF src) %{
 4776   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4777   match(Set dst (Replicate src));
 4778   format %{ "replicateF $dst,$src" %}
 4779   ins_encode %{
 4780     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4781   %}
 4782   ins_pipe( pipe_slow );
 4783 %}
 4784 
 4785 instruct ReplF_mem(vec dst, memory mem) %{
 4786   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4787   match(Set dst (Replicate (LoadF mem)));
 4788   format %{ "replicateF $dst,$mem" %}
 4789   ins_encode %{
 4790     int vlen_enc = vector_length_encoding(this);
 4791     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4792   %}
 4793   ins_pipe( pipe_slow );
 4794 %}
 4795 
 4796 // Replicate float scalar immediate to be vector by loading from const table.
 4797 instruct ReplF_imm(vec dst, immF con) %{
 4798   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4799   match(Set dst (Replicate con));
 4800   format %{ "replicateF $dst,$con" %}
 4801   ins_encode %{
 4802     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4803         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4804     int vlen = Matcher::vector_length_in_bytes(this);
 4805     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4806   %}
 4807   ins_pipe( pipe_slow );
 4808 %}
 4809 
 4810 instruct ReplF_zero(vec dst, immF0 zero) %{
 4811   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4812   match(Set dst (Replicate zero));
 4813   format %{ "replicateF $dst,$zero" %}
 4814   ins_encode %{
 4815     int vlen_enc = vector_length_encoding(this);
 4816     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4817       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4818     } else {
 4819       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4820     }
 4821   %}
 4822   ins_pipe( fpu_reg_reg );
 4823 %}
 4824 
 4825 // ====================ReplicateD=======================================
 4826 
4827 // Replicate double (8 byte) scalar to be vector
 4828 instruct vReplD_reg(vec dst, vlRegD src) %{
 4829   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4830   match(Set dst (Replicate src));
 4831   format %{ "replicateD $dst,$src" %}
 4832   ins_encode %{
 4833     uint vlen = Matcher::vector_length(this);
 4834     int vlen_enc = vector_length_encoding(this);
 4835     if (vlen <= 2) {
 4836       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4837     } else if (VM_Version::supports_avx2()) {
 4838       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4839     } else {
 4840       assert(vlen == 4, "sanity");
 4841       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4842       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4843     }
 4844   %}
 4845   ins_pipe( pipe_slow );
 4846 %}
 4847 
 4848 instruct ReplD_reg(vec dst, vlRegD src) %{
 4849   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4850   match(Set dst (Replicate src));
 4851   format %{ "replicateD $dst,$src" %}
 4852   ins_encode %{
 4853     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4854   %}
 4855   ins_pipe( pipe_slow );
 4856 %}
 4857 
 4858 instruct ReplD_mem(vec dst, memory mem) %{
 4859   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4860   match(Set dst (Replicate (LoadD mem)));
 4861   format %{ "replicateD $dst,$mem" %}
 4862   ins_encode %{
 4863     if (Matcher::vector_length(this) >= 4) {
 4864       int vlen_enc = vector_length_encoding(this);
 4865       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4866     } else {
 4867       __ movddup($dst$$XMMRegister, $mem$$Address);
 4868     }
 4869   %}
 4870   ins_pipe( pipe_slow );
 4871 %}
 4872 
 4873 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4874 instruct ReplD_imm(vec dst, immD con) %{
 4875   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4876   match(Set dst (Replicate con));
 4877   format %{ "replicateD $dst,$con" %}
 4878   ins_encode %{
 4879     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4880     int vlen = Matcher::vector_length_in_bytes(this);
 4881     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4882   %}
 4883   ins_pipe( pipe_slow );
 4884 %}
 4885 
 4886 instruct ReplD_zero(vec dst, immD0 zero) %{
 4887   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4888   match(Set dst (Replicate zero));
 4889   format %{ "replicateD $dst,$zero" %}
 4890   ins_encode %{
 4891     int vlen_enc = vector_length_encoding(this);
 4892     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4893       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4894     } else {
 4895       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4896     }
 4897   %}
 4898   ins_pipe( fpu_reg_reg );
 4899 %}
 4900 
 4901 // ====================VECTOR INSERT=======================================
 4902 
 4903 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4904   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4905   match(Set dst (VectorInsert (Binary dst val) idx));
 4906   format %{ "vector_insert $dst,$val,$idx" %}
 4907   ins_encode %{
 4908     assert(UseSSE >= 4, "required");
 4909     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4910 
 4911     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4912 
 4913     assert(is_integral_type(elem_bt), "");
 4914     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4915 
 4916     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4917   %}
 4918   ins_pipe( pipe_slow );
 4919 %}
 4920 
 4921 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4922   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4923   match(Set dst (VectorInsert (Binary src val) idx));
 4924   effect(TEMP vtmp);
 4925   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4926   ins_encode %{
 4927     int vlen_enc = Assembler::AVX_256bit;
 4928     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4929     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4930     int log2epr = log2(elem_per_lane);
 4931 
 4932     assert(is_integral_type(elem_bt), "sanity");
 4933     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4934 
 4935     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4936     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4937     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4938     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4939     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4940   %}
 4941   ins_pipe( pipe_slow );
 4942 %}
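
// Editorial example (not emitted code): in the 256-bit pattern above with
// T_INT elements (8 lanes, log2epr == 2), y_idx = idx >> 2 selects the 128-bit
// half and x_idx = idx & 3 selects the slot within that half.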
 4943 
 4944 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4945   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4946   match(Set dst (VectorInsert (Binary src val) idx));
 4947   effect(TEMP vtmp);
 4948   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4949   ins_encode %{
 4950     assert(UseAVX > 2, "sanity");
 4951 
 4952     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4953     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4954     int log2epr = log2(elem_per_lane);
 4955 
 4956     assert(is_integral_type(elem_bt), "");
 4957     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4958 
 4959     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4960     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4961     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4962     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4963     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4964   %}
 4965   ins_pipe( pipe_slow );
 4966 %}
 4967 
 4968 #ifdef _LP64
 4969 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4970   predicate(Matcher::vector_length(n) == 2);
 4971   match(Set dst (VectorInsert (Binary dst val) idx));
 4972   format %{ "vector_insert $dst,$val,$idx" %}
 4973   ins_encode %{
 4974     assert(UseSSE >= 4, "required");
 4975     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4976     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4977 
 4978     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4979   %}
 4980   ins_pipe( pipe_slow );
 4981 %}
 4982 
 4983 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4984   predicate(Matcher::vector_length(n) == 4);
 4985   match(Set dst (VectorInsert (Binary src val) idx));
 4986   effect(TEMP vtmp);
 4987   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4988   ins_encode %{
 4989     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4990     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4991 
 4992     uint x_idx = $idx$$constant & right_n_bits(1);
 4993     uint y_idx = ($idx$$constant >> 1) & 1;
 4994     int vlen_enc = Assembler::AVX_256bit;
 4995     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4996     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4997     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4998   %}
 4999   ins_pipe( pipe_slow );
 5000 %}
 5001 
 5002 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 5003   predicate(Matcher::vector_length(n) == 8);
 5004   match(Set dst (VectorInsert (Binary src val) idx));
 5005   effect(TEMP vtmp);
 5006   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5007   ins_encode %{
 5008     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 5009     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5010 
 5011     uint x_idx = $idx$$constant & right_n_bits(1);
 5012     uint y_idx = ($idx$$constant >> 1) & 3;
 5013     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5014     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 5015     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5016   %}
 5017   ins_pipe( pipe_slow );
 5018 %}
 5019 #endif
 5020 
 5021 instruct insertF(vec dst, regF val, immU8 idx) %{
 5022   predicate(Matcher::vector_length(n) < 8);
 5023   match(Set dst (VectorInsert (Binary dst val) idx));
 5024   format %{ "vector_insert $dst,$val,$idx" %}
 5025   ins_encode %{
 5026     assert(UseSSE >= 4, "sanity");
 5027 
 5028     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5029     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5030 
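    // insertps selects the destination element via bits 5:4 of its immediate,
    // hence the shift by 4 below.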
 5031     uint x_idx = $idx$$constant & right_n_bits(2);
 5032     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5033   %}
 5034   ins_pipe( pipe_slow );
 5035 %}
 5036 
 5037 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 5038   predicate(Matcher::vector_length(n) >= 8);
 5039   match(Set dst (VectorInsert (Binary src val) idx));
 5040   effect(TEMP vtmp);
 5041   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5042   ins_encode %{
 5043     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5044     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5045 
 5046     int vlen = Matcher::vector_length(this);
 5047     uint x_idx = $idx$$constant & right_n_bits(2);
 5048     if (vlen == 8) {
 5049       uint y_idx = ($idx$$constant >> 2) & 1;
 5050       int vlen_enc = Assembler::AVX_256bit;
 5051       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5052       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5053       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5054     } else {
 5055       assert(vlen == 16, "sanity");
 5056       uint y_idx = ($idx$$constant >> 2) & 3;
 5057       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5058       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5059       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5060     }
 5061   %}
 5062   ins_pipe( pipe_slow );
 5063 %}
 5064 
 5065 #ifdef _LP64
 5066 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 5067   predicate(Matcher::vector_length(n) == 2);
 5068   match(Set dst (VectorInsert (Binary dst val) idx));
 5069   effect(TEMP tmp);
 5070   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 5071   ins_encode %{
 5072     assert(UseSSE >= 4, "sanity");
 5073     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5074     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5075 
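    // There is no instruction to insert a double directly from another XMM
    // register, so the value is routed through a 64-bit GPR and inserted with
    // pinsrq (the 256/512-bit rules below use the same trick).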
 5076     __ movq($tmp$$Register, $val$$XMMRegister);
 5077     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 5078   %}
 5079   ins_pipe( pipe_slow );
 5080 %}
 5081 
 5082 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 5083   predicate(Matcher::vector_length(n) == 4);
 5084   match(Set dst (VectorInsert (Binary src val) idx));
 5085   effect(TEMP vtmp, TEMP tmp);
 5086   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5087   ins_encode %{
 5088     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5089     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5090 
 5091     uint x_idx = $idx$$constant & right_n_bits(1);
 5092     uint y_idx = ($idx$$constant >> 1) & 1;
 5093     int vlen_enc = Assembler::AVX_256bit;
 5094     __ movq($tmp$$Register, $val$$XMMRegister);
 5095     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5096     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5097     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5098   %}
 5099   ins_pipe( pipe_slow );
 5100 %}
 5101 
instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 5103   predicate(Matcher::vector_length(n) == 8);
 5104   match(Set dst (VectorInsert (Binary src val) idx));
 5105   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5107   ins_encode %{
 5108     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5109     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5110 
 5111     uint x_idx = $idx$$constant & right_n_bits(1);
 5112     uint y_idx = ($idx$$constant >> 1) & 3;
 5113     __ movq($tmp$$Register, $val$$XMMRegister);
 5114     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5115     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5116     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5117   %}
 5118   ins_pipe( pipe_slow );
 5119 %}
 5120 #endif
 5121 
 5122 // ====================REDUCTION ARITHMETIC=======================================
 5123 
 5124 // =======================Int Reduction==========================================
 5125 
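// Integer reductions fold the vector in half repeatedly (combining the upper
// half with the lower half) until a single element remains, which is then
// combined with the scalar input src1; the folding itself is emitted by
// C2_MacroAssembler::reduceI() and its sibling helpers.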
 5126 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5127   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 5128   match(Set dst (AddReductionVI src1 src2));
 5129   match(Set dst (MulReductionVI src1 src2));
 5130   match(Set dst (AndReductionV  src1 src2));
 5131   match(Set dst ( OrReductionV  src1 src2));
 5132   match(Set dst (XorReductionV  src1 src2));
 5133   match(Set dst (MinReductionV  src1 src2));
 5134   match(Set dst (MaxReductionV  src1 src2));
 5135   effect(TEMP vtmp1, TEMP vtmp2);
 5136   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5137   ins_encode %{
 5138     int opcode = this->ideal_Opcode();
 5139     int vlen = Matcher::vector_length(this, $src2);
 5140     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5141   %}
 5142   ins_pipe( pipe_slow );
 5143 %}
 5144 
 5145 // =======================Long Reduction==========================================
 5146 
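// Without AVX512DQ some of the 64-bit lane operations used by the reduction
// have no EVEX encoding, so the first rule is restricted to the legacy XMM
// registers (legVec); with AVX512DQ the full register file (vec) is usable.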
 5147 #ifdef _LP64
 5148 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5149   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 5150   match(Set dst (AddReductionVL src1 src2));
 5151   match(Set dst (MulReductionVL src1 src2));
 5152   match(Set dst (AndReductionV  src1 src2));
 5153   match(Set dst ( OrReductionV  src1 src2));
 5154   match(Set dst (XorReductionV  src1 src2));
 5155   match(Set dst (MinReductionV  src1 src2));
 5156   match(Set dst (MaxReductionV  src1 src2));
 5157   effect(TEMP vtmp1, TEMP vtmp2);
 5158   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5159   ins_encode %{
 5160     int opcode = this->ideal_Opcode();
 5161     int vlen = Matcher::vector_length(this, $src2);
 5162     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5163   %}
 5164   ins_pipe( pipe_slow );
 5165 %}
 5166 
 5167 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5168   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5169   match(Set dst (AddReductionVL src1 src2));
 5170   match(Set dst (MulReductionVL src1 src2));
 5171   match(Set dst (AndReductionV  src1 src2));
 5172   match(Set dst ( OrReductionV  src1 src2));
 5173   match(Set dst (XorReductionV  src1 src2));
 5174   match(Set dst (MinReductionV  src1 src2));
 5175   match(Set dst (MaxReductionV  src1 src2));
 5176   effect(TEMP vtmp1, TEMP vtmp2);
 5177   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5178   ins_encode %{
 5179     int opcode = this->ideal_Opcode();
 5180     int vlen = Matcher::vector_length(this, $src2);
 5181     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5182   %}
 5183   ins_pipe( pipe_slow );
 5184 %}
 5185 #endif // _LP64
 5186 
 5187 // =======================Float Reduction==========================================
 5188 
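// The strictly ordered rules below keep the sequential left-to-right
// accumulation that Java's scalar FP semantics require (dst serves as both
// accumulator and result); the unordered rules further down are free to
// reassociate and are only reached from the Vector API.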
 5189 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5190   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5191   match(Set dst (AddReductionVF dst src));
 5192   match(Set dst (MulReductionVF dst src));
 5193   effect(TEMP dst, TEMP vtmp);
 5194   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5195   ins_encode %{
 5196     int opcode = this->ideal_Opcode();
 5197     int vlen = Matcher::vector_length(this, $src);
 5198     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5199   %}
 5200   ins_pipe( pipe_slow );
 5201 %}
 5202 
 5203 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5204   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5205   match(Set dst (AddReductionVF dst src));
 5206   match(Set dst (MulReductionVF dst src));
 5207   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5208   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5209   ins_encode %{
 5210     int opcode = this->ideal_Opcode();
 5211     int vlen = Matcher::vector_length(this, $src);
 5212     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5213   %}
 5214   ins_pipe( pipe_slow );
 5215 %}
 5216 
 5217 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5218   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5219   match(Set dst (AddReductionVF dst src));
 5220   match(Set dst (MulReductionVF dst src));
 5221   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5222   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5223   ins_encode %{
 5224     int opcode = this->ideal_Opcode();
 5225     int vlen = Matcher::vector_length(this, $src);
 5226     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5227   %}
 5228   ins_pipe( pipe_slow );
 5229 %}
 5230 
instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5233   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5234   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5235   // src1 contains reduction identity
 5236   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5237   match(Set dst (AddReductionVF src1 src2));
 5238   match(Set dst (MulReductionVF src1 src2));
 5239   effect(TEMP dst);
 5240   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5241   ins_encode %{
 5242     int opcode = this->ideal_Opcode();
 5243     int vlen = Matcher::vector_length(this, $src2);
 5244     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5245   %}
 5246   ins_pipe( pipe_slow );
 5247 %}
 5248 
 5249 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5250   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5251   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5252   // src1 contains reduction identity
 5253   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5254   match(Set dst (AddReductionVF src1 src2));
 5255   match(Set dst (MulReductionVF src1 src2));
 5256   effect(TEMP dst, TEMP vtmp);
 5257   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5258   ins_encode %{
 5259     int opcode = this->ideal_Opcode();
 5260     int vlen = Matcher::vector_length(this, $src2);
 5261     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5262   %}
 5263   ins_pipe( pipe_slow );
 5264 %}
 5265 
 5266 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5267   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5268   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5269   // src1 contains reduction identity
 5270   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5271   match(Set dst (AddReductionVF src1 src2));
 5272   match(Set dst (MulReductionVF src1 src2));
 5273   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5274   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5275   ins_encode %{
 5276     int opcode = this->ideal_Opcode();
 5277     int vlen = Matcher::vector_length(this, $src2);
 5278     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5279   %}
 5280   ins_pipe( pipe_slow );
 5281 %}
 5282 
 5283 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5284   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5285   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5286   // src1 contains reduction identity
 5287   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5288   match(Set dst (AddReductionVF src1 src2));
 5289   match(Set dst (MulReductionVF src1 src2));
 5290   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5291   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5292   ins_encode %{
 5293     int opcode = this->ideal_Opcode();
 5294     int vlen = Matcher::vector_length(this, $src2);
 5295     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5296   %}
 5297   ins_pipe( pipe_slow );
 5298 %}
 5299 
 5300 // =======================Double Reduction==========================================
 5301 
 5302 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5303   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5304   match(Set dst (AddReductionVD dst src));
 5305   match(Set dst (MulReductionVD dst src));
 5306   effect(TEMP dst, TEMP vtmp);
 5307   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5308   ins_encode %{
 5309     int opcode = this->ideal_Opcode();
 5310     int vlen = Matcher::vector_length(this, $src);
 5311     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
 5313   ins_pipe( pipe_slow );
 5314 %}
 5315 
 5316 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5317   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5318   match(Set dst (AddReductionVD dst src));
 5319   match(Set dst (MulReductionVD dst src));
 5320   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5321   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5322   ins_encode %{
 5323     int opcode = this->ideal_Opcode();
 5324     int vlen = Matcher::vector_length(this, $src);
 5325     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5326   %}
 5327   ins_pipe( pipe_slow );
 5328 %}
 5329 
 5330 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5331   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5332   match(Set dst (AddReductionVD dst src));
 5333   match(Set dst (MulReductionVD dst src));
 5334   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5335   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5336   ins_encode %{
 5337     int opcode = this->ideal_Opcode();
 5338     int vlen = Matcher::vector_length(this, $src);
 5339     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5340   %}
 5341   ins_pipe( pipe_slow );
 5342 %}
 5343 
 5344 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5345   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5346   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5347   // src1 contains reduction identity
 5348   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5349   match(Set dst (AddReductionVD src1 src2));
 5350   match(Set dst (MulReductionVD src1 src2));
 5351   effect(TEMP dst);
 5352   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5353   ins_encode %{
 5354     int opcode = this->ideal_Opcode();
 5355     int vlen = Matcher::vector_length(this, $src2);
 5356     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
  %}
 5358   ins_pipe( pipe_slow );
 5359 %}
 5360 
 5361 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5362   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5363   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5364   // src1 contains reduction identity
 5365   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5366   match(Set dst (AddReductionVD src1 src2));
 5367   match(Set dst (MulReductionVD src1 src2));
 5368   effect(TEMP dst, TEMP vtmp);
 5369   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5370   ins_encode %{
 5371     int opcode = this->ideal_Opcode();
 5372     int vlen = Matcher::vector_length(this, $src2);
 5373     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5374   %}
 5375   ins_pipe( pipe_slow );
 5376 %}
 5377 
 5378 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5379   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5380   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5381   // src1 contains reduction identity
 5382   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5383   match(Set dst (AddReductionVD src1 src2));
 5384   match(Set dst (MulReductionVD src1 src2));
 5385   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5386   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5387   ins_encode %{
 5388     int opcode = this->ideal_Opcode();
 5389     int vlen = Matcher::vector_length(this, $src2);
 5390     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5391   %}
 5392   ins_pipe( pipe_slow );
 5393 %}
 5394 
 5395 // =======================Byte Reduction==========================================
 5396 
 5397 #ifdef _LP64
 5398 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5399   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5400   match(Set dst (AddReductionVI src1 src2));
 5401   match(Set dst (AndReductionV  src1 src2));
 5402   match(Set dst ( OrReductionV  src1 src2));
 5403   match(Set dst (XorReductionV  src1 src2));
 5404   match(Set dst (MinReductionV  src1 src2));
 5405   match(Set dst (MaxReductionV  src1 src2));
 5406   effect(TEMP vtmp1, TEMP vtmp2);
 5407   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5408   ins_encode %{
 5409     int opcode = this->ideal_Opcode();
 5410     int vlen = Matcher::vector_length(this, $src2);
 5411     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5412   %}
 5413   ins_pipe( pipe_slow );
 5414 %}
 5415 
 5416 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5417   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5418   match(Set dst (AddReductionVI src1 src2));
 5419   match(Set dst (AndReductionV  src1 src2));
 5420   match(Set dst ( OrReductionV  src1 src2));
 5421   match(Set dst (XorReductionV  src1 src2));
 5422   match(Set dst (MinReductionV  src1 src2));
 5423   match(Set dst (MaxReductionV  src1 src2));
 5424   effect(TEMP vtmp1, TEMP vtmp2);
 5425   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5426   ins_encode %{
 5427     int opcode = this->ideal_Opcode();
 5428     int vlen = Matcher::vector_length(this, $src2);
 5429     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5430   %}
 5431   ins_pipe( pipe_slow );
 5432 %}
 5433 #endif
 5434 
 5435 // =======================Short Reduction==========================================
 5436 
 5437 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5438   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5439   match(Set dst (AddReductionVI src1 src2));
 5440   match(Set dst (MulReductionVI src1 src2));
 5441   match(Set dst (AndReductionV  src1 src2));
 5442   match(Set dst ( OrReductionV  src1 src2));
 5443   match(Set dst (XorReductionV  src1 src2));
 5444   match(Set dst (MinReductionV  src1 src2));
 5445   match(Set dst (MaxReductionV  src1 src2));
 5446   effect(TEMP vtmp1, TEMP vtmp2);
 5447   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5448   ins_encode %{
 5449     int opcode = this->ideal_Opcode();
 5450     int vlen = Matcher::vector_length(this, $src2);
 5451     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5452   %}
 5453   ins_pipe( pipe_slow );
 5454 %}
 5455 
 5456 // =======================Mul Reduction==========================================
 5457 
 5458 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5459   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5460             Matcher::vector_length(n->in(2)) <= 32); // src2
 5461   match(Set dst (MulReductionVI src1 src2));
 5462   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5463   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5464   ins_encode %{
 5465     int opcode = this->ideal_Opcode();
 5466     int vlen = Matcher::vector_length(this, $src2);
 5467     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5468   %}
 5469   ins_pipe( pipe_slow );
 5470 %}
 5471 
 5472 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5473   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5474             Matcher::vector_length(n->in(2)) == 64); // src2
 5475   match(Set dst (MulReductionVI src1 src2));
 5476   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5477   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5478   ins_encode %{
 5479     int opcode = this->ideal_Opcode();
 5480     int vlen = Matcher::vector_length(this, $src2);
 5481     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5482   %}
 5483   ins_pipe( pipe_slow );
 5484 %}
 5485 
 5486 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
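// The scalar input src1 must be the reduction identity (+Inf for min, -Inf
// for max, enforced by the predicate), so only src2 needs to be reduced and
// src1 can be ignored; the _av variants below accumulate into dst instead.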
 5488 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5489                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5490   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5491             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5492              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5493             Matcher::vector_length(n->in(2)) == 2);
 5494   match(Set dst (MinReductionV src1 src2));
 5495   match(Set dst (MaxReductionV src1 src2));
 5496   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5497   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5498   ins_encode %{
 5499     assert(UseAVX > 0, "sanity");
 5500 
 5501     int opcode = this->ideal_Opcode();
 5502     int vlen = Matcher::vector_length(this, $src2);
 5503     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5504                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5505   %}
 5506   ins_pipe( pipe_slow );
 5507 %}
 5508 
 5509 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5510                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5511   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5512             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5513              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5514             Matcher::vector_length(n->in(2)) >= 4);
 5515   match(Set dst (MinReductionV src1 src2));
 5516   match(Set dst (MaxReductionV src1 src2));
 5517   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5518   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5519   ins_encode %{
 5520     assert(UseAVX > 0, "sanity");
 5521 
 5522     int opcode = this->ideal_Opcode();
 5523     int vlen = Matcher::vector_length(this, $src2);
 5524     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5525                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5526   %}
 5527   ins_pipe( pipe_slow );
 5528 %}
 5529 
 5530 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5531                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5532   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5533             Matcher::vector_length(n->in(2)) == 2);
 5534   match(Set dst (MinReductionV dst src));
 5535   match(Set dst (MaxReductionV dst src));
 5536   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5537   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5538   ins_encode %{
 5539     assert(UseAVX > 0, "sanity");
 5540 
 5541     int opcode = this->ideal_Opcode();
 5542     int vlen = Matcher::vector_length(this, $src);
 5543     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5544                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5545   %}
 5546   ins_pipe( pipe_slow );
 5547 %}
 5548 
instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5551                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5552   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5553             Matcher::vector_length(n->in(2)) >= 4);
 5554   match(Set dst (MinReductionV dst src));
 5555   match(Set dst (MaxReductionV dst src));
 5556   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5557   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5558   ins_encode %{
 5559     assert(UseAVX > 0, "sanity");
 5560 
 5561     int opcode = this->ideal_Opcode();
 5562     int vlen = Matcher::vector_length(this, $src);
 5563     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5564                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5565   %}
 5566   ins_pipe( pipe_slow );
 5567 %}
 5568 
//--------------------Min/Max Double Reduction --------------------
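// Same scheme as the float min/max reductions above: src1 must be the
// identity, and the _av variants accumulate into dst.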
 5571 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5572                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5573                             rFlagsReg cr) %{
 5574   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5575             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5576              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5577             Matcher::vector_length(n->in(2)) == 2);
 5578   match(Set dst (MinReductionV src1 src2));
 5579   match(Set dst (MaxReductionV src1 src2));
 5580   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5581   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5582   ins_encode %{
 5583     assert(UseAVX > 0, "sanity");
 5584 
 5585     int opcode = this->ideal_Opcode();
 5586     int vlen = Matcher::vector_length(this, $src2);
 5587     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5588                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5589   %}
 5590   ins_pipe( pipe_slow );
 5591 %}
 5592 
 5593 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5594                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5595                            rFlagsReg cr) %{
 5596   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5597             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5598              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5599             Matcher::vector_length(n->in(2)) >= 4);
 5600   match(Set dst (MinReductionV src1 src2));
 5601   match(Set dst (MaxReductionV src1 src2));
 5602   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5603   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5604   ins_encode %{
 5605     assert(UseAVX > 0, "sanity");
 5606 
 5607     int opcode = this->ideal_Opcode();
 5608     int vlen = Matcher::vector_length(this, $src2);
 5609     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5610                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5611   %}
 5612   ins_pipe( pipe_slow );
 5613 %}
 5614 
instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5617                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5618                                rFlagsReg cr) %{
 5619   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5620             Matcher::vector_length(n->in(2)) == 2);
 5621   match(Set dst (MinReductionV dst src));
 5622   match(Set dst (MaxReductionV dst src));
 5623   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5624   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5625   ins_encode %{
 5626     assert(UseAVX > 0, "sanity");
 5627 
 5628     int opcode = this->ideal_Opcode();
 5629     int vlen = Matcher::vector_length(this, $src);
 5630     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5631                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5632   %}
 5633   ins_pipe( pipe_slow );
 5634 %}
 5635 
 5636 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5637                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5638                               rFlagsReg cr) %{
 5639   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5640             Matcher::vector_length(n->in(2)) >= 4);
 5641   match(Set dst (MinReductionV dst src));
 5642   match(Set dst (MaxReductionV dst src));
 5643   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5644   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5645   ins_encode %{
 5646     assert(UseAVX > 0, "sanity");
 5647 
 5648     int opcode = this->ideal_Opcode();
 5649     int vlen = Matcher::vector_length(this, $src);
 5650     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5651                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5652   %}
 5653   ins_pipe( pipe_slow );
 5654 %}
 5655 
 5656 // ====================VECTOR ARITHMETIC=======================================
 5657 
 5658 // --------------------------------- ADD --------------------------------------
 5659 
 5660 // Bytes vector add
 5661 instruct vaddB(vec dst, vec src) %{
 5662   predicate(UseAVX == 0);
 5663   match(Set dst (AddVB dst src));
 5664   format %{ "paddb   $dst,$src\t! add packedB" %}
 5665   ins_encode %{
 5666     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5667   %}
 5668   ins_pipe( pipe_slow );
 5669 %}
 5670 
 5671 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5672   predicate(UseAVX > 0);
 5673   match(Set dst (AddVB src1 src2));
 5674   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5675   ins_encode %{
 5676     int vlen_enc = vector_length_encoding(this);
 5677     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5678   %}
 5679   ins_pipe( pipe_slow );
 5680 %}
 5681 
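// Memory-operand forms are used only for vectors wider than 8 bytes: the
// load reads the instruction's full vector width, so an 8-byte operand could
// otherwise be over-read (the same restriction applies to all the _mem rules
// in this section).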
 5682 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5683   predicate((UseAVX > 0) &&
 5684             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5685   match(Set dst (AddVB src (LoadVector mem)));
 5686   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5687   ins_encode %{
 5688     int vlen_enc = vector_length_encoding(this);
 5689     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5690   %}
 5691   ins_pipe( pipe_slow );
 5692 %}
 5693 
 5694 // Shorts/Chars vector add
 5695 instruct vaddS(vec dst, vec src) %{
 5696   predicate(UseAVX == 0);
 5697   match(Set dst (AddVS dst src));
 5698   format %{ "paddw   $dst,$src\t! add packedS" %}
 5699   ins_encode %{
 5700     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5701   %}
 5702   ins_pipe( pipe_slow );
 5703 %}
 5704 
 5705 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5706   predicate(UseAVX > 0);
 5707   match(Set dst (AddVS src1 src2));
 5708   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5709   ins_encode %{
 5710     int vlen_enc = vector_length_encoding(this);
 5711     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5712   %}
 5713   ins_pipe( pipe_slow );
 5714 %}
 5715 
 5716 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5717   predicate((UseAVX > 0) &&
 5718             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5719   match(Set dst (AddVS src (LoadVector mem)));
 5720   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5721   ins_encode %{
 5722     int vlen_enc = vector_length_encoding(this);
 5723     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5724   %}
 5725   ins_pipe( pipe_slow );
 5726 %}
 5727 
 5728 // Integers vector add
 5729 instruct vaddI(vec dst, vec src) %{
 5730   predicate(UseAVX == 0);
 5731   match(Set dst (AddVI dst src));
 5732   format %{ "paddd   $dst,$src\t! add packedI" %}
 5733   ins_encode %{
 5734     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5735   %}
 5736   ins_pipe( pipe_slow );
 5737 %}
 5738 
 5739 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5740   predicate(UseAVX > 0);
 5741   match(Set dst (AddVI src1 src2));
 5742   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5743   ins_encode %{
 5744     int vlen_enc = vector_length_encoding(this);
 5745     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5746   %}
 5747   ins_pipe( pipe_slow );
 5748 %}
 5749 
instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5752   predicate((UseAVX > 0) &&
 5753             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5754   match(Set dst (AddVI src (LoadVector mem)));
 5755   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5756   ins_encode %{
 5757     int vlen_enc = vector_length_encoding(this);
 5758     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5759   %}
 5760   ins_pipe( pipe_slow );
 5761 %}
 5762 
 5763 // Longs vector add
 5764 instruct vaddL(vec dst, vec src) %{
 5765   predicate(UseAVX == 0);
 5766   match(Set dst (AddVL dst src));
 5767   format %{ "paddq   $dst,$src\t! add packedL" %}
 5768   ins_encode %{
 5769     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5770   %}
 5771   ins_pipe( pipe_slow );
 5772 %}
 5773 
 5774 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5775   predicate(UseAVX > 0);
 5776   match(Set dst (AddVL src1 src2));
 5777   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5778   ins_encode %{
 5779     int vlen_enc = vector_length_encoding(this);
 5780     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5781   %}
 5782   ins_pipe( pipe_slow );
 5783 %}
 5784 
 5785 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5786   predicate((UseAVX > 0) &&
 5787             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5788   match(Set dst (AddVL src (LoadVector mem)));
 5789   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5790   ins_encode %{
 5791     int vlen_enc = vector_length_encoding(this);
 5792     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5793   %}
 5794   ins_pipe( pipe_slow );
 5795 %}
 5796 
 5797 // Floats vector add
 5798 instruct vaddF(vec dst, vec src) %{
 5799   predicate(UseAVX == 0);
 5800   match(Set dst (AddVF dst src));
 5801   format %{ "addps   $dst,$src\t! add packedF" %}
 5802   ins_encode %{
 5803     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5804   %}
 5805   ins_pipe( pipe_slow );
 5806 %}
 5807 
 5808 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5809   predicate(UseAVX > 0);
 5810   match(Set dst (AddVF src1 src2));
 5811   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5812   ins_encode %{
 5813     int vlen_enc = vector_length_encoding(this);
 5814     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5815   %}
 5816   ins_pipe( pipe_slow );
 5817 %}
 5818 
 5819 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5820   predicate((UseAVX > 0) &&
 5821             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5822   match(Set dst (AddVF src (LoadVector mem)));
 5823   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5824   ins_encode %{
 5825     int vlen_enc = vector_length_encoding(this);
 5826     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5827   %}
 5828   ins_pipe( pipe_slow );
 5829 %}
 5830 
 5831 // Doubles vector add
 5832 instruct vaddD(vec dst, vec src) %{
 5833   predicate(UseAVX == 0);
 5834   match(Set dst (AddVD dst src));
 5835   format %{ "addpd   $dst,$src\t! add packedD" %}
 5836   ins_encode %{
 5837     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5838   %}
 5839   ins_pipe( pipe_slow );
 5840 %}
 5841 
 5842 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5843   predicate(UseAVX > 0);
 5844   match(Set dst (AddVD src1 src2));
 5845   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5846   ins_encode %{
 5847     int vlen_enc = vector_length_encoding(this);
 5848     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5849   %}
 5850   ins_pipe( pipe_slow );
 5851 %}
 5852 
 5853 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5854   predicate((UseAVX > 0) &&
 5855             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5856   match(Set dst (AddVD src (LoadVector mem)));
 5857   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5858   ins_encode %{
 5859     int vlen_enc = vector_length_encoding(this);
 5860     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5861   %}
 5862   ins_pipe( pipe_slow );
 5863 %}
 5864 
 5865 // --------------------------------- SUB --------------------------------------
 5866 
 5867 // Bytes vector sub
 5868 instruct vsubB(vec dst, vec src) %{
 5869   predicate(UseAVX == 0);
 5870   match(Set dst (SubVB dst src));
 5871   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5872   ins_encode %{
 5873     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5874   %}
 5875   ins_pipe( pipe_slow );
 5876 %}
 5877 
 5878 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5879   predicate(UseAVX > 0);
 5880   match(Set dst (SubVB src1 src2));
 5881   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5882   ins_encode %{
 5883     int vlen_enc = vector_length_encoding(this);
 5884     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5885   %}
 5886   ins_pipe( pipe_slow );
 5887 %}
 5888 
 5889 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5890   predicate((UseAVX > 0) &&
 5891             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5892   match(Set dst (SubVB src (LoadVector mem)));
 5893   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5894   ins_encode %{
 5895     int vlen_enc = vector_length_encoding(this);
 5896     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5897   %}
 5898   ins_pipe( pipe_slow );
 5899 %}
 5900 
 5901 // Shorts/Chars vector sub
 5902 instruct vsubS(vec dst, vec src) %{
 5903   predicate(UseAVX == 0);
 5904   match(Set dst (SubVS dst src));
 5905   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5906   ins_encode %{
 5907     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5908   %}
 5909   ins_pipe( pipe_slow );
 5910 %}
 5911 
instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5914   predicate(UseAVX > 0);
 5915   match(Set dst (SubVS src1 src2));
 5916   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5917   ins_encode %{
 5918     int vlen_enc = vector_length_encoding(this);
 5919     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5920   %}
 5921   ins_pipe( pipe_slow );
 5922 %}
 5923 
 5924 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5925   predicate((UseAVX > 0) &&
 5926             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5927   match(Set dst (SubVS src (LoadVector mem)));
 5928   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5929   ins_encode %{
 5930     int vlen_enc = vector_length_encoding(this);
 5931     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5932   %}
 5933   ins_pipe( pipe_slow );
 5934 %}
 5935 
 5936 // Integers vector sub
 5937 instruct vsubI(vec dst, vec src) %{
 5938   predicate(UseAVX == 0);
 5939   match(Set dst (SubVI dst src));
 5940   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5941   ins_encode %{
 5942     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5943   %}
 5944   ins_pipe( pipe_slow );
 5945 %}
 5946 
 5947 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5948   predicate(UseAVX > 0);
 5949   match(Set dst (SubVI src1 src2));
 5950   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5951   ins_encode %{
 5952     int vlen_enc = vector_length_encoding(this);
 5953     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5954   %}
 5955   ins_pipe( pipe_slow );
 5956 %}
 5957 
 5958 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5959   predicate((UseAVX > 0) &&
 5960             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5961   match(Set dst (SubVI src (LoadVector mem)));
 5962   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5963   ins_encode %{
 5964     int vlen_enc = vector_length_encoding(this);
 5965     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5966   %}
 5967   ins_pipe( pipe_slow );
 5968 %}
 5969 
 5970 // Longs vector sub
 5971 instruct vsubL(vec dst, vec src) %{
 5972   predicate(UseAVX == 0);
 5973   match(Set dst (SubVL dst src));
 5974   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5975   ins_encode %{
 5976     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5977   %}
 5978   ins_pipe( pipe_slow );
 5979 %}
 5980 
 5981 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5982   predicate(UseAVX > 0);
 5983   match(Set dst (SubVL src1 src2));
 5984   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5985   ins_encode %{
 5986     int vlen_enc = vector_length_encoding(this);
 5987     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5988   %}
 5989   ins_pipe( pipe_slow );
 5990 %}
 5991 
instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5994   predicate((UseAVX > 0) &&
 5995             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5996   match(Set dst (SubVL src (LoadVector mem)));
 5997   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5998   ins_encode %{
 5999     int vlen_enc = vector_length_encoding(this);
 6000     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6001   %}
 6002   ins_pipe( pipe_slow );
 6003 %}
 6004 
 6005 // Floats vector sub
 6006 instruct vsubF(vec dst, vec src) %{
 6007   predicate(UseAVX == 0);
 6008   match(Set dst (SubVF dst src));
 6009   format %{ "subps   $dst,$src\t! sub packedF" %}
 6010   ins_encode %{
 6011     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 6012   %}
 6013   ins_pipe( pipe_slow );
 6014 %}
 6015 
 6016 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 6017   predicate(UseAVX > 0);
 6018   match(Set dst (SubVF src1 src2));
 6019   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 6020   ins_encode %{
 6021     int vlen_enc = vector_length_encoding(this);
 6022     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6023   %}
 6024   ins_pipe( pipe_slow );
 6025 %}
 6026 
 6027 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 6028   predicate((UseAVX > 0) &&
 6029             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6030   match(Set dst (SubVF src (LoadVector mem)));
 6031   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 6032   ins_encode %{
 6033     int vlen_enc = vector_length_encoding(this);
 6034     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6035   %}
 6036   ins_pipe( pipe_slow );
 6037 %}
 6038 
 6039 // Doubles vector sub
 6040 instruct vsubD(vec dst, vec src) %{
 6041   predicate(UseAVX == 0);
 6042   match(Set dst (SubVD dst src));
 6043   format %{ "subpd   $dst,$src\t! sub packedD" %}
 6044   ins_encode %{
 6045     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 6046   %}
 6047   ins_pipe( pipe_slow );
 6048 %}
 6049 
 6050 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 6051   predicate(UseAVX > 0);
 6052   match(Set dst (SubVD src1 src2));
 6053   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 6054   ins_encode %{
 6055     int vlen_enc = vector_length_encoding(this);
 6056     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6057   %}
 6058   ins_pipe( pipe_slow );
 6059 %}
 6060 
 6061 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 6062   predicate((UseAVX > 0) &&
 6063             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6064   match(Set dst (SubVD src (LoadVector mem)));
 6065   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6066   ins_encode %{
 6067     int vlen_enc = vector_length_encoding(this);
 6068     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6069   %}
 6070   ins_pipe( pipe_slow );
 6071 %}
 6072 
 6073 // --------------------------------- MUL --------------------------------------
 6074 
 6075 // Byte vector mul
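// SSE/AVX has no 8-bit multiply instruction, so byte products are computed
// in 16-bit lanes and the low byte of each product is recombined.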
 6076 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6077   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6078   match(Set dst (MulVB src1 src2));
 6079   effect(TEMP dst, TEMP xtmp);
 6080   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6081   ins_encode %{
 6082     assert(UseSSE > 3, "required");
 6083     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6084     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6085     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6086     __ psllw($dst$$XMMRegister, 8);
 6087     __ psrlw($dst$$XMMRegister, 8);
 6088     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6089   %}
 6090   ins_pipe( pipe_slow );
 6091 %}
 6092 
 6093 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6094   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6095   match(Set dst (MulVB src1 src2));
 6096   effect(TEMP dst, TEMP xtmp);
 6097   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6098   ins_encode %{
 6099     assert(UseSSE > 3, "required");
 6100     // Odd-index elements
 6101     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6102     __ psrlw($dst$$XMMRegister, 8);
 6103     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6104     __ psrlw($xtmp$$XMMRegister, 8);
 6105     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6106     __ psllw($dst$$XMMRegister, 8);
 6107     // Even-index elements
 6108     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6109     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6110     __ psllw($xtmp$$XMMRegister, 8);
 6111     __ psrlw($xtmp$$XMMRegister, 8);
 6112     // Combine
 6113     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6114   %}
 6115   ins_pipe( pipe_slow );
 6116 %}
 6117 
 6118 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6119   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6120   match(Set dst (MulVB src1 src2));
 6121   effect(TEMP xtmp1, TEMP xtmp2);
 6122   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6123   ins_encode %{
 6124     int vlen_enc = vector_length_encoding(this);
 6125     // Odd-index elements
 6126     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6127     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6128     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6129     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6130     // Even-index elements
 6131     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6132     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6133     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6134     // Combine
 6135     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6136   %}
 6137   ins_pipe( pipe_slow );
 6138 %}
 6139 
 6140 // Shorts/Chars vector mul
 6141 instruct vmulS(vec dst, vec src) %{
 6142   predicate(UseAVX == 0);
 6143   match(Set dst (MulVS dst src));
 6144   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6145   ins_encode %{
 6146     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6147   %}
 6148   ins_pipe( pipe_slow );
 6149 %}
 6150 
 6151 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6152   predicate(UseAVX > 0);
 6153   match(Set dst (MulVS src1 src2));
 6154   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6155   ins_encode %{
 6156     int vlen_enc = vector_length_encoding(this);
 6157     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6158   %}
 6159   ins_pipe( pipe_slow );
 6160 %}
 6161 
 6162 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6163   predicate((UseAVX > 0) &&
 6164             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6165   match(Set dst (MulVS src (LoadVector mem)));
 6166   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6167   ins_encode %{
 6168     int vlen_enc = vector_length_encoding(this);
 6169     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6170   %}
 6171   ins_pipe( pipe_slow );
 6172 %}
 6173 
 6174 // Integers vector mul
 6175 instruct vmulI(vec dst, vec src) %{
 6176   predicate(UseAVX == 0);
 6177   match(Set dst (MulVI dst src));
 6178   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6179   ins_encode %{
 6180     assert(UseSSE > 3, "required");
 6181     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6182   %}
 6183   ins_pipe( pipe_slow );
 6184 %}
 6185 
 6186 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6187   predicate(UseAVX > 0);
 6188   match(Set dst (MulVI src1 src2));
 6189   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6190   ins_encode %{
 6191     int vlen_enc = vector_length_encoding(this);
 6192     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6193   %}
 6194   ins_pipe( pipe_slow );
 6195 %}
 6196 
 6197 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6198   predicate((UseAVX > 0) &&
 6199             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6200   match(Set dst (MulVI src (LoadVector mem)));
 6201   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6202   ins_encode %{
 6203     int vlen_enc = vector_length_encoding(this);
 6204     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6205   %}
 6206   ins_pipe( pipe_slow );
 6207 %}
 6208 
 6209 // Longs vector mul
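      // vpmullq (a true 64x64->64 bit element multiply) is an AVX-512DQ
      // instruction; vectors shorter than 512 bits additionally require
      // AVX-512VL. Without these features MulVL falls back to the 32-bit
      // piecewise expansion below.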
 6210 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6211   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6212              VM_Version::supports_avx512dq()) ||
 6213             VM_Version::supports_avx512vldq());
 6214   match(Set dst (MulVL src1 src2));
 6215   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6216   ins_encode %{
 6217     assert(UseAVX > 2, "required");
 6218     int vlen_enc = vector_length_encoding(this);
 6219     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6220   %}
 6221   ins_pipe( pipe_slow );
 6222 %}
 6223 
 6224 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6225   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6226              VM_Version::supports_avx512dq()) ||
 6227             (Matcher::vector_length_in_bytes(n) > 8 &&
 6228              VM_Version::supports_avx512vldq()));
 6229   match(Set dst (MulVL src (LoadVector mem)));
 6230   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6231   ins_encode %{
 6232     assert(UseAVX > 2, "required");
 6233     int vlen_enc = vector_length_encoding(this);
 6234     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6235   %}
 6236   ins_pipe( pipe_slow );
 6237 %}
 6238 
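      // Fallback: synthesize the 64x64->64 bit multiply from 32-bit halves.
      // Writing a = a_hi:a_lo and b = b_hi:b_lo,
      //   a*b mod 2^64 = ((a_hi*b_lo + a_lo*b_hi) << 32) + a_lo*b_lo
      // since the a_hi*b_hi term lies entirely above the low 64 bits. The
      // cross products come from a 32-bit multiply against a dword-swapped
      // operand (pshufd 0xB1); the lo-lo product comes from pmuludq.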
 6239 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6240   predicate(UseAVX == 0);
 6241   match(Set dst (MulVL src1 src2));
 6242   effect(TEMP dst, TEMP xtmp);
 6243   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6244   ins_encode %{
 6245     assert(VM_Version::supports_sse4_1(), "required");
 6246     // Get the lo-hi cross products; only their lower 32 bits are needed
 6247     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6248     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6249     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6250     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6251     __ psllq($dst$$XMMRegister, 32);
 6252     // Get the lo-lo products
 6253     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6254     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6255     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6256   %}
 6257   ins_pipe( pipe_slow );
 6258 %}
 6259 
 6260 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6261   predicate(UseAVX > 0 &&
 6262             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6263               !VM_Version::supports_avx512dq()) ||
 6264              (Matcher::vector_length_in_bytes(n) < 64 &&
 6265               !VM_Version::supports_avx512vldq())));
 6266   match(Set dst (MulVL src1 src2));
 6267   effect(TEMP xtmp1, TEMP xtmp2);
 6268   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6269   ins_encode %{
 6270     int vlen_enc = vector_length_encoding(this);
 6271     // Get the lo-hi cross products; only their lower 32 bits are needed
 6272     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6273     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6274     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6275     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6276     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6277     // Get the lo-lo products
 6278     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6279     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6280   %}
 6281   ins_pipe( pipe_slow );
 6282 %}
 6283 
 6284 // Floats vector mul
 6285 instruct vmulF(vec dst, vec src) %{
 6286   predicate(UseAVX == 0);
 6287   match(Set dst (MulVF dst src));
 6288   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6289   ins_encode %{
 6290     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6291   %}
 6292   ins_pipe( pipe_slow );
 6293 %}
 6294 
 6295 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6296   predicate(UseAVX > 0);
 6297   match(Set dst (MulVF src1 src2));
 6298   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6299   ins_encode %{
 6300     int vlen_enc = vector_length_encoding(this);
 6301     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6302   %}
 6303   ins_pipe( pipe_slow );
 6304 %}
 6305 
 6306 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6307   predicate((UseAVX > 0) &&
 6308             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6309   match(Set dst (MulVF src (LoadVector mem)));
 6310   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6311   ins_encode %{
 6312     int vlen_enc = vector_length_encoding(this);
 6313     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6314   %}
 6315   ins_pipe( pipe_slow );
 6316 %}
 6317 
 6318 // Doubles vector mul
 6319 instruct vmulD(vec dst, vec src) %{
 6320   predicate(UseAVX == 0);
 6321   match(Set dst (MulVD dst src));
 6322   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6323   ins_encode %{
 6324     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6325   %}
 6326   ins_pipe( pipe_slow );
 6327 %}
 6328 
 6329 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6330   predicate(UseAVX > 0);
 6331   match(Set dst (MulVD src1 src2));
 6332   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6333   ins_encode %{
 6334     int vlen_enc = vector_length_encoding(this);
 6335     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6336   %}
 6337   ins_pipe( pipe_slow );
 6338 %}
 6339 
 6340 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6341   predicate((UseAVX > 0) &&
 6342             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6343   match(Set dst (MulVD src (LoadVector mem)));
 6344   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6345   ins_encode %{
 6346     int vlen_enc = vector_length_encoding(this);
 6347     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6348   %}
 6349   ins_pipe( pipe_slow );
 6350 %}
 6351 
 6352 // --------------------------------- DIV --------------------------------------
 6353 
 6354 // Floats vector div
 6355 instruct vdivF(vec dst, vec src) %{
 6356   predicate(UseAVX == 0);
 6357   match(Set dst (DivVF dst src));
 6358   format %{ "divps   $dst,$src\t! div packedF" %}
 6359   ins_encode %{
 6360     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6361   %}
 6362   ins_pipe( pipe_slow );
 6363 %}
 6364 
 6365 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6366   predicate(UseAVX > 0);
 6367   match(Set dst (DivVF src1 src2));
 6368   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6369   ins_encode %{
 6370     int vlen_enc = vector_length_encoding(this);
 6371     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6372   %}
 6373   ins_pipe( pipe_slow );
 6374 %}
 6375 
 6376 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6377   predicate((UseAVX > 0) &&
 6378             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6379   match(Set dst (DivVF src (LoadVector mem)));
 6380   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6381   ins_encode %{
 6382     int vlen_enc = vector_length_encoding(this);
 6383     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6384   %}
 6385   ins_pipe( pipe_slow );
 6386 %}
 6387 
 6388 // Doubles vector div
 6389 instruct vdivD(vec dst, vec src) %{
 6390   predicate(UseAVX == 0);
 6391   match(Set dst (DivVD dst src));
 6392   format %{ "divpd   $dst,$src\t! div packedD" %}
 6393   ins_encode %{
 6394     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6395   %}
 6396   ins_pipe( pipe_slow );
 6397 %}
 6398 
 6399 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6400   predicate(UseAVX > 0);
 6401   match(Set dst (DivVD src1 src2));
 6402   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6403   ins_encode %{
 6404     int vlen_enc = vector_length_encoding(this);
 6405     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6406   %}
 6407   ins_pipe( pipe_slow );
 6408 %}
 6409 
 6410 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6411   predicate((UseAVX > 0) &&
 6412             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6413   match(Set dst (DivVD src (LoadVector mem)));
 6414   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6415   ins_encode %{
 6416     int vlen_enc = vector_length_encoding(this);
 6417     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6418   %}
 6419   ins_pipe( pipe_slow );
 6420 %}
 6421 
 6422 // ------------------------------ MinMax ---------------------------------------
 6423 
 6424 // Byte, Short, Int vector Min/Max
 6425 instruct minmax_reg_sse(vec dst, vec src) %{
 6426   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6427             UseAVX == 0);
 6428   match(Set dst (MinV dst src));
 6429   match(Set dst (MaxV dst src));
 6430   format %{ "vector_minmax  $dst,$src\t!  " %}
 6431   ins_encode %{
 6432     assert(UseSSE >= 4, "required");
 6433 
 6434     int opcode = this->ideal_Opcode();
 6435     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6436     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6437   %}
 6438   ins_pipe( pipe_slow );
 6439 %}
 6440 
 6441 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6442   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6443             UseAVX > 0);
 6444   match(Set dst (MinV src1 src2));
 6445   match(Set dst (MaxV src1 src2));
 6446   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6447   ins_encode %{
 6448     int opcode = this->ideal_Opcode();
 6449     int vlen_enc = vector_length_encoding(this);
 6450     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6451 
 6452     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6453   %}
 6454   ins_pipe( pipe_slow );
 6455 %}
 6456 
 6457 // Long vector Min/Max
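      // There is no packed 64-bit min/max instruction before AVX-512
      // (vpminsq/vpmaxsq), so the SSE helper selects lanes with a
      // compare-and-blend sequence; blendv takes its mask implicitly in
      // xmm0, hence the fixed rxmm0 temporary below.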
 6458 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6459   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6460             UseAVX == 0);
 6461   match(Set dst (MinV dst src));
 6462   match(Set dst (MaxV src dst));
 6463   effect(TEMP dst, TEMP tmp);
 6464   format %{ "vector_minmaxL  $dst,$src\t! using $tmp as TEMP" %}
 6465   ins_encode %{
 6466     assert(UseSSE >= 4, "required");
 6467 
 6468     int opcode = this->ideal_Opcode();
 6469     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6470     assert(elem_bt == T_LONG, "sanity");
 6471 
 6472     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6473   %}
 6474   ins_pipe( pipe_slow );
 6475 %}
 6476 
 6477 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6478   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6479             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6480   match(Set dst (MinV src1 src2));
 6481   match(Set dst (MaxV src1 src2));
 6482   effect(TEMP dst);
 6483   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6484   ins_encode %{
 6485     int vlen_enc = vector_length_encoding(this);
 6486     int opcode = this->ideal_Opcode();
 6487     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6488     assert(elem_bt == T_LONG, "sanity");
 6489 
 6490     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6491   %}
 6492   ins_pipe( pipe_slow );
 6493 %}
 6494 
 6495 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6496   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6497             Matcher::vector_element_basic_type(n) == T_LONG);
 6498   match(Set dst (MinV src1 src2));
 6499   match(Set dst (MaxV src1 src2));
 6500   format %{ "vector_minmaxL  $dst,$src1,$src2\t!" %}
 6501   ins_encode %{
 6502     assert(UseAVX > 2, "required");
 6503 
 6504     int vlen_enc = vector_length_encoding(this);
 6505     int opcode = this->ideal_Opcode();
 6506     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6507     assert(elem_bt == T_LONG, "sanity");
 6508 
 6509     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6510   %}
 6511   ins_pipe( pipe_slow );
 6512 %}
 6513 
 6514 // Float/Double vector Min/Max
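      // Java's Math.min/max float semantics differ from the raw x86
      // min/max instructions: a NaN in either input must produce NaN, and
      // -0.0 must order below +0.0, whereas minps/maxps simply return the
      // second operand for NaN inputs and treat the two zeros as equal.
      // The helpers below blend in this extra NaN and sign handling, which
      // is why they need temporaries (and a mask register in the EVEX case).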
 6515 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6516   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6517             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6518             UseAVX > 0);
 6519   match(Set dst (MinV a b));
 6520   match(Set dst (MaxV a b));
 6521   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6522   format %{ "vector_minmaxFP  $dst,$a,$b\t! using $tmp, $atmp, $btmp as TEMP" %}
 6523   ins_encode %{
 6524     assert(UseAVX > 0, "required");
 6525 
 6526     int opcode = this->ideal_Opcode();
 6527     int vlen_enc = vector_length_encoding(this);
 6528     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6529 
 6530     __ vminmax_fp(opcode, elem_bt,
 6531                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6532                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6533   %}
 6534   ins_pipe( pipe_slow );
 6535 %}
 6536 
 6537 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6538   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6539             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6540   match(Set dst (MinV a b));
 6541   match(Set dst (MaxV a b));
 6542   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6543   format %{ "vector_minmaxFP  $dst,$a,$b\t! using $atmp, $btmp as TEMP" %}
 6544   ins_encode %{
 6545     assert(UseAVX > 2, "required");
 6546 
 6547     int opcode = this->ideal_Opcode();
 6548     int vlen_enc = vector_length_encoding(this);
 6549     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6550 
 6551     __ evminmax_fp(opcode, elem_bt,
 6552                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6553                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6554   %}
 6555   ins_pipe( pipe_slow );
 6556 %}
 6557 
 6558 // ------------------------------ Unsigned vector Min/Max ----------------------
 6559 
 6560 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6561   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6562   match(Set dst (UMinV a b));
 6563   match(Set dst (UMaxV a b));
 6564   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6565   ins_encode %{
 6566     int opcode = this->ideal_Opcode();
 6567     int vlen_enc = vector_length_encoding(this);
 6568     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6569     assert(is_integral_type(elem_bt), "");
 6570     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6571   %}
 6572   ins_pipe( pipe_slow );
 6573 %}
 6574 
 6575 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6576   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6577   match(Set dst (UMinV a (LoadVector b)));
 6578   match(Set dst (UMaxV a (LoadVector b)));
 6579   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6580   ins_encode %{
 6581     int opcode = this->ideal_Opcode();
 6582     int vlen_enc = vector_length_encoding(this);
 6583     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6584     assert(is_integral_type(elem_bt), "");
 6585     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6586   %}
 6587   ins_pipe( pipe_slow );
 6588 %}
 6589 
 6590 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6591   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6592   match(Set dst (UMinV a b));
 6593   match(Set dst (UMaxV a b));
 6594   effect(TEMP xtmp1, TEMP xtmp2);
 6595   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1, $xtmp2 as TEMP" %}
 6596   ins_encode %{
 6597     int opcode = this->ideal_Opcode();
 6598     int vlen_enc = vector_length_encoding(this);
 6599     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6600   %}
 6601   ins_pipe( pipe_slow );
 6602 %}
 6603 
 6604 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6605   match(Set dst (UMinV (Binary dst src2) mask));
 6606   match(Set dst (UMaxV (Binary dst src2) mask));
 6607   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6608   ins_encode %{
 6609     int vlen_enc = vector_length_encoding(this);
 6610     BasicType bt = Matcher::vector_element_basic_type(this);
 6611     int opc = this->ideal_Opcode();
 6612     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6613                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6614   %}
 6615   ins_pipe( pipe_slow );
 6616 %}
 6617 
 6618 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6619   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6620   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6621   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6622   ins_encode %{
 6623     int vlen_enc = vector_length_encoding(this);
 6624     BasicType bt = Matcher::vector_element_basic_type(this);
 6625     int opc = this->ideal_Opcode();
 6626     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6627                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6628   %}
 6629   ins_pipe( pipe_slow );
 6630 %}
 6631 
 6632 // --------------------------------- Signum/CopySign ---------------------------
 6633 
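      // Math.signum semantics: return 1.0/-1.0 of the matching type for a
      // positive/negative argument, and return zero and NaN arguments
      // unchanged (preserving the sign of zero).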
 6634 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6635   match(Set dst (SignumF dst (Binary zero one)));
 6636   effect(KILL cr);
 6637   format %{ "signumF $dst, $dst" %}
 6638   ins_encode %{
 6639     int opcode = this->ideal_Opcode();
 6640     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6641   %}
 6642   ins_pipe( pipe_slow );
 6643 %}
 6644 
 6645 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6646   match(Set dst (SignumD dst (Binary zero one)));
 6647   effect(KILL cr);
 6648   format %{ "signumD $dst, $dst" %}
 6649   ins_encode %{
 6650     int opcode = this->ideal_Opcode();
 6651     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6652   %}
 6653   ins_pipe( pipe_slow );
 6654 %}
 6655 
 6656 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6657   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6658   match(Set dst (SignumVF src (Binary zero one)));
 6659   match(Set dst (SignumVD src (Binary zero one)));
 6660   effect(TEMP dst, TEMP xtmp1);
 6661   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6662   ins_encode %{
 6663     int opcode = this->ideal_Opcode();
 6664     int vec_enc = vector_length_encoding(this);
 6665     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6666                          $xtmp1$$XMMRegister, vec_enc);
 6667   %}
 6668   ins_pipe( pipe_slow );
 6669 %}
 6670 
 6671 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6672   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6673   match(Set dst (SignumVF src (Binary zero one)));
 6674   match(Set dst (SignumVD src (Binary zero one)));
 6675   effect(TEMP dst, TEMP ktmp1);
 6676   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6677   ins_encode %{
 6678     int opcode = this->ideal_Opcode();
 6679     int vec_enc = vector_length_encoding(this);
 6680     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6681                           $ktmp1$$KRegister, vec_enc);
 6682   %}
 6683   ins_pipe( pipe_slow );
 6684 %}
 6685 
 6686 // ---------------------------------------
 6687 // For copySign use 0xE4 as writemask for vpternlog
 6688 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6689 // C (xmm2) is set to 0x7FFFFFFF
 6690 // Wherever xmm2 is 0, we want to pick the bit from B (src, which carries the sign)
 6691 // Wherever xmm2 is 1, we want to pick the bit from A (dst, which carries the magnitude)
 6692 //
 6693 // A B C Result
 6694 // 0 0 0 0
 6695 // 0 0 1 0
 6696 // 0 1 0 1
 6697 // 0 1 1 0
 6698 // 1 0 0 0
 6699 // 1 0 1 1
 6700 // 1 1 0 1
 6701 // 1 1 1 1
 6702 //
 6703 // Result going from high bit to low bit is 0b11100100 = 0xE4
 6704 // ---------------------------------------
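      // (vpternlog{d,q} evaluates an arbitrary three-input boolean function:
      // at each bit position the triple (A,B,C) indexes into the 8-bit
      // immediate, so the immediate is just the truth table read as a byte.)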
 6705 
 6706 #ifdef _LP64
 6707 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6708   match(Set dst (CopySignF dst src));
 6709   effect(TEMP tmp1, TEMP tmp2);
 6710   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6711   ins_encode %{
 6712     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6713     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6714     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6715   %}
 6716   ins_pipe( pipe_slow );
 6717 %}
 6718 
 6719 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6720   match(Set dst (CopySignD dst (Binary src zero)));
 6721   ins_cost(100);
 6722   effect(TEMP tmp1, TEMP tmp2);
 6723   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6724   ins_encode %{
 6725     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6726     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6727     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6728   %}
 6729   ins_pipe( pipe_slow );
 6730 %}
 6731 
 6732 #endif // _LP64
 6733 
 6734 //----------------------------- CompressBits/ExpandBits ------------------------
 6735 
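      // CompressBits/ExpandBits map directly to the BMI2 pext/pdep
      // instructions: pext gathers the src bits selected by mask into the
      // contiguous low bits of dst, and pdep is the inverse, scattering the
      // low bits of src out to the mask positions. For example,
      //   pext(src=0b1000, mask=0b1010) = 0b10
      //   pdep(src=0b10,   mask=0b1010) = 0b1000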
 6736 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6737   predicate(n->bottom_type()->isa_int());
 6738   match(Set dst (CompressBits src mask));
 6739   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6740   ins_encode %{
 6741     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6742   %}
 6743   ins_pipe( pipe_slow );
 6744 %}
 6745 
 6746 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6747   predicate(n->bottom_type()->isa_int());
 6748   match(Set dst (ExpandBits src mask));
 6749   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6750   ins_encode %{
 6751     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6752   %}
 6753   ins_pipe( pipe_slow );
 6754 %}
 6755 
 6756 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6757   predicate(n->bottom_type()->isa_int());
 6758   match(Set dst (CompressBits src (LoadI mask)));
 6759   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6760   ins_encode %{
 6761     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6762   %}
 6763   ins_pipe( pipe_slow );
 6764 %}
 6765 
 6766 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6767   predicate(n->bottom_type()->isa_int());
 6768   match(Set dst (ExpandBits src (LoadI mask)));
 6769   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6770   ins_encode %{
 6771     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6772   %}
 6773   ins_pipe( pipe_slow );
 6774 %}
 6775 
 6776 // --------------------------------- Sqrt --------------------------------------
 6777 
 6778 instruct vsqrtF_reg(vec dst, vec src) %{
 6779   match(Set dst (SqrtVF src));
 6780   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6781   ins_encode %{
 6782     assert(UseAVX > 0, "required");
 6783     int vlen_enc = vector_length_encoding(this);
 6784     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6785   %}
 6786   ins_pipe( pipe_slow );
 6787 %}
 6788 
 6789 instruct vsqrtF_mem(vec dst, memory mem) %{
 6790   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6791   match(Set dst (SqrtVF (LoadVector mem)));
 6792   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6793   ins_encode %{
 6794     assert(UseAVX > 0, "required");
 6795     int vlen_enc = vector_length_encoding(this);
 6796     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6797   %}
 6798   ins_pipe( pipe_slow );
 6799 %}
 6800 
 6801 // Doubles vector sqrt
 6802 instruct vsqrtD_reg(vec dst, vec src) %{
 6803   match(Set dst (SqrtVD src));
 6804   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6805   ins_encode %{
 6806     assert(UseAVX > 0, "required");
 6807     int vlen_enc = vector_length_encoding(this);
 6808     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6809   %}
 6810   ins_pipe( pipe_slow );
 6811 %}
 6812 
 6813 instruct vsqrtD_mem(vec dst, memory mem) %{
 6814   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6815   match(Set dst (SqrtVD (LoadVector mem)));
 6816   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6817   ins_encode %{
 6818     assert(UseAVX > 0, "required");
 6819     int vlen_enc = vector_length_encoding(this);
 6820     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6821   %}
 6822   ins_pipe( pipe_slow );
 6823 %}
 6824 
 6825 // ------------------------------ Shift ---------------------------------------
 6826 
 6827 // Left and right shift count vectors are the same on x86
 6828 // (only lowest bits of xmm reg are used for count).
 6829 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6830   match(Set dst (LShiftCntV cnt));
 6831   match(Set dst (RShiftCntV cnt));
 6832   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6833   ins_encode %{
 6834     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6835   %}
 6836   ins_pipe( pipe_slow );
 6837 %}
 6838 
 6839 // Byte vector shift
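      // x86 SIMD has no byte-granular shift instructions. Byte shifts are
      // emulated by sign-/zero-extending the bytes to words, shifting the
      // words, masking each result back to its low byte, and re-packing
      // with packuswb.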
 6840 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6841   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6842   match(Set dst ( LShiftVB src shift));
 6843   match(Set dst ( RShiftVB src shift));
 6844   match(Set dst (URShiftVB src shift));
 6845   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6846   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6847   ins_encode %{
 6848     assert(UseSSE > 3, "required");
 6849     int opcode = this->ideal_Opcode();
 6850     bool sign = (opcode != Op_URShiftVB);
 6851     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6852     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6853     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6854     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6855     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6856   %}
 6857   ins_pipe( pipe_slow );
 6858 %}
 6859 
 6860 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6861   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6862             UseAVX <= 1);
 6863   match(Set dst ( LShiftVB src shift));
 6864   match(Set dst ( RShiftVB src shift));
 6865   match(Set dst (URShiftVB src shift));
 6866   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6867   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6868   ins_encode %{
 6869     assert(UseSSE > 3, "required");
 6870     int opcode = this->ideal_Opcode();
 6871     bool sign = (opcode != Op_URShiftVB);
 6872     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6873     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6874     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6875     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6876     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6877     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6878     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6879     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6880     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6881   %}
 6882   ins_pipe( pipe_slow );
 6883 %}
 6884 
 6885 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6886   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6887             UseAVX > 1);
 6888   match(Set dst ( LShiftVB src shift));
 6889   match(Set dst ( RShiftVB src shift));
 6890   match(Set dst (URShiftVB src shift));
 6891   effect(TEMP dst, TEMP tmp);
 6892   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6893   ins_encode %{
 6894     int opcode = this->ideal_Opcode();
 6895     bool sign = (opcode != Op_URShiftVB);
 6896     int vlen_enc = Assembler::AVX_256bit;
 6897     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6898     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6899     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6900     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6901     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6902   %}
 6903   ins_pipe( pipe_slow );
 6904 %}
 6905 
 6906 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6907   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6908   match(Set dst ( LShiftVB src shift));
 6909   match(Set dst ( RShiftVB src shift));
 6910   match(Set dst (URShiftVB src shift));
 6911   effect(TEMP dst, TEMP tmp);
 6912   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6913   ins_encode %{
 6914     assert(UseAVX > 1, "required");
 6915     int opcode = this->ideal_Opcode();
 6916     bool sign = (opcode != Op_URShiftVB);
 6917     int vlen_enc = Assembler::AVX_256bit;
 6918     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6919     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6920     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6921     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6922     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6923     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6924     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6925     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6926     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6927   %}
 6928   ins_pipe( pipe_slow );
 6929 %}
 6930 
 6931 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6932   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6933   match(Set dst ( LShiftVB src shift));
 6934   match(Set dst  (RShiftVB src shift));
 6935   match(Set dst (URShiftVB src shift));
 6936   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6937   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6938   ins_encode %{
 6939     assert(UseAVX > 2, "required");
 6940     int opcode = this->ideal_Opcode();
 6941     bool sign = (opcode != Op_URShiftVB);
 6942     int vlen_enc = Assembler::AVX_512bit;
 6943     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6944     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6945     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6946     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6947     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6948     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6949     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6950     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6951     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6952     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6953     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6954     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6955   %}
 6956   ins_pipe( pipe_slow );
 6957 %}
 6958 
 6959 // Shorts vector logical right shift produces an incorrect Java result
 6960 // for negative data, because Java code converts the short value to an int
 6961 // with sign extension before shifting. But char vectors are fine, since
 6962 // chars are unsigned values.
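      // For example, for short s = -1 (0xFFFF), Java computes (int)s >>> 2 =
      // 0x3FFFFFFF, whose low 16 bits are 0xFFFF, whereas a 16-bit lane-wise
      // logical shift would yield 0x3FFF.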
 6963 // Shorts/Chars vector shift
 6964 instruct vshiftS(vec dst, vec src, vec shift) %{
 6965   predicate(!n->as_ShiftV()->is_var_shift());
 6966   match(Set dst ( LShiftVS src shift));
 6967   match(Set dst ( RShiftVS src shift));
 6968   match(Set dst (URShiftVS src shift));
 6969   effect(TEMP dst, USE src, USE shift);
 6970   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6971   ins_encode %{
 6972     int opcode = this->ideal_Opcode();
 6973     if (UseAVX > 0) {
 6974       int vlen_enc = vector_length_encoding(this);
 6975       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6976     } else {
 6977       int vlen = Matcher::vector_length(this);
 6978       if (vlen == 2) {
 6979         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6980         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6981       } else if (vlen == 4) {
 6982         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6983         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6984       } else {
 6985         assert(vlen == 8, "sanity");
 6986         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6987         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6988       }
 6989     }
 6990   %}
 6991   ins_pipe( pipe_slow );
 6992 %}
 6993 
 6994 // Integers vector shift
 6995 instruct vshiftI(vec dst, vec src, vec shift) %{
 6996   predicate(!n->as_ShiftV()->is_var_shift());
 6997   match(Set dst ( LShiftVI src shift));
 6998   match(Set dst ( RShiftVI src shift));
 6999   match(Set dst (URShiftVI src shift));
 7000   effect(TEMP dst, USE src, USE shift);
 7001   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 7002   ins_encode %{
 7003     int opcode = this->ideal_Opcode();
 7004     if (UseAVX > 0) {
 7005       int vlen_enc = vector_length_encoding(this);
 7006       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7007     } else {
 7008       int vlen = Matcher::vector_length(this);
 7009       if (vlen == 2) {
 7010         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7011         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7012       } else {
 7013         assert(vlen == 4, "sanity");
 7014         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7015         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7016       }
 7017     }
 7018   %}
 7019   ins_pipe( pipe_slow );
 7020 %}
 7021 
 7022 // Integers vector constant shift
 7023 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 7024   match(Set dst (LShiftVI src (LShiftCntV shift)));
 7025   match(Set dst (RShiftVI src (RShiftCntV shift)));
 7026   match(Set dst (URShiftVI src (RShiftCntV shift)));
 7027   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 7028   ins_encode %{
 7029     int opcode = this->ideal_Opcode();
 7030     if (UseAVX > 0) {
 7031       int vector_len = vector_length_encoding(this);
 7032       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7033     } else {
 7034       int vlen = Matcher::vector_length(this);
 7035       if (vlen == 2) {
 7036         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7037         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7038       } else {
 7039         assert(vlen == 4, "sanity");
 7040         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7041         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7042       }
 7043     }
 7044   %}
 7045   ins_pipe( pipe_slow );
 7046 %}
 7047 
 7048 // Longs vector shift
 7049 instruct vshiftL(vec dst, vec src, vec shift) %{
 7050   predicate(!n->as_ShiftV()->is_var_shift());
 7051   match(Set dst ( LShiftVL src shift));
 7052   match(Set dst (URShiftVL src shift));
 7053   effect(TEMP dst, USE src, USE shift);
 7054   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 7055   ins_encode %{
 7056     int opcode = this->ideal_Opcode();
 7057     if (UseAVX > 0) {
 7058       int vlen_enc = vector_length_encoding(this);
 7059       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7060     } else {
 7061       assert(Matcher::vector_length(this) == 2, "sanity");
 7062       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7063       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7064     }
 7065   %}
 7066   ins_pipe( pipe_slow );
 7067 %}
 7068 
 7069 // Longs vector constant shift
 7070 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 7071   match(Set dst (LShiftVL src (LShiftCntV shift)));
 7072   match(Set dst (URShiftVL src (RShiftCntV shift)));
 7073   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 7074   ins_encode %{
 7075     int opcode = this->ideal_Opcode();
 7076     if (UseAVX > 0) {
 7077       int vector_len = vector_length_encoding(this);
 7078       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7079     } else {
 7080       assert(Matcher::vector_length(this) == 2, "sanity");
 7081       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7082       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7083     }
 7084   %}
 7085   ins_pipe( pipe_slow );
 7086 %}
 7087 
 7088 // -------------------ArithmeticRightShift -----------------------------------
 7089 // Long vector arithmetic right shift
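      // SSE2/AVX2 provide no 64-bit arithmetic right shift. It is emulated
      // with the identity  x >> s == ((x >>> s) ^ m) - m,  m = sign_mask >>> s:
      // for non-negative x the xor sets the bit at position 63-s and the
      // subtraction clears it again; for negative x the xor clears the copied
      // sign bit and the subtraction borrows through the upper bits, filling
      // them with ones, i.e. performing the sign extension.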
 7090 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 7091   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 7092   match(Set dst (RShiftVL src shift));
 7093   effect(TEMP dst, TEMP tmp);
 7094   format %{ "vshiftq $dst,$src,$shift" %}
 7095   ins_encode %{
 7096     uint vlen = Matcher::vector_length(this);
 7097     if (vlen == 2) {
 7098       assert(UseSSE >= 2, "required");
 7099       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7100       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7101       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7102       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7103       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7104       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7105     } else {
 7106       assert(vlen == 4, "sanity");
 7107       assert(UseAVX > 1, "required");
 7108       int vlen_enc = Assembler::AVX_256bit;
 7109       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7110       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7111       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7112       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7113       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7114     }
 7115   %}
 7116   ins_pipe( pipe_slow );
 7117 %}
 7118 
 7119 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7120   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7121   match(Set dst (RShiftVL src shift));
 7122   format %{ "vshiftq $dst,$src,$shift" %}
 7123   ins_encode %{
 7124     int vlen_enc = vector_length_encoding(this);
 7125     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7126   %}
 7127   ins_pipe( pipe_slow );
 7128 %}
 7129 
 7130 // ------------------- Variable Shift -----------------------------
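      // AVX2 has per-lane variable shifts only for dword and qword elements
      // (vpsllvd/vpsrlvd/vpsravd, vpsllvq/vpsrlvq); AVX-512BW adds the word
      // forms. Byte and (without BW) word variable shifts are therefore
      // emulated by widening the lanes, shifting, and packing back down.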
 7131 // Byte variable shift
 7132 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7133   predicate(Matcher::vector_length(n) <= 8 &&
 7134             n->as_ShiftV()->is_var_shift() &&
 7135             !VM_Version::supports_avx512bw());
 7136   match(Set dst ( LShiftVB src shift));
 7137   match(Set dst ( RShiftVB src shift));
 7138   match(Set dst (URShiftVB src shift));
 7139   effect(TEMP dst, TEMP vtmp);
 7140   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7141   ins_encode %{
 7142     assert(UseAVX >= 2, "required");
 7143 
 7144     int opcode = this->ideal_Opcode();
 7145     int vlen_enc = Assembler::AVX_128bit;
 7146     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7147     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7148   %}
 7149   ins_pipe( pipe_slow );
 7150 %}
 7151 
 7152 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7153   predicate(Matcher::vector_length(n) == 16 &&
 7154             n->as_ShiftV()->is_var_shift() &&
 7155             !VM_Version::supports_avx512bw());
 7156   match(Set dst ( LShiftVB src shift));
 7157   match(Set dst ( RShiftVB src shift));
 7158   match(Set dst (URShiftVB src shift));
 7159   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7160   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7161   ins_encode %{
 7162     assert(UseAVX >= 2, "required");
 7163 
 7164     int opcode = this->ideal_Opcode();
 7165     int vlen_enc = Assembler::AVX_128bit;
 7166     // Shift lower half and get word result in dst
 7167     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7168 
 7169     // Shift upper half and get word result in vtmp1
 7170     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7171     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7172     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7173 
 7174     // Merge and down convert the two word results to byte in dst
 7175     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7176   %}
 7177   ins_pipe( pipe_slow );
 7178 %}
 7179 
 7180 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7181   predicate(Matcher::vector_length(n) == 32 &&
 7182             n->as_ShiftV()->is_var_shift() &&
 7183             !VM_Version::supports_avx512bw());
 7184   match(Set dst ( LShiftVB src shift));
 7185   match(Set dst ( RShiftVB src shift));
 7186   match(Set dst (URShiftVB src shift));
 7187   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7188   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7189   ins_encode %{
 7190     assert(UseAVX >= 2, "required");
 7191 
 7192     int opcode = this->ideal_Opcode();
 7193     int vlen_enc = Assembler::AVX_128bit;
 7194     // Process lower 128 bits and get result in dst
 7195     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7196     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7197     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7198     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7199     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7200 
 7201     // Process higher 128 bits and get result in vtmp3
 7202     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7203     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7204     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7205     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7206     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7207     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7208     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7209 
 7210     // Merge the two results in dst
 7211     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7212   %}
 7213   ins_pipe( pipe_slow );
 7214 %}
 7215 
 7216 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7217   predicate(Matcher::vector_length(n) <= 32 &&
 7218             n->as_ShiftV()->is_var_shift() &&
 7219             VM_Version::supports_avx512bw());
 7220   match(Set dst ( LShiftVB src shift));
 7221   match(Set dst ( RShiftVB src shift));
 7222   match(Set dst (URShiftVB src shift));
 7223   effect(TEMP dst, TEMP vtmp);
 7224   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7225   ins_encode %{
 7226     assert(UseAVX > 2, "required");
 7227 
 7228     int opcode = this->ideal_Opcode();
 7229     int vlen_enc = vector_length_encoding(this);
 7230     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7231   %}
 7232   ins_pipe( pipe_slow );
 7233 %}
 7234 
 7235 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7236   predicate(Matcher::vector_length(n) == 64 &&
 7237             n->as_ShiftV()->is_var_shift() &&
 7238             VM_Version::supports_avx512bw());
 7239   match(Set dst ( LShiftVB src shift));
 7240   match(Set dst ( RShiftVB src shift));
 7241   match(Set dst (URShiftVB src shift));
 7242   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7243   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7244   ins_encode %{
 7245     assert(UseAVX > 2, "required");
 7246 
 7247     int opcode = this->ideal_Opcode();
 7248     int vlen_enc = Assembler::AVX_256bit;
 7249     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7250     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7251     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7252     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7253     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7254   %}
 7255   ins_pipe( pipe_slow );
 7256 %}
 7257 
 7258 // Short variable shift
 7259 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7260   predicate(Matcher::vector_length(n) <= 8 &&
 7261             n->as_ShiftV()->is_var_shift() &&
 7262             !VM_Version::supports_avx512bw());
 7263   match(Set dst ( LShiftVS src shift));
 7264   match(Set dst ( RShiftVS src shift));
 7265   match(Set dst (URShiftVS src shift));
 7266   effect(TEMP dst, TEMP vtmp);
 7267   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp as TEMP" %}
 7268   ins_encode %{
 7269     assert(UseAVX >= 2, "required");
 7270 
 7271     int opcode = this->ideal_Opcode();
 7272     bool sign = (opcode != Op_URShiftVS);
 7273     int vlen_enc = Assembler::AVX_256bit;
 7274     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7275     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7276     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7277     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7278     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7279     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7280   %}
 7281   ins_pipe( pipe_slow );
 7282 %}
 7283 
 7284 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7285   predicate(Matcher::vector_length(n) == 16 &&
 7286             n->as_ShiftV()->is_var_shift() &&
 7287             !VM_Version::supports_avx512bw());
 7288   match(Set dst ( LShiftVS src shift));
 7289   match(Set dst ( RShiftVS src shift));
 7290   match(Set dst (URShiftVS src shift));
 7291   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7292   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp1, $vtmp2 as TEMP" %}
 7293   ins_encode %{
 7294     assert(UseAVX >= 2, "required");
 7295 
 7296     int opcode = this->ideal_Opcode();
 7297     bool sign = (opcode != Op_URShiftVS);
 7298     int vlen_enc = Assembler::AVX_256bit;
 7299     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7300     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7301     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7302     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7303     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7304 
 7305     // Shift upper half, with result in dst using vtmp1 as TEMP
 7306     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7307     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7308     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7309     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7310     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7311     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7312 
 7313     // Merge lower and upper half result into dst
 7314     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7315     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7316   %}
 7317   ins_pipe( pipe_slow );
 7318 %}
 7319 
 7320 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7321   predicate(n->as_ShiftV()->is_var_shift() &&
 7322             VM_Version::supports_avx512bw());
 7323   match(Set dst ( LShiftVS src shift));
 7324   match(Set dst ( RShiftVS src shift));
 7325   match(Set dst (URShiftVS src shift));
 7326   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7327   ins_encode %{
 7328     assert(UseAVX > 2, "required");
 7329 
 7330     int opcode = this->ideal_Opcode();
 7331     int vlen_enc = vector_length_encoding(this);
 7332     if (!VM_Version::supports_avx512vl()) {
 7333       vlen_enc = Assembler::AVX_512bit;
 7334     }
 7335     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7336   %}
 7337   ins_pipe( pipe_slow );
 7338 %}
 7339 
 7340 // Integer variable shift
 7341 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7342   predicate(n->as_ShiftV()->is_var_shift());
 7343   match(Set dst ( LShiftVI src shift));
 7344   match(Set dst ( RShiftVI src shift));
 7345   match(Set dst (URShiftVI src shift));
 7346   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7347   ins_encode %{
 7348     assert(UseAVX >= 2, "required");
 7349 
 7350     int opcode = this->ideal_Opcode();
 7351     int vlen_enc = vector_length_encoding(this);
 7352     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7353   %}
 7354   ins_pipe( pipe_slow );
 7355 %}
 7356 
 7357 // Long variable shift
 7358 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7359   predicate(n->as_ShiftV()->is_var_shift());
 7360   match(Set dst ( LShiftVL src shift));
 7361   match(Set dst (URShiftVL src shift));
 7362   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7363   ins_encode %{
 7364     assert(UseAVX >= 2, "required");
 7365 
 7366     int opcode = this->ideal_Opcode();
 7367     int vlen_enc = vector_length_encoding(this);
 7368     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7369   %}
 7370   ins_pipe( pipe_slow );
 7371 %}
 7372 
 7373 // Long variable arithmetic right shift
 7374 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7375   predicate(Matcher::vector_length(n) <= 4 &&
 7376             n->as_ShiftV()->is_var_shift() &&
 7377             UseAVX == 2);
 7378   match(Set dst (RShiftVL src shift));
 7379   effect(TEMP dst, TEMP vtmp);
 7380   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7381   ins_encode %{
 7382     int opcode = this->ideal_Opcode();
 7383     int vlen_enc = vector_length_encoding(this);
 7384     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7385                  $vtmp$$XMMRegister);
 7386   %}
 7387   ins_pipe( pipe_slow );
 7388 %}
 7389 
 7390 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7391   predicate(n->as_ShiftV()->is_var_shift() &&
 7392             UseAVX > 2);
 7393   match(Set dst (RShiftVL src shift));
 7394   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7395   ins_encode %{
 7396     int opcode = this->ideal_Opcode();
 7397     int vlen_enc = vector_length_encoding(this);
 7398     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7399   %}
 7400   ins_pipe( pipe_slow );
 7401 %}
 7402 
 7403 // --------------------------------- AND --------------------------------------
 7404 
 7405 instruct vand(vec dst, vec src) %{
 7406   predicate(UseAVX == 0);
 7407   match(Set dst (AndV dst src));
 7408   format %{ "pand    $dst,$src\t! and vectors" %}
 7409   ins_encode %{
 7410     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7411   %}
 7412   ins_pipe( pipe_slow );
 7413 %}
 7414 
 7415 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7416   predicate(UseAVX > 0);
 7417   match(Set dst (AndV src1 src2));
 7418   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7419   ins_encode %{
 7420     int vlen_enc = vector_length_encoding(this);
 7421     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7422   %}
 7423   ins_pipe( pipe_slow );
 7424 %}
 7425 
 7426 instruct vand_mem(vec dst, vec src, memory mem) %{
 7427   predicate((UseAVX > 0) &&
 7428             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7429   match(Set dst (AndV src (LoadVector mem)));
 7430   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7431   ins_encode %{
 7432     int vlen_enc = vector_length_encoding(this);
 7433     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7434   %}
 7435   ins_pipe( pipe_slow );
 7436 %}
 7437 
 7438 // --------------------------------- OR ---------------------------------------
 7439 
 7440 instruct vor(vec dst, vec src) %{
 7441   predicate(UseAVX == 0);
 7442   match(Set dst (OrV dst src));
 7443   format %{ "por     $dst,$src\t! or vectors" %}
 7444   ins_encode %{
 7445     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7446   %}
 7447   ins_pipe( pipe_slow );
 7448 %}
 7449 
 7450 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7451   predicate(UseAVX > 0);
 7452   match(Set dst (OrV src1 src2));
 7453   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7454   ins_encode %{
 7455     int vlen_enc = vector_length_encoding(this);
 7456     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7457   %}
 7458   ins_pipe( pipe_slow );
 7459 %}
 7460 
 7461 instruct vor_mem(vec dst, vec src, memory mem) %{
 7462   predicate((UseAVX > 0) &&
 7463             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7464   match(Set dst (OrV src (LoadVector mem)));
 7465   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7466   ins_encode %{
 7467     int vlen_enc = vector_length_encoding(this);
 7468     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7469   %}
 7470   ins_pipe( pipe_slow );
 7471 %}
 7472 
 7473 // --------------------------------- XOR --------------------------------------
 7474 
 7475 instruct vxor(vec dst, vec src) %{
 7476   predicate(UseAVX == 0);
 7477   match(Set dst (XorV dst src));
 7478   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7479   ins_encode %{
 7480     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7481   %}
 7482   ins_pipe( pipe_slow );
 7483 %}
 7484 
 7485 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7486   predicate(UseAVX > 0);
 7487   match(Set dst (XorV src1 src2));
 7488   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7489   ins_encode %{
 7490     int vlen_enc = vector_length_encoding(this);
 7491     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7492   %}
 7493   ins_pipe( pipe_slow );
 7494 %}
 7495 
 7496 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7497   predicate((UseAVX > 0) &&
 7498             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7499   match(Set dst (XorV src (LoadVector mem)));
 7500   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7501   ins_encode %{
 7502     int vlen_enc = vector_length_encoding(this);
 7503     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7504   %}
 7505   ins_pipe( pipe_slow );
 7506 %}
 7507 
 7508 // --------------------------------- VectorCast --------------------------------------
 7509 
 7510 instruct vcastBtoX(vec dst, vec src) %{
 7511   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7512   match(Set dst (VectorCastB2X src));
 7513   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7514   ins_encode %{
 7515     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7516     int vlen_enc = vector_length_encoding(this);
 7517     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7518   %}
 7519   ins_pipe( pipe_slow );
 7520 %}
 7521 
 7522 instruct vcastBtoD(legVec dst, legVec src) %{
 7523   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7524   match(Set dst (VectorCastB2X src));
 7525   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7526   ins_encode %{
 7527     int vlen_enc = vector_length_encoding(this);
 7528     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7529   %}
 7530   ins_pipe( pipe_slow );
 7531 %}
 7532 
 7533 instruct castStoX(vec dst, vec src) %{
 7534   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7535             Matcher::vector_length(n->in(1)) <= 8 && // src
 7536             Matcher::vector_element_basic_type(n) == T_BYTE);
 7537   match(Set dst (VectorCastS2X src));
 7538   format %{ "vector_cast_s2x $dst,$src" %}
 7539   ins_encode %{
 7540     assert(UseAVX > 0, "required");
 7541 
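    // Truncating short->byte narrow: mask every 16-bit lane down to its low byte first so
    // the unsigned saturating pack (vpackuswb) can never clamp and acts as pure truncation;
    // packing the register with itself leaves the result in the low half.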
 7542     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7543     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7544   %}
 7545   ins_pipe( pipe_slow );
 7546 %}
 7547 
 7548 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7549   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7550             Matcher::vector_length(n->in(1)) == 16 && // src
 7551             Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorCastS2X src));
  effect(TEMP dst, TEMP vtmp);
 7554   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7555   ins_encode %{
 7556     assert(UseAVX > 0, "required");
 7557 
 7558     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7559     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7560     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7561     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7562   %}
 7563   ins_pipe( pipe_slow );
 7564 %}
 7565 
 7566 instruct vcastStoX_evex(vec dst, vec src) %{
 7567   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7568             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7569   match(Set dst (VectorCastS2X src));
 7570   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7571   ins_encode %{
 7572     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7573     int src_vlen_enc = vector_length_encoding(this, $src);
 7574     int vlen_enc = vector_length_encoding(this);
 7575     switch (to_elem_bt) {
 7576       case T_BYTE:
 7577         if (!VM_Version::supports_avx512vl()) {
 7578           vlen_enc = Assembler::AVX_512bit;
 7579         }
 7580         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7581         break;
 7582       case T_INT:
 7583         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7584         break;
 7585       case T_FLOAT:
 7586         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7587         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7588         break;
 7589       case T_LONG:
 7590         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7591         break;
 7592       case T_DOUBLE: {
 7593         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
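        // Two-step widen: sign-extend short->int at half the destination width, then
        // convert int->double at full width, since doubles take twice the bytes of ints.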
 7594         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7595         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7596         break;
 7597       }
 7598       default:
 7599         ShouldNotReachHere();
 7600     }
 7601   %}
 7602   ins_pipe( pipe_slow );
 7603 %}
 7604 
 7605 instruct castItoX(vec dst, vec src) %{
 7606   predicate(UseAVX <= 2 &&
 7607             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7608             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7609   match(Set dst (VectorCastI2X src));
 7610   format %{ "vector_cast_i2x $dst,$src" %}
 7611   ins_encode %{
 7612     assert(UseAVX > 0, "required");
 7613 
 7614     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7615     int vlen_enc = vector_length_encoding(this, $src);
 7616 
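    // Same mask-then-pack idiom as the short->byte case: zero the high bits of each int
    // lane so the saturating packs (vpackusdw/vpackuswb) degenerate to plain truncation.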
 7617     if (to_elem_bt == T_BYTE) {
 7618       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7619       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7620       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7621     } else {
 7622       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7623       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7624       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7625     }
 7626   %}
 7627   ins_pipe( pipe_slow );
 7628 %}
 7629 
 7630 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7631   predicate(UseAVX <= 2 &&
 7632             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7633             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7634   match(Set dst (VectorCastI2X src));
 7635   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7636   effect(TEMP dst, TEMP vtmp);
 7637   ins_encode %{
 7638     assert(UseAVX > 0, "required");
 7639 
 7640     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7641     int vlen_enc = vector_length_encoding(this, $src);
 7642 
 7643     if (to_elem_bt == T_BYTE) {
 7644       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7645       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7646       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7647       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7648     } else {
 7649       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7650       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7651       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7652       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7653     }
 7654   %}
 7655   ins_pipe( pipe_slow );
 7656 %}
 7657 
 7658 instruct vcastItoX_evex(vec dst, vec src) %{
 7659   predicate(UseAVX > 2 ||
 7660             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7661   match(Set dst (VectorCastI2X src));
 7662   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7663   ins_encode %{
 7664     assert(UseAVX > 0, "required");
 7665 
 7666     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7667     int src_vlen_enc = vector_length_encoding(this, $src);
 7668     int dst_vlen_enc = vector_length_encoding(this);
 7669     switch (dst_elem_bt) {
 7670       case T_BYTE:
 7671         if (!VM_Version::supports_avx512vl()) {
 7672           src_vlen_enc = Assembler::AVX_512bit;
 7673         }
 7674         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7675         break;
 7676       case T_SHORT:
 7677         if (!VM_Version::supports_avx512vl()) {
 7678           src_vlen_enc = Assembler::AVX_512bit;
 7679         }
 7680         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7681         break;
 7682       case T_FLOAT:
 7683         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7684         break;
 7685       case T_LONG:
 7686         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7687         break;
 7688       case T_DOUBLE:
 7689         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7690         break;
 7691       default:
 7692         ShouldNotReachHere();
 7693     }
 7694   %}
 7695   ins_pipe( pipe_slow );
 7696 %}
 7697 
 7698 instruct vcastLtoBS(vec dst, vec src) %{
 7699   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7700             UseAVX <= 2);
 7701   match(Set dst (VectorCastL2X src));
 7702   format %{ "vector_cast_l2x  $dst,$src" %}
 7703   ins_encode %{
 7704     assert(UseAVX > 0, "required");
 7705 
 7706     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7707     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7708     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7709                                                       : ExternalAddress(vector_int_to_short_mask());
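    // Gather the low dword of each long first (shuffle selector 8 = 0b00'00'10'00 picks
    // dwords {0,2}), then reuse the int mask-and-pack narrowing sequence. The 256-bit path
    // also needs vpermpd because vpermilps only shuffles within 128-bit lanes.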
 7710     if (vlen <= 16) {
 7711       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7712       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7713       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7714     } else {
 7715       assert(vlen <= 32, "required");
 7716       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7717       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7718       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7719       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7720     }
 7721     if (to_elem_bt == T_BYTE) {
 7722       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7723     }
 7724   %}
 7725   ins_pipe( pipe_slow );
 7726 %}
 7727 
 7728 instruct vcastLtoX_evex(vec dst, vec src) %{
 7729   predicate(UseAVX > 2 ||
 7730             (Matcher::vector_element_basic_type(n) == T_INT ||
 7731              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7732              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7733   match(Set dst (VectorCastL2X src));
 7734   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7735   ins_encode %{
 7736     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7737     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7738     int vlen_enc = vector_length_encoding(this, $src);
 7739     switch (to_elem_bt) {
 7740       case T_BYTE:
 7741         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7742           vlen_enc = Assembler::AVX_512bit;
 7743         }
 7744         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7745         break;
 7746       case T_SHORT:
 7747         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7748           vlen_enc = Assembler::AVX_512bit;
 7749         }
 7750         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7751         break;
 7752       case T_INT:
 7753         if (vlen == 8) {
 7754           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7755             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7756           }
 7757         } else if (vlen == 16) {
 7758           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7759         } else if (vlen == 32) {
 7760           if (UseAVX > 2) {
 7761             if (!VM_Version::supports_avx512vl()) {
 7762               vlen_enc = Assembler::AVX_512bit;
 7763             }
 7764             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7765           } else {
 7766             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7767             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7768           }
 7769         } else { // vlen == 64
 7770           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7771         }
 7772         break;
 7773       case T_FLOAT:
 7774         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7775         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7776         break;
 7777       case T_DOUBLE:
 7778         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7779         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7780         break;
 7781 
 7782       default: assert(false, "%s", type2name(to_elem_bt));
 7783     }
 7784   %}
 7785   ins_pipe( pipe_slow );
 7786 %}
 7787 
 7788 instruct vcastFtoD_reg(vec dst, vec src) %{
 7789   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7790   match(Set dst (VectorCastF2X src));
 7791   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7792   ins_encode %{
 7793     int vlen_enc = vector_length_encoding(this);
 7794     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7795   %}
 7796   ins_pipe( pipe_slow );
%}

 7800 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7801   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7802             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7803   match(Set dst (VectorCastF2X src));
 7804   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7805   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7806   ins_encode %{
 7807     int vlen_enc = vector_length_encoding(this, $src);
 7808     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register for loading addresses
    // wider than 32 bits in register-indirect addressing mode: stub constants live in the
    // code cache, and ReservedCodeCacheSize is currently capped at 2G, so their addresses
    // always fit. Targets are free to raise that limit, but a code cache larger than 2G is
    // unreasonable in practice. The upside of the cap is that we save a temporary register
    // allocation, which in the limiting case can prevent spilling in blocks with high
    // register pressure.
 7816     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7817                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7818                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7819   %}
 7820   ins_pipe( pipe_slow );
 7821 %}
 7822 
 7823 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7824   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7825             is_integral_type(Matcher::vector_element_basic_type(n)));
 7826   match(Set dst (VectorCastF2X src));
 7827   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7828   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7829   ins_encode %{
 7830     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7831     if (to_elem_bt == T_LONG) {
 7832       int vlen_enc = vector_length_encoding(this);
 7833       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7834                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7835                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7836     } else {
 7837       int vlen_enc = vector_length_encoding(this, $src);
 7838       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7839                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7840                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7841     }
 7842   %}
 7843   ins_pipe( pipe_slow );
 7844 %}
 7845 
 7846 instruct vcastDtoF_reg(vec dst, vec src) %{
 7847   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7848   match(Set dst (VectorCastD2X src));
 7849   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7850   ins_encode %{
 7851     int vlen_enc = vector_length_encoding(this, $src);
 7852     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7853   %}
 7854   ins_pipe( pipe_slow );
 7855 %}
 7856 
 7857 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7858   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7859             is_integral_type(Matcher::vector_element_basic_type(n)));
 7860   match(Set dst (VectorCastD2X src));
 7861   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7862   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7863   ins_encode %{
 7864     int vlen_enc = vector_length_encoding(this, $src);
 7865     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7866     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7867                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7868                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7869   %}
 7870   ins_pipe( pipe_slow );
 7871 %}
 7872 
 7873 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7874   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7875             is_integral_type(Matcher::vector_element_basic_type(n)));
 7876   match(Set dst (VectorCastD2X src));
 7877   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7878   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7879   ins_encode %{
 7880     int vlen_enc = vector_length_encoding(this, $src);
 7881     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7882     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7883                               ExternalAddress(vector_float_signflip());
 7884     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7885                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7886   %}
 7887   ins_pipe( pipe_slow );
 7888 %}
 7889 
 7890 instruct vucast(vec dst, vec src) %{
 7891   match(Set dst (VectorUCastB2X src));
 7892   match(Set dst (VectorUCastS2X src));
 7893   match(Set dst (VectorUCastI2X src));
 7894   format %{ "vector_ucast $dst,$src\t!" %}
 7895   ins_encode %{
 7896     assert(UseAVX > 0, "required");
 7897 
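    // Unsigned casts only widen, so they are plain zero extensions; the helper picks the
    // matching pmovzx form from the (from_elem_bt, to_elem_bt) pair.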
 7898     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7899     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7900     int vlen_enc = vector_length_encoding(this);
 7901     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7902   %}
 7903   ins_pipe( pipe_slow );
 7904 %}
 7905 
 7906 #ifdef _LP64
 7907 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7908   predicate(!VM_Version::supports_avx512vl() &&
 7909             Matcher::vector_length_in_bytes(n) < 64 &&
 7910             Matcher::vector_element_basic_type(n) == T_INT);
 7911   match(Set dst (RoundVF src));
 7912   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7913   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7914   ins_encode %{
 7915     int vlen_enc = vector_length_encoding(this);
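    // Scratch MXCSR value: 0x3F80 masks all FP exceptions and selects round-toward-negative-
    // infinity, so Math.round can be computed as floor(x + 0.5). The E-core variant (0x3FBF)
    // additionally pre-sets the sticky status flags, presumably to avoid a partial MXCSR
    // update penalty on those cores.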
 7916     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7917     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7918                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7919                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7920   %}
 7921   ins_pipe( pipe_slow );
 7922 %}
 7923 
 7924 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7925   predicate((VM_Version::supports_avx512vl() ||
 7926              Matcher::vector_length_in_bytes(n) == 64) &&
 7927              Matcher::vector_element_basic_type(n) == T_INT);
 7928   match(Set dst (RoundVF src));
 7929   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7930   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7931   ins_encode %{
 7932     int vlen_enc = vector_length_encoding(this);
 7933     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7934     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7935                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7936                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7937   %}
 7938   ins_pipe( pipe_slow );
 7939 %}
 7940 
 7941 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7942   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7943   match(Set dst (RoundVD src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7945   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7946   ins_encode %{
 7947     int vlen_enc = vector_length_encoding(this);
 7948     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7949     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7950                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7951                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7952   %}
 7953   ins_pipe( pipe_slow );
 7954 %}
 7955 
 7956 #endif // _LP64
 7957 
 7958 // --------------------------------- VectorMaskCmp --------------------------------------
 7959 
 7960 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7961   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7962             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7963             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7964             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7965   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7966   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7967   ins_encode %{
 7968     int vlen_enc = vector_length_encoding(this, $src1);
 7969     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7970     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7971       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7972     } else {
 7973       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7974     }
 7975   %}
 7976   ins_pipe( pipe_slow );
 7977 %}
 7978 
 7979 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7980   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7981             n->bottom_type()->isa_vectmask() == nullptr &&
 7982             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7983   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7984   effect(TEMP ktmp);
 7985   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7986   ins_encode %{
 7987     int vlen_enc = Assembler::AVX_512bit;
 7988     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7989     KRegister mask = k0; // The comparison itself is not being masked.
 7990     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7991       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7992       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7993     } else {
 7994       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7995       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7996     }
 7997   %}
 7998   ins_pipe( pipe_slow );
 7999 %}
 8000 
 8001 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 8002   predicate(n->bottom_type()->isa_vectmask() &&
 8003             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 8004   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8005   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8006   ins_encode %{
 8007     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8008     int vlen_enc = vector_length_encoding(this, $src1);
 8009     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 8010     KRegister mask = k0; // The comparison itself is not being masked.
 8011     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 8012       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8013     } else {
 8014       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8015     }
 8016   %}
 8017   ins_pipe( pipe_slow );
 8018 %}
 8019 
 8020 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 8021   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8022             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8023             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8024             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8025             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8026             (n->in(2)->get_int() == BoolTest::eq ||
 8027              n->in(2)->get_int() == BoolTest::lt ||
 8028              n->in(2)->get_int() == BoolTest::gt)); // cond
 8029   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8030   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 8031   ins_encode %{
 8032     int vlen_enc = vector_length_encoding(this, $src1);
 8033     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8034     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8035     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 8036   %}
 8037   ins_pipe( pipe_slow );
 8038 %}
 8039 
 8040 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8041   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8042             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8043             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8044             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8045             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8046             (n->in(2)->get_int() == BoolTest::ne ||
 8047              n->in(2)->get_int() == BoolTest::le ||
 8048              n->in(2)->get_int() == BoolTest::ge)); // cond
 8049   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8050   effect(TEMP dst, TEMP xtmp);
 8051   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8052   ins_encode %{
 8053     int vlen_enc = vector_length_encoding(this, $src1);
 8054     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8055     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8056     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8057   %}
 8058   ins_pipe( pipe_slow );
 8059 %}
 8060 
 8061 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8062   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8063             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8064             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8065             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8066             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8067   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8068   effect(TEMP dst, TEMP xtmp);
 8069   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8070   ins_encode %{
 8071     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 8072     int vlen_enc = vector_length_encoding(this, $src1);
 8073     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8074     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8075 
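    // There are no unsigned packed compares before AVX-512, so flip the sign bit of both
    // operands (XOR with a broadcast of 0x80... at the element width) and compare signed:
    // x <u y  iff  (x ^ MIN) <s (y ^ MIN).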
 8076     if (vlen_enc == Assembler::AVX_128bit) {
 8077       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8078     } else {
 8079       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8080     }
 8081     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8082     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8083     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8084   %}
 8085   ins_pipe( pipe_slow );
 8086 %}
 8087 
 8088 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8089   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8090              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8091              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8092   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8093   effect(TEMP ktmp);
 8094   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8095   ins_encode %{
 8096     assert(UseAVX > 2, "required");
 8097 
 8098     int vlen_enc = vector_length_encoding(this, $src1);
 8099     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8100     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8101     KRegister mask = k0; // The comparison itself is not being masked.
 8102     bool merge = false;
 8103     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8104 
 8105     switch (src1_elem_bt) {
 8106       case T_INT: {
 8107         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8108         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8109         break;
 8110       }
 8111       case T_LONG: {
 8112         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8113         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8114         break;
 8115       }
 8116       default: assert(false, "%s", type2name(src1_elem_bt));
 8117     }
 8118   %}
 8119   ins_pipe( pipe_slow );
%}

 8123 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8124   predicate(n->bottom_type()->isa_vectmask() &&
 8125             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8126   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8128   ins_encode %{
 8129     assert(UseAVX > 2, "required");
 8130     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8131 
 8132     int vlen_enc = vector_length_encoding(this, $src1);
 8133     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8134     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8135     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8136 
    // Compare the elements of src1 and src2 and set the corresponding bits of the dst mask.
 8138     switch (src1_elem_bt) {
 8139       case T_BYTE: {
 8140         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8141         break;
 8142       }
 8143       case T_SHORT: {
 8144         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8145         break;
 8146       }
 8147       case T_INT: {
 8148         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8149         break;
 8150       }
 8151       case T_LONG: {
 8152         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8153         break;
 8154       }
 8155       default: assert(false, "%s", type2name(src1_elem_bt));
 8156     }
 8157   %}
 8158   ins_pipe( pipe_slow );
 8159 %}
 8160 
 8161 // Extract
 8162 
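// For sources wider than 128 bits, extraction happens in two steps: get_lane copies the
// 128-bit lane holding the element into a temporary, then get_elem picks the element out
// of that lane.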
 8163 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8164   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8165   match(Set dst (ExtractI src idx));
 8166   match(Set dst (ExtractS src idx));
 8167 #ifdef _LP64
 8168   match(Set dst (ExtractB src idx));
 8169 #endif
 8170   format %{ "extractI $dst,$src,$idx\t!" %}
 8171   ins_encode %{
 8172     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8173 
 8174     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8175     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8176   %}
 8177   ins_pipe( pipe_slow );
 8178 %}
 8179 
 8180 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8181   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8182             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8183   match(Set dst (ExtractI src idx));
 8184   match(Set dst (ExtractS src idx));
 8185 #ifdef _LP64
 8186   match(Set dst (ExtractB src idx));
 8187 #endif
 8188   effect(TEMP vtmp);
 8189   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8190   ins_encode %{
 8191     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8192 
 8193     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8194     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8195     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8196   %}
 8197   ins_pipe( pipe_slow );
 8198 %}
 8199 
 8200 #ifdef _LP64
 8201 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8202   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8203   match(Set dst (ExtractL src idx));
 8204   format %{ "extractL $dst,$src,$idx\t!" %}
 8205   ins_encode %{
 8206     assert(UseSSE >= 4, "required");
 8207     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8208 
 8209     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8210   %}
 8211   ins_pipe( pipe_slow );
 8212 %}
 8213 
 8214 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8215   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8216             Matcher::vector_length(n->in(1)) == 8);  // src
 8217   match(Set dst (ExtractL src idx));
 8218   effect(TEMP vtmp);
 8219   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8220   ins_encode %{
 8221     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8222 
 8223     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8224     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8225   %}
 8226   ins_pipe( pipe_slow );
 8227 %}
 8228 #endif
 8229 
 8230 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8231   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8232   match(Set dst (ExtractF src idx));
 8233   effect(TEMP dst, TEMP vtmp);
 8234   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8235   ins_encode %{
 8236     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8237 
 8238     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8239   %}
 8240   ins_pipe( pipe_slow );
 8241 %}
 8242 
 8243 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length(n->in(1)) == 8 ||  // src
            Matcher::vector_length(n->in(1)) == 16);  // src
 8246   match(Set dst (ExtractF src idx));
 8247   effect(TEMP vtmp);
 8248   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8249   ins_encode %{
 8250     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8251 
 8252     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8253     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8254   %}
 8255   ins_pipe( pipe_slow );
 8256 %}
 8257 
 8258 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8259   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8260   match(Set dst (ExtractD src idx));
 8261   format %{ "extractD $dst,$src,$idx\t!" %}
 8262   ins_encode %{
 8263     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8264 
 8265     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8266   %}
 8267   ins_pipe( pipe_slow );
 8268 %}
 8269 
 8270 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8271   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8272             Matcher::vector_length(n->in(1)) == 8);  // src
 8273   match(Set dst (ExtractD src idx));
 8274   effect(TEMP vtmp);
 8275   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8276   ins_encode %{
 8277     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8278 
 8279     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8280     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8281   %}
 8282   ins_pipe( pipe_slow );
 8283 %}
 8284 
 8285 // --------------------------------- Vector Blend --------------------------------------
 8286 
 8287 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8288   predicate(UseAVX == 0);
 8289   match(Set dst (VectorBlend (Binary dst src) mask));
 8290   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8291   effect(TEMP tmp);
 8292   ins_encode %{
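    // SSE4.1 pblendvb reads its mask implicitly from xmm0, hence the fixed rxmm0 TEMP:
    // copy the mask there first unless it is already in xmm0.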
 8293     assert(UseSSE >= 4, "required");
 8294 
 8295     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8296       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8297     }
 8298     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8299   %}
 8300   ins_pipe( pipe_slow );
 8301 %}
 8302 
 8303 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8304   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8305             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8306             Matcher::vector_length_in_bytes(n) <= 32 &&
 8307             is_integral_type(Matcher::vector_element_basic_type(n)));
 8308   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8309   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8310   ins_encode %{
 8311     int vlen_enc = vector_length_encoding(this);
 8312     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8313   %}
 8314   ins_pipe( pipe_slow );
 8315 %}
 8316 
 8317 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8318   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8319             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8320             Matcher::vector_length_in_bytes(n) <= 32 &&
 8321             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8322   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8323   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8324   ins_encode %{
 8325     int vlen_enc = vector_length_encoding(this);
 8326     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8327   %}
 8328   ins_pipe( pipe_slow );
 8329 %}
 8330 
 8331 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8332   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8333             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8334             Matcher::vector_length_in_bytes(n) <= 32);
 8335   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8336   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8337   effect(TEMP vtmp, TEMP dst);
 8338   ins_encode %{
 8339     int vlen_enc = vector_length_encoding(this);
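    // vpblendvb is slow on E-cores, so synthesize the blend from boolean ops, relying on
    // each mask lane being all-ones or all-zeros:  dst = (mask & src2) | (~mask & src1).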
 8340     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8341     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8342     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8343   %}
 8344   ins_pipe( pipe_slow );
 8345 %}
 8346 
 8347 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8348   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8349             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8350   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8352   effect(TEMP ktmp);
 8353   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
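    // A 512-bit vector mask has no kReg form here, so materialize one first: compare the
    // mask lanes against all-ones into $ktmp, then do a merging kReg-predicated blend.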
 8356     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8357     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8358   %}
 8359   ins_pipe( pipe_slow );
%}

 8363 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8364   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8365             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8366              VM_Version::supports_avx512bw()));
 8367   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8369   ins_encode %{
 8370     int vlen_enc = vector_length_encoding(this);
 8371     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8372     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8373   %}
 8374   ins_pipe( pipe_slow );
 8375 %}
 8376 
 8377 // --------------------------------- ABS --------------------------------------
 8378 // a = |a|
 8379 instruct vabsB_reg(vec dst, vec src) %{
 8380   match(Set dst (AbsVB  src));
 8381   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8382   ins_encode %{
 8383     uint vlen = Matcher::vector_length(this);
 8384     if (vlen <= 16) {
 8385       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8386     } else {
 8387       int vlen_enc = vector_length_encoding(this);
 8388       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8389     }
 8390   %}
 8391   ins_pipe( pipe_slow );
 8392 %}
 8393 
 8394 instruct vabsS_reg(vec dst, vec src) %{
 8395   match(Set dst (AbsVS  src));
 8396   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8397   ins_encode %{
 8398     uint vlen = Matcher::vector_length(this);
 8399     if (vlen <= 8) {
 8400       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8401     } else {
 8402       int vlen_enc = vector_length_encoding(this);
 8403       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8404     }
 8405   %}
 8406   ins_pipe( pipe_slow );
 8407 %}
 8408 
 8409 instruct vabsI_reg(vec dst, vec src) %{
 8410   match(Set dst (AbsVI  src));
  format %{ "vabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8412   ins_encode %{
 8413     uint vlen = Matcher::vector_length(this);
 8414     if (vlen <= 4) {
 8415       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8416     } else {
 8417       int vlen_enc = vector_length_encoding(this);
 8418       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8419     }
 8420   %}
 8421   ins_pipe( pipe_slow );
 8422 %}
 8423 
 8424 instruct vabsL_reg(vec dst, vec src) %{
 8425   match(Set dst (AbsVL  src));
 8426   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8427   ins_encode %{
 8428     assert(UseAVX > 2, "required");
 8429     int vlen_enc = vector_length_encoding(this);
 8430     if (!VM_Version::supports_avx512vl()) {
 8431       vlen_enc = Assembler::AVX_512bit;
 8432     }
 8433     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8434   %}
 8435   ins_pipe( pipe_slow );
 8436 %}
 8437 
 8438 // --------------------------------- ABSNEG --------------------------------------
 8439 
 8440 instruct vabsnegF(vec dst, vec src) %{
 8441   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8442   match(Set dst (AbsVF src));
 8443   match(Set dst (NegVF src));
 8444   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8445   ins_cost(150);
 8446   ins_encode %{
 8447     int opcode = this->ideal_Opcode();
 8448     int vlen = Matcher::vector_length(this);
 8449     if (vlen == 2) {
 8450       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8451     } else {
 8452       assert(vlen == 8 || vlen == 16, "required");
 8453       int vlen_enc = vector_length_encoding(this);
 8454       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8455     }
 8456   %}
 8457   ins_pipe( pipe_slow );
 8458 %}
 8459 
 8460 instruct vabsneg4F(vec dst) %{
 8461   predicate(Matcher::vector_length(n) == 4);
 8462   match(Set dst (AbsVF dst));
 8463   match(Set dst (NegVF dst));
 8464   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8465   ins_cost(150);
 8466   ins_encode %{
 8467     int opcode = this->ideal_Opcode();
 8468     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8469   %}
 8470   ins_pipe( pipe_slow );
 8471 %}
 8472 
 8473 instruct vabsnegD(vec dst, vec src) %{
 8474   match(Set dst (AbsVD  src));
 8475   match(Set dst (NegVD  src));
 8476   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8477   ins_encode %{
 8478     int opcode = this->ideal_Opcode();
 8479     uint vlen = Matcher::vector_length(this);
 8480     if (vlen == 2) {
 8481       assert(UseSSE >= 2, "required");
 8482       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8483     } else {
 8484       int vlen_enc = vector_length_encoding(this);
 8485       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8486     }
 8487   %}
 8488   ins_pipe( pipe_slow );
 8489 %}
 8490 
 8491 //------------------------------------- VectorTest --------------------------------------------
 8492 
 8493 #ifdef _LP64
 8494 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8495   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8496   match(Set cr (VectorTest src1 src2));
 8497   effect(TEMP vtmp);
 8498   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8499   ins_encode %{
 8500     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8501     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8502     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8503   %}
 8504   ins_pipe( pipe_slow );
 8505 %}
 8506 
 8507 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8508   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8509   match(Set cr (VectorTest src1 src2));
 8510   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8511   ins_encode %{
 8512     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8513     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8514     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8515   %}
 8516   ins_pipe( pipe_slow );
 8517 %}
 8518 
 8519 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8520   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8521              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8522             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8523   match(Set cr (VectorTest src1 src2));
 8524   effect(TEMP tmp);
 8525   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8526   ins_encode %{
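    // All-true test for short masks: move the mask bits into a GPR, keep only the low
    // masklen bits, and compare against the all-ones pattern, so the flags read as
    // "every lane set" for the alltrue (BoolTest::overflow) user.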
 8527     uint masklen = Matcher::vector_length(this, $src1);
 8528     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8529     __ andl($tmp$$Register, (1 << masklen) - 1);
 8530     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8531   %}
 8532   ins_pipe( pipe_slow );
 8533 %}
 8534 
 8535 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8536   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8537              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8538             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8539   match(Set cr (VectorTest src1 src2));
 8540   effect(TEMP tmp);
 8541   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8542   ins_encode %{
 8543     uint masklen = Matcher::vector_length(this, $src1);
 8544     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8545     __ andl($tmp$$Register, (1 << masklen) - 1);
 8546   %}
 8547   ins_pipe( pipe_slow );
 8548 %}
 8549 
 8550 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8551   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8552             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8553   match(Set cr (VectorTest src1 src2));
 8554   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8555   ins_encode %{
 8556     uint masklen = Matcher::vector_length(this, $src1);
 8557     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8558   %}
 8559   ins_pipe( pipe_slow );
 8560 %}
 8561 #endif
 8562 
 8563 //------------------------------------- LoadMask --------------------------------------------
 8564 
 8565 instruct loadMask(legVec dst, legVec src) %{
 8566   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8567   match(Set dst (VectorLoadMask src));
 8568   effect(TEMP dst);
 8569   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8570   ins_encode %{
 8571     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8572     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8573     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8574   %}
 8575   ins_pipe( pipe_slow );
 8576 %}
 8577 
 8578 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8579   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8580   match(Set dst (VectorLoadMask src));
 8581   effect(TEMP xtmp);
 8582   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8583   ins_encode %{
 8584     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8585                         true, Assembler::AVX_512bit);
 8586   %}
 8587   ins_pipe( pipe_slow );
 8588 %}
 8589 
 8590 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8591   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8592   match(Set dst (VectorLoadMask src));
 8593   effect(TEMP xtmp);
 8594   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8595   ins_encode %{
 8596     int vlen_enc = vector_length_encoding(in(1));
 8597     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8598                         false, vlen_enc);
 8599   %}
 8600   ins_pipe( pipe_slow );
 8601 %}
 8602 
 8603 //------------------------------------- StoreMask --------------------------------------------
 8604 
 8605 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8606   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8607   match(Set dst (VectorStoreMask src size));
 8608   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8609   ins_encode %{
 8610     int vlen = Matcher::vector_length(this);
 8611     if (vlen <= 16 && UseAVX <= 2) {
 8612       assert(UseSSE >= 3, "required");
 8613       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8614     } else {
 8615       assert(UseAVX > 0, "required");
 8616       int src_vlen_enc = vector_length_encoding(this, $src);
 8617       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8618     }
 8619   %}
 8620   ins_pipe( pipe_slow );
 8621 %}
 8622 
 8623 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8624   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8625   match(Set dst (VectorStoreMask src size));
 8626   effect(TEMP_DEF dst, TEMP xtmp);
 8627   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8628   ins_encode %{
 8629     int vlen_enc = Assembler::AVX_128bit;
 8630     int vlen = Matcher::vector_length(this);
 8631     if (vlen <= 8) {
 8632       assert(UseSSE >= 3, "required");
 8633       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8634       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8635       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8636     } else {
 8637       assert(UseAVX > 0, "required");
 8638       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8639       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8640       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8641     }
 8642   %}
 8643   ins_pipe( pipe_slow );
 8644 %}
 8645 
 8646 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8647   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8648   match(Set dst (VectorStoreMask src size));
 8649   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8650   effect(TEMP_DEF dst, TEMP xtmp);
 8651   ins_encode %{
 8652     int vlen_enc = Assembler::AVX_128bit;
 8653     int vlen = Matcher::vector_length(this);
 8654     if (vlen <= 4) {
 8655       assert(UseSSE >= 3, "required");
 8656       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8657       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8658       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8659       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8660     } else {
 8661       assert(UseAVX > 0, "required");
 8662       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8663       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8664       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8665       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8666       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8667     }
 8668   %}
 8669   ins_pipe( pipe_slow );
 8670 %}
 8671 
 8672 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8673   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8674   match(Set dst (VectorStoreMask src size));
 8675   effect(TEMP_DEF dst, TEMP xtmp);
 8676   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8677   ins_encode %{
 8678     assert(UseSSE >= 3, "required");
 8679     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8680     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8681     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8682     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8683     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8684   %}
 8685   ins_pipe( pipe_slow );
 8686 %}
 8687 
 8688 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8689   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8690   match(Set dst (VectorStoreMask src size));
 8691   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8692   effect(TEMP_DEF dst, TEMP vtmp);
 8693   ins_encode %{
 8694     int vlen_enc = Assembler::AVX_128bit;
 8695     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8696     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8697     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8698     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8699     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8700     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8701     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8702   %}
 8703   ins_pipe( pipe_slow );
 8704 %}
 8705 
 8706 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8707   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8708   match(Set dst (VectorStoreMask src size));
 8709   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8710   ins_encode %{
 8711     int src_vlen_enc = vector_length_encoding(this, $src);
 8712     int dst_vlen_enc = vector_length_encoding(this);
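    // Without AVX512VL only the 512-bit form of the narrowing evpmov*
    // instructions is available, so widen the source encoding; narrowed
    // lanes beyond the logical vector length are dead.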
 8713     if (!VM_Version::supports_avx512vl()) {
 8714       src_vlen_enc = Assembler::AVX_512bit;
 8715     }
 8716     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8717     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8718   %}
 8719   ins_pipe( pipe_slow );
 8720 %}
 8721 
 8722 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8723   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8724   match(Set dst (VectorStoreMask src size));
 8725   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8726   ins_encode %{
 8727     int src_vlen_enc = vector_length_encoding(this, $src);
 8728     int dst_vlen_enc = vector_length_encoding(this);
 8729     if (!VM_Version::supports_avx512vl()) {
 8730       src_vlen_enc = Assembler::AVX_512bit;
 8731     }
 8732     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8733     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8734   %}
 8735   ins_pipe( pipe_slow );
 8736 %}
 8737 
 8738 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8739   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8740   match(Set dst (VectorStoreMask mask size));
 8741   effect(TEMP_DEF dst);
 8742   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8743   ins_encode %{
 8744     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8745     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8746                  false, Assembler::AVX_512bit, noreg);
 8747     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8748   %}
 8749   ins_pipe( pipe_slow );
 8750 %}
 8751 
 8752 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8753   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8754   match(Set dst (VectorStoreMask mask size));
 8755   effect(TEMP_DEF dst);
 8756   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8757   ins_encode %{
 8758     int dst_vlen_enc = vector_length_encoding(this);
 8759     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8760     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8761   %}
 8762   ins_pipe( pipe_slow );
 8763 %}
 8764 
 8765 instruct vmaskcast_evex(kReg dst) %{
 8766   match(Set dst (VectorMaskCast dst));
 8767   ins_cost(0);
 8768   format %{ "vector_mask_cast $dst" %}
 8769   ins_encode %{
 8770     // empty
 8771   %}
 8772   ins_pipe(empty);
 8773 %}
 8774 
 8775 instruct vmaskcast(vec dst) %{
 8776   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8777   match(Set dst (VectorMaskCast dst));
 8778   ins_cost(0);
 8779   format %{ "vector_mask_cast $dst" %}
 8780   ins_encode %{
 8781     // empty
 8782   %}
 8783   ins_pipe(empty);
 8784 %}
 8785 
 8786 instruct vmaskcast_avx(vec dst, vec src) %{
 8787   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8788   match(Set dst (VectorMaskCast src));
 8789   format %{ "vector_mask_cast $dst, $src" %}
 8790   ins_encode %{
 8791     int vlen = Matcher::vector_length(this);
 8792     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8793     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8794     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8795   %}
 8796   ins_pipe(pipe_slow);
 8797 %}
 8798 
 8799 //-------------------------------- Load Iota Indices ----------------------------------
 8800 
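// VectorLoadConst with a zero immediate materializes the iota sequence
// {0, 1, 2, ...} in the vector's element type from a constant table.
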
 8801 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8802   match(Set dst (VectorLoadConst src));
 8803   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8804   ins_encode %{
 8805      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8806      BasicType bt = Matcher::vector_element_basic_type(this);
 8807      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8808   %}
 8809   ins_pipe( pipe_slow );
 8810 %}
 8811 
 8812 #ifdef _LP64
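// PopulateIndex computes dst[i] = src1 + i * src2, with the stride src2
// restricted to one (see the asserts below): broadcast src1, load the iota
// sequence {0, 1, 2, ...} and add the two vectors.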
 8813 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8814   match(Set dst (PopulateIndex src1 src2));
 8815   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_populate_index $dst, $src1, $src2\t! using $vtmp as TEMP" %}
 8817   ins_encode %{
 8818      assert($src2$$constant == 1, "required");
 8819      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8820      int vlen_enc = vector_length_encoding(this);
 8821      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8822      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8823      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8824      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8825   %}
 8826   ins_pipe( pipe_slow );
 8827 %}
 8828 
 8829 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8830   match(Set dst (PopulateIndex src1 src2));
 8831   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_populate_index $dst, $src1, $src2\t! using $vtmp as TEMP" %}
 8833   ins_encode %{
 8834      assert($src2$$constant == 1, "required");
 8835      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8836      int vlen_enc = vector_length_encoding(this);
 8837      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8838      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8839      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8840      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8841   %}
 8842   ins_pipe( pipe_slow );
 8843 %}
 8844 #endif
 8845 //-------------------------------- Rearrange ----------------------------------
 8846 
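// VectorRearrange permutes elements through an index vector:
// dst[i] = src[shuffle[i]]. VectorLoadShuffle prepares the index vector in
// whatever form the rearrange rule for the same element type expects.
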
 8847 // LoadShuffle/Rearrange for Byte
 8848 
 8849 instruct loadShuffleB(vec dst) %{
 8850   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8851   match(Set dst (VectorLoadShuffle dst));
 8852   format %{ "vector_load_shuffle $dst, $dst" %}
 8853   ins_encode %{
 8854     // empty
 8855   %}
 8856   ins_pipe( pipe_slow );
 8857 %}
 8858 
 8859 instruct rearrangeB(vec dst, vec shuffle) %{
 8860   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8861             Matcher::vector_length(n) < 32);
 8862   match(Set dst (VectorRearrange dst shuffle));
 8863   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8864   ins_encode %{
 8865     assert(UseSSE >= 4, "required");
 8866     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8867   %}
 8868   ins_pipe( pipe_slow );
 8869 %}
 8870 
 8871 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8872   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8873             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8874   match(Set dst (VectorRearrange src shuffle));
 8875   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8876   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8877   ins_encode %{
 8878     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
    // Perform the blend
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8889   %}
 8890   ins_pipe( pipe_slow );
 8891 %}
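
// Note on the rule above: vpshufb only shuffles within each 128-bit lane,
// so entries with indices >= 16 must be taken from the lane-swapped copy.
// Presumably vector_byte_shufflemask() biases the indices so that exactly
// those entries end up with the sign bit set, which is the bit vpblendvb
// selects on.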
 8892 
 8894 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8895   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8896             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8897   match(Set dst (VectorRearrange src shuffle));
 8898   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8900   ins_encode %{
 8901     int vlen_enc = vector_length_encoding(this);
 8902     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8903                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8904                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8905   %}
 8906   ins_pipe( pipe_slow );
 8907 %}
 8908 
 8909 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8910   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8911             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8912   match(Set dst (VectorRearrange src shuffle));
 8913   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8914   ins_encode %{
 8915     int vlen_enc = vector_length_encoding(this);
 8916     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8917   %}
 8918   ins_pipe( pipe_slow );
 8919 %}
 8920 
 8921 // LoadShuffle/Rearrange for Short
 8922 
 8923 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8924   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8925             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8926   match(Set dst (VectorLoadShuffle src));
 8927   effect(TEMP dst, TEMP vtmp);
 8928   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8929   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask,
    // since only a byte shuffle instruction is available on these platforms
 8932     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8933     if (UseAVX == 0) {
 8934       assert(vlen_in_bytes <= 16, "required");
 8935       // Multiply each shuffle by two to get byte index
 8936       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8937       __ psllw($vtmp$$XMMRegister, 1);
 8938 
 8939       // Duplicate to create 2 copies of byte index
 8940       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8941       __ psllw($dst$$XMMRegister, 8);
 8942       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8943 
 8944       // Add one to get alternate byte index
 8945       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8946       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8947     } else {
 8948       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8949       int vlen_enc = vector_length_encoding(this);
 8950       // Multiply each shuffle by two to get byte index
 8951       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8952       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8953 
 8954       // Duplicate to create 2 copies of byte index
 8955       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8956       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8957 
 8958       // Add one to get alternate byte index
 8959       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8960     }
 8961   %}
 8962   ins_pipe( pipe_slow );
 8963 %}
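
// Illustrative walk-through of loadShuffleS (hypothetical input): a short
// shuffle {2, 0} is zero-extended and doubled to byte indices {4, 0},
// duplicated into the word pattern {0x0404, 0x0000}, then offset by
// vector_short_shufflemask() (presumably 0x0100 per word) to give the byte
// selectors {0x0504, 0x0100}, i.e. each short lane now names its two source
// bytes for the pshufb/vpshufb in the rearrange rules.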
 8964 
 8965 instruct rearrangeS(vec dst, vec shuffle) %{
 8966   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8967             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8968   match(Set dst (VectorRearrange dst shuffle));
 8969   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8970   ins_encode %{
 8971     assert(UseSSE >= 4, "required");
 8972     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8973   %}
 8974   ins_pipe( pipe_slow );
 8975 %}
 8976 
 8977 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8978   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8979             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8980   match(Set dst (VectorRearrange src shuffle));
 8981   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8982   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8983   ins_encode %{
 8984     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
    // Perform the blend
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8995   %}
 8996   ins_pipe( pipe_slow );
 8997 %}
 8998 
 8999 instruct loadShuffleS_evex(vec dst, vec src) %{
 9000   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 9001             VM_Version::supports_avx512bw());
 9002   match(Set dst (VectorLoadShuffle src));
 9003   format %{ "vector_load_shuffle $dst, $src" %}
 9004   ins_encode %{
 9005     int vlen_enc = vector_length_encoding(this);
 9006     if (!VM_Version::supports_avx512vl()) {
 9007       vlen_enc = Assembler::AVX_512bit;
 9008     }
 9009     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9010   %}
 9011   ins_pipe( pipe_slow );
 9012 %}
 9013 
 9014 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 9015   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 9016             VM_Version::supports_avx512bw());
 9017   match(Set dst (VectorRearrange src shuffle));
 9018   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9019   ins_encode %{
 9020     int vlen_enc = vector_length_encoding(this);
 9021     if (!VM_Version::supports_avx512vl()) {
 9022       vlen_enc = Assembler::AVX_512bit;
 9023     }
 9024     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9025   %}
 9026   ins_pipe( pipe_slow );
 9027 %}
 9028 
 9029 // LoadShuffle/Rearrange for Integer and Float
 9030 
 9031 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 9032   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9033             Matcher::vector_length(n) == 4 && UseAVX == 0);
 9034   match(Set dst (VectorLoadShuffle src));
 9035   effect(TEMP dst, TEMP vtmp);
 9036   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9037   ins_encode %{
 9038     assert(UseSSE >= 4, "required");
 9039 
    // Create a byte shuffle mask from the int shuffle mask,
    // since only a byte shuffle instruction is available on these platforms
 9042 
 9043     // Duplicate and multiply each shuffle by 4
 9044     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 9045     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9046     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9047     __ psllw($vtmp$$XMMRegister, 2);
 9048 
 9049     // Duplicate again to create 4 copies of byte index
 9050     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 9051     __ psllw($dst$$XMMRegister, 8);
 9052     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 9053 
 9054     // Add 3,2,1,0 to get alternate byte index
 9055     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 9056     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 9057   %}
 9058   ins_pipe( pipe_slow );
 9059 %}
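
// Illustrative walk-through of loadShuffleI (hypothetical input): an int
// shuffle entry 1 is widened to 0x00000001, duplicated across words to
// 0x00010001, scaled by 4 to 0x00040004, duplicated across bytes to
// 0x04040404 and offset by vector_int_shufflemask() (presumably 0x03020100
// per int), giving the byte selectors 0x07060504 -- the four bytes of int
// element 1.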
 9060 
 9061 instruct rearrangeI(vec dst, vec shuffle) %{
 9062   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9063             UseAVX == 0);
 9064   match(Set dst (VectorRearrange dst shuffle));
 9065   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 9066   ins_encode %{
 9067     assert(UseSSE >= 4, "required");
 9068     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 9069   %}
 9070   ins_pipe( pipe_slow );
 9071 %}
 9072 
 9073 instruct loadShuffleI_avx(vec dst, vec src) %{
 9074   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9075             UseAVX > 0);
 9076   match(Set dst (VectorLoadShuffle src));
 9077   format %{ "vector_load_shuffle $dst, $src" %}
 9078   ins_encode %{
 9079     int vlen_enc = vector_length_encoding(this);
 9080     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9081   %}
 9082   ins_pipe( pipe_slow );
 9083 %}
 9084 
 9085 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 9086   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9087             UseAVX > 0);
 9088   match(Set dst (VectorRearrange src shuffle));
 9089   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9090   ins_encode %{
 9091     int vlen_enc = vector_length_encoding(this);
 9092     BasicType bt = Matcher::vector_element_basic_type(this);
 9093     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9094   %}
 9095   ins_pipe( pipe_slow );
 9096 %}
 9097 
 9098 // LoadShuffle/Rearrange for Long and Double
 9099 
 9100 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 9101   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9102             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9103   match(Set dst (VectorLoadShuffle src));
 9104   effect(TEMP dst, TEMP vtmp);
 9105   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9106   ins_encode %{
 9107     assert(UseAVX >= 2, "required");
 9108 
 9109     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask,
    // since only a double word shuffle instruction is available on these platforms
 9112 
 9113     // Multiply each shuffle by two to get double word index
 9114     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9115     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 9116 
 9117     // Duplicate each double word shuffle
 9118     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9119     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9120 
 9121     // Add one to get alternate double word index
 9122     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9123   %}
 9124   ins_pipe( pipe_slow );
 9125 %}
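
// Illustrative walk-through of loadShuffleL (hypothetical input): a long
// shuffle entry 1 is zero-extended to 64 bits, doubled to 2, duplicated
// into both dword halves (dwords {2, 2}) and offset by
// vector_long_shufflemask() (presumably dwords {0, 1} per long), yielding
// the dword indices {2, 3} -- exactly the two dwords of long element 1 that
// the vpermd in rearrangeL then gathers.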
 9126 
 9127 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9128   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9129             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9130   match(Set dst (VectorRearrange src shuffle));
 9131   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9132   ins_encode %{
 9133     assert(UseAVX >= 2, "required");
 9134 
 9135     int vlen_enc = vector_length_encoding(this);
 9136     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9137   %}
 9138   ins_pipe( pipe_slow );
 9139 %}
 9140 
 9141 instruct loadShuffleL_evex(vec dst, vec src) %{
 9142   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9143             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9144   match(Set dst (VectorLoadShuffle src));
 9145   format %{ "vector_load_shuffle $dst, $src" %}
 9146   ins_encode %{
 9147     assert(UseAVX > 2, "required");
 9148 
 9149     int vlen_enc = vector_length_encoding(this);
 9150     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9151   %}
 9152   ins_pipe( pipe_slow );
 9153 %}
 9154 
 9155 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9156   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9157             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9158   match(Set dst (VectorRearrange src shuffle));
 9159   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9160   ins_encode %{
 9161     assert(UseAVX > 2, "required");
 9162 
 9163     int vlen_enc = vector_length_encoding(this);
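    // vpermq with a vector index operand has no 128-bit form, so promote a
    // 128-bit encoding to 256 bits; the extra lanes are dead.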
 9164     if (vlen_enc == Assembler::AVX_128bit) {
 9165       vlen_enc = Assembler::AVX_256bit;
 9166     }
 9167     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9168   %}
 9169   ins_pipe( pipe_slow );
 9170 %}
 9171 
 9172 // --------------------------------- FMA --------------------------------------
 9173 // a * b + c
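// The rules below emit true fused multiply-adds: the product feeds the
// addition without an intermediate rounding step, so results can differ
// from a separate multiply and add in the last bit.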
 9174 
 9175 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9176   match(Set c (FmaVF  c (Binary a b)));
 9177   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9178   ins_cost(150);
 9179   ins_encode %{
 9180     assert(UseFMA, "not enabled");
 9181     int vlen_enc = vector_length_encoding(this);
 9182     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9183   %}
 9184   ins_pipe( pipe_slow );
 9185 %}
 9186 
 9187 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9188   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9189   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9190   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9191   ins_cost(150);
 9192   ins_encode %{
 9193     assert(UseFMA, "not enabled");
 9194     int vlen_enc = vector_length_encoding(this);
 9195     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9196   %}
 9197   ins_pipe( pipe_slow );
 9198 %}
 9199 
 9200 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9201   match(Set c (FmaVD  c (Binary a b)));
 9202   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9203   ins_cost(150);
 9204   ins_encode %{
 9205     assert(UseFMA, "not enabled");
 9206     int vlen_enc = vector_length_encoding(this);
 9207     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9208   %}
 9209   ins_pipe( pipe_slow );
 9210 %}
 9211 
 9212 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9213   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9214   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9215   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9216   ins_cost(150);
 9217   ins_encode %{
 9218     assert(UseFMA, "not enabled");
 9219     int vlen_enc = vector_length_encoding(this);
 9220     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9221   %}
 9222   ins_pipe( pipe_slow );
 9223 %}
 9224 
 9225 // --------------------------------- Vector Multiply Add --------------------------------------
 9226 
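// MulAddVS2VI multiplies adjacent short pairs and sums each pair into an
// int lane, matching pmaddwd/vpmaddwd exactly:
//   dst.int[i] = src1.short[2i]   * src2.short[2i]
//              + src1.short[2i+1] * src2.short[2i+1]
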
 9227 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9228   predicate(UseAVX == 0);
 9229   match(Set dst (MulAddVS2VI dst src1));
 9230   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9231   ins_encode %{
 9232     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9233   %}
 9234   ins_pipe( pipe_slow );
 9235 %}
 9236 
 9237 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9238   predicate(UseAVX > 0);
 9239   match(Set dst (MulAddVS2VI src1 src2));
 9240   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9241   ins_encode %{
 9242     int vlen_enc = vector_length_encoding(this);
 9243     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9244   %}
 9245   ins_pipe( pipe_slow );
 9246 %}
 9247 
 9248 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9249 
 9250 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9251   predicate(VM_Version::supports_avx512_vnni());
 9252   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9253   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9254   ins_encode %{
 9255     assert(UseAVX > 2, "required");
 9256     int vlen_enc = vector_length_encoding(this);
 9257     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9258   %}
 9259   ins_pipe( pipe_slow );
 9260   ins_cost(10);
 9261 %}
 9262 
 9263 // --------------------------------- PopCount --------------------------------------
 9264 
 9265 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9266   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9267   match(Set dst (PopCountVI src));
 9268   match(Set dst (PopCountVL src));
 9269   format %{ "vector_popcount_integral $dst, $src" %}
 9270   ins_encode %{
 9271     int opcode = this->ideal_Opcode();
 9272     int vlen_enc = vector_length_encoding(this, $src);
 9273     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9274     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9275   %}
 9276   ins_pipe( pipe_slow );
 9277 %}
 9278 
 9279 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9280   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9281   match(Set dst (PopCountVI src mask));
 9282   match(Set dst (PopCountVL src mask));
 9283   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9284   ins_encode %{
 9285     int vlen_enc = vector_length_encoding(this, $src);
 9286     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9287     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9288     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9289   %}
 9290   ins_pipe( pipe_slow );
 9291 %}
 9292 
 9293 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9294   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9295   match(Set dst (PopCountVI src));
 9296   match(Set dst (PopCountVL src));
 9297   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9298   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9299   ins_encode %{
 9300     int opcode = this->ideal_Opcode();
 9301     int vlen_enc = vector_length_encoding(this, $src);
 9302     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9303     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9304                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9305   %}
 9306   ins_pipe( pipe_slow );
 9307 %}
 9308 
 9309 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9310 
 9311 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9312   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9313                                               Matcher::vector_length_in_bytes(n->in(1))));
 9314   match(Set dst (CountTrailingZerosV src));
 9315   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9316   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9318   ins_encode %{
 9319     int vlen_enc = vector_length_encoding(this, $src);
 9320     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9321     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9322                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9323   %}
 9324   ins_pipe( pipe_slow );
 9325 %}
 9326 
 9327 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9328   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9329             VM_Version::supports_avx512cd() &&
 9330             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9331   match(Set dst (CountTrailingZerosV src));
 9332   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9333   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9335   ins_encode %{
 9336     int vlen_enc = vector_length_encoding(this, $src);
 9337     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9338     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9339                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9340   %}
 9341   ins_pipe( pipe_slow );
 9342 %}
 9343 
 9344 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9345   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9346   match(Set dst (CountTrailingZerosV src));
 9347   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9348   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9350   ins_encode %{
 9351     int vlen_enc = vector_length_encoding(this, $src);
 9352     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9353     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9354                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9355                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9356   %}
 9357   ins_pipe( pipe_slow );
 9358 %}
 9359 
 9360 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9361   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9362   match(Set dst (CountTrailingZerosV src));
 9363   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9364   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9365   ins_encode %{
 9366     int vlen_enc = vector_length_encoding(this, $src);
 9367     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9368     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9369                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9370   %}
 9371   ins_pipe( pipe_slow );
 9372 %}
 9373 
 9375 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9376 
 9377 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9378   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9379   effect(TEMP dst);
 9380   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9381   ins_encode %{
 9382     int vector_len = vector_length_encoding(this);
 9383     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9384   %}
 9385   ins_pipe( pipe_slow );
 9386 %}
 9387 
 9388 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9389   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9390   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9391   effect(TEMP dst);
 9392   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9393   ins_encode %{
 9394     int vector_len = vector_length_encoding(this);
 9395     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9396   %}
 9397   ins_pipe( pipe_slow );
 9398 %}
 9399 
 9400 // --------------------------------- Rotation Operations ----------------------------------
 9401 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9402   match(Set dst (RotateLeftV src shift));
 9403   match(Set dst (RotateRightV src shift));
 9404   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9405   ins_encode %{
 9406     int opcode      = this->ideal_Opcode();
 9407     int vector_len  = vector_length_encoding(this);
 9408     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9409     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9410   %}
 9411   ins_pipe( pipe_slow );
 9412 %}
 9413 
 9414 instruct vprorate(vec dst, vec src, vec shift) %{
 9415   match(Set dst (RotateLeftV src shift));
 9416   match(Set dst (RotateRightV src shift));
 9417   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9418   ins_encode %{
 9419     int opcode      = this->ideal_Opcode();
 9420     int vector_len  = vector_length_encoding(this);
 9421     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9422     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9423   %}
 9424   ins_pipe( pipe_slow );
 9425 %}
 9426 
 9427 // ---------------------------------- Masked Operations ------------------------------------
 9428 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9429   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9430   match(Set dst (LoadVectorMasked mem mask));
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 9432   ins_encode %{
 9433     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9434     int vlen_enc = vector_length_encoding(this);
 9435     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9436   %}
 9437   ins_pipe( pipe_slow );
 9438 %}
 9439 
 9441 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9442   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9443   match(Set dst (LoadVectorMasked mem mask));
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 9445   ins_encode %{
 9446     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9447     int vector_len = vector_length_encoding(this);
 9448     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9449   %}
 9450   ins_pipe( pipe_slow );
 9451 %}
 9452 
 9453 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9454   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9455   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9456   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9457   ins_encode %{
 9458     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9459     int vlen_enc = vector_length_encoding(src_node);
 9460     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9461     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9462   %}
 9463   ins_pipe( pipe_slow );
 9464 %}
 9465 
 9466 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9467   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9468   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9469   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9470   ins_encode %{
 9471     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9472     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9473     int vlen_enc = vector_length_encoding(src_node);
 9474     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9475   %}
 9476   ins_pipe( pipe_slow );
 9477 %}
 9478 
 9479 #ifdef _LP64
 9480 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9481   match(Set addr (VerifyVectorAlignment addr mask));
 9482   effect(KILL cr);
 9483   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9484   ins_encode %{
 9485     Label Lskip;
 9486     // check if masked bits of addr are zero
 9487     __ testq($addr$$Register, $mask$$constant);
 9488     __ jccb(Assembler::equal, Lskip);
 9489     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9490     __ bind(Lskip);
 9491   %}
 9492   ins_pipe(pipe_slow);
 9493 %}
 9494 
 9495 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9496   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9497   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9498   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9499   ins_encode %{
 9500     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9501     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9502 
 9503     Label DONE;
 9504     int vlen_enc = vector_length_encoding(this, $src1);
 9505     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9506 
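    // ktmp2 holds the lanes outside the mask and ktmp1 the masked lanes
    // that compared equal. Their union is all ones (kortest sets CF) iff
    // every masked lane matched, in which case dst keeps -1; otherwise
    // not + tzcnt finds the lowest zero bit of ktmp1, the index of the
    // first mismatch.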
 9507     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9508     __ mov64($dst$$Register, -1L);
 9509     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9510     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9511     __ jccb(Assembler::carrySet, DONE);
 9512     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9513     __ notq($dst$$Register);
 9514     __ tzcntq($dst$$Register, $dst$$Register);
 9515     __ bind(DONE);
 9516   %}
 9517   ins_pipe( pipe_slow );
 9518 %}
 9519 
 9521 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9522   match(Set dst (VectorMaskGen len));
 9523   effect(TEMP temp, KILL cr);
 9524   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9525   ins_encode %{
 9526     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9527   %}
 9528   ins_pipe( pipe_slow );
 9529 %}
 9530 
 9531 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9532   match(Set dst (VectorMaskGen len));
 9533   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9534   effect(TEMP temp);
 9535   ins_encode %{
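    // Build a mask with the low $len bits set; e.g. a (hypothetical) $len
    // of 5 yields 0x1F.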
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9537     __ kmovql($dst$$KRegister, $temp$$Register);
 9538   %}
 9539   ins_pipe( pipe_slow );
 9540 %}
 9541 
 9542 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9543   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9544   match(Set dst (VectorMaskToLong mask));
 9545   effect(TEMP dst, KILL cr);
 9546   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9547   ins_encode %{
 9548     int opcode = this->ideal_Opcode();
 9549     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9550     int mask_len = Matcher::vector_length(this, $mask);
 9551     int mask_size = mask_len * type2aelembytes(mbt);
 9552     int vlen_enc = vector_length_encoding(this, $mask);
 9553     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9554                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9555   %}
 9556   ins_pipe( pipe_slow );
 9557 %}
 9558 
 9559 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9560   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9561   match(Set dst (VectorMaskToLong mask));
 9562   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9563   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9564   ins_encode %{
 9565     int opcode = this->ideal_Opcode();
 9566     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9567     int mask_len = Matcher::vector_length(this, $mask);
 9568     int vlen_enc = vector_length_encoding(this, $mask);
 9569     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9570                              $dst$$Register, mask_len, mbt, vlen_enc);
 9571   %}
 9572   ins_pipe( pipe_slow );
 9573 %}
 9574 
 9575 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9576   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9577   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9578   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9579   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9580   ins_encode %{
 9581     int opcode = this->ideal_Opcode();
 9582     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9583     int mask_len = Matcher::vector_length(this, $mask);
 9584     int vlen_enc = vector_length_encoding(this, $mask);
 9585     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9586                              $dst$$Register, mask_len, mbt, vlen_enc);
 9587   %}
 9588   ins_pipe( pipe_slow );
 9589 %}
 9590 
 9591 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9592   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9593   match(Set dst (VectorMaskTrueCount mask));
 9594   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9595   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9596   ins_encode %{
 9597     int opcode = this->ideal_Opcode();
 9598     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9599     int mask_len = Matcher::vector_length(this, $mask);
 9600     int mask_size = mask_len * type2aelembytes(mbt);
 9601     int vlen_enc = vector_length_encoding(this, $mask);
 9602     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9603                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9604   %}
 9605   ins_pipe( pipe_slow );
 9606 %}
 9607 
 9608 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9609   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9610   match(Set dst (VectorMaskTrueCount mask));
 9611   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9612   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9613   ins_encode %{
 9614     int opcode = this->ideal_Opcode();
 9615     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9616     int mask_len = Matcher::vector_length(this, $mask);
 9617     int vlen_enc = vector_length_encoding(this, $mask);
 9618     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9619                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9620   %}
 9621   ins_pipe( pipe_slow );
 9622 %}
 9623 
 9624 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9625   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9626   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9627   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9628   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9629   ins_encode %{
 9630     int opcode = this->ideal_Opcode();
 9631     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9632     int mask_len = Matcher::vector_length(this, $mask);
 9633     int vlen_enc = vector_length_encoding(this, $mask);
 9634     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9635                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9636   %}
 9637   ins_pipe( pipe_slow );
 9638 %}
 9639 
 9640 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9641   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9642   match(Set dst (VectorMaskFirstTrue mask));
 9643   match(Set dst (VectorMaskLastTrue mask));
 9644   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9645   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9646   ins_encode %{
 9647     int opcode = this->ideal_Opcode();
 9648     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9649     int mask_len = Matcher::vector_length(this, $mask);
 9650     int mask_size = mask_len * type2aelembytes(mbt);
 9651     int vlen_enc = vector_length_encoding(this, $mask);
 9652     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9653                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9654   %}
 9655   ins_pipe( pipe_slow );
 9656 %}
 9657 
 9658 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9659   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9660   match(Set dst (VectorMaskFirstTrue mask));
 9661   match(Set dst (VectorMaskLastTrue mask));
 9662   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9663   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9664   ins_encode %{
 9665     int opcode = this->ideal_Opcode();
 9666     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9667     int mask_len = Matcher::vector_length(this, $mask);
 9668     int vlen_enc = vector_length_encoding(this, $mask);
 9669     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9670                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9671   %}
 9672   ins_pipe( pipe_slow );
 9673 %}
 9674 
 9675 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9676   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9677   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9678   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9679   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9680   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9681   ins_encode %{
 9682     int opcode = this->ideal_Opcode();
 9683     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9684     int mask_len = Matcher::vector_length(this, $mask);
 9685     int vlen_enc = vector_length_encoding(this, $mask);
 9686     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9687                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9688   %}
 9689   ins_pipe( pipe_slow );
 9690 %}
 9691 
 9692 // --------------------------------- Compress/Expand Operations ---------------------------
 9693 #ifdef _LP64
 9694 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9695   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9696   match(Set dst (CompressV src mask));
 9697   match(Set dst (ExpandV src mask));
 9698   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
  format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9700   ins_encode %{
 9701     int opcode = this->ideal_Opcode();
 9702     int vlen_enc = vector_length_encoding(this);
 9703     BasicType bt  = Matcher::vector_element_basic_type(this);
 9704     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9705                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9706   %}
 9707   ins_pipe( pipe_slow );
 9708 %}
 9709 #endif
 9710 
 9711 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9712   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9713   match(Set dst (CompressV src mask));
 9714   match(Set dst (ExpandV src mask));
 9715   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9716   ins_encode %{
 9717     int opcode = this->ideal_Opcode();
 9718     int vector_len = vector_length_encoding(this);
 9719     BasicType bt  = Matcher::vector_element_basic_type(this);
 9720     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9721   %}
 9722   ins_pipe( pipe_slow );
 9723 %}
 9724 
 9725 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9726   match(Set dst (CompressM mask));
 9727   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9728   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9729   ins_encode %{
 9730     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9731     int mask_len = Matcher::vector_length(this);
 9732     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9733   %}
 9734   ins_pipe( pipe_slow );
 9735 %}
 9736 
 9737 #endif // _LP64
 9738 
 9739 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9740 
 9741 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9742   predicate(!VM_Version::supports_gfni());
 9743   match(Set dst (ReverseV src));
 9744   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9746   ins_encode %{
 9747     int vec_enc = vector_length_encoding(this);
 9748     BasicType bt = Matcher::vector_element_basic_type(this);
 9749     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9750                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9751   %}
 9752   ins_pipe( pipe_slow );
 9753 %}
 9754 
 9755 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9756   predicate(VM_Version::supports_gfni());
 9757   match(Set dst (ReverseV src));
 9758   effect(TEMP dst, TEMP xtmp);
  format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9760   ins_encode %{
 9761     int vec_enc = vector_length_encoding(this);
 9762     BasicType bt  = Matcher::vector_element_basic_type(this);
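    // With the matrix 0x8040201008040201, GF2P8AFFINEQB reverses the bit
    // order within every byte; vector_reverse_bit_gfni presumably adds the
    // byte-order reversal needed for the wider element types.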
 9763     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9764     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9765                                $xtmp$$XMMRegister);
 9766   %}
 9767   ins_pipe( pipe_slow );
 9768 %}
 9769 
 9770 instruct vreverse_byte_reg(vec dst, vec src) %{
 9771   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9772   match(Set dst (ReverseBytesV src));
 9773   effect(TEMP dst);
 9774   format %{ "vector_reverse_byte $dst, $src" %}
 9775   ins_encode %{
 9776     int vec_enc = vector_length_encoding(this);
 9777     BasicType bt = Matcher::vector_element_basic_type(this);
 9778     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9779   %}
 9780   ins_pipe( pipe_slow );
 9781 %}
 9782 
 9783 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9784   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9785   match(Set dst (ReverseBytesV src));
 9786   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                             $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

// ---------------------------------- Vector Count Leading Zeros -----------------------------------

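// The EVEX forms below lean on VPLZCNT{D,Q} (AVX512CD); subword lanes are
// derived from the int counts, while the AVX-only forms fall back to an
// emulation sequence built in the XMM temporaries.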
instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
                                              Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (CountLeadingZerosV src));
  format %{ "vector_count_leading_zeros $dst, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                       xnoreg, xnoreg, k0, noreg, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
                                              Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (CountLeadingZerosV src mask));
  format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
                                       xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
            VM_Version::supports_avx512cd() &&
            (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                       $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
                                       $rtmp$$Register, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
            !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                      $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
            !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                      $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// ---------------------------------- Vector Masked Operations ------------------------------------

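// The masked patterns below share one shape: $dst doubles as the first input,
// and evmasked_op emits the AVX-512 encoding predicated by $mask; the trailing
// boolean selects merge-masking (masked-off lanes keep their previous $dst
// value) versus zero-masking.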
instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (AddVB (Binary dst src2) mask));
  match(Set dst (AddVS (Binary dst src2) mask));
  match(Set dst (AddVI (Binary dst src2) mask));
  match(Set dst (AddVL (Binary dst src2) mask));
  match(Set dst (AddVF (Binary dst src2) mask));
  match(Set dst (AddVD (Binary dst src2) mask));
  format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (XorV (Binary dst src2) mask));
  format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
  format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (OrV (Binary dst src2) mask));
  format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
  format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (AndV (Binary dst src2) mask));
  format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
  format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (SubVB (Binary dst src2) mask));
  match(Set dst (SubVS (Binary dst src2) mask));
  match(Set dst (SubVI (Binary dst src2) mask));
  match(Set dst (SubVL (Binary dst src2) mask));
  match(Set dst (SubVF (Binary dst src2) mask));
  match(Set dst (SubVD (Binary dst src2) mask));
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MulVS (Binary dst src2) mask));
  match(Set dst (MulVI (Binary dst src2) mask));
  match(Set dst (MulVL (Binary dst src2) mask));
  match(Set dst (MulVF (Binary dst src2) mask));
  match(Set dst (MulVD (Binary dst src2) mask));
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt_reg_masked(vec dst, kReg mask) %{
  match(Set dst (SqrtVF dst mask));
  match(Set dst (SqrtVD dst mask));
  format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (DivVF (Binary dst src2) mask));
  match(Set dst (DivVD (Binary dst src2) mask));
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (RotateLeftV (Binary dst shift) mask));
  match(Set dst (RotateRightV (Binary dst shift) mask));
  format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (RotateLeftV (Binary dst src2) mask));
  match(Set dst (RotateRightV (Binary dst src2) mask));
  format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
  match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
  match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
  format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (LShiftVS (Binary dst src2) mask));
  match(Set dst (LShiftVI (Binary dst src2) mask));
  match(Set dst (LShiftVL (Binary dst src2) mask));
  format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}

instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (LShiftVS (Binary dst src2) mask));
  match(Set dst (LShiftVI (Binary dst src2) mask));
  match(Set dst (LShiftVL (Binary dst src2) mask));
  format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (RShiftVS (Binary dst src2) mask));
  match(Set dst (RShiftVI (Binary dst src2) mask));
  match(Set dst (RShiftVL (Binary dst src2) mask));
  format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (RShiftVS (Binary dst src2) mask));
  match(Set dst (RShiftVI (Binary dst src2) mask));
  match(Set dst (RShiftVL (Binary dst src2) mask));
  format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst src2) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MinV (Binary dst src2) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (VectorRearrange (Binary dst src2) mask));
  format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask\t! vabs masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask\t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask\t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

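// Masked compare: the incoming mask predicates the comparison itself, so only
// lanes whose mask bit is set can produce a set bit in $dst.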
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Comparison: integral types pick a signed/unsigned predicate, FP types an FP predicate.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt\t! using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt\t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

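// VectorLongToMask: without a true predicate register the mask bits are
// materialized into a vector (long_to_mask*_avx); with one, the conversion is
// a single kmov (long_to_mask_evex).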
instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
#endif

instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

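// CastVV is a compile-time re-typing of the same register: all three flavors
// below are zero-size and emit no code.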
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

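// vfpclassss/vfpclasssd with immediate 0x18 tests the +Infinity (bit 3) and
// -Infinity (bit 4) classes, so the resulting mask bit is set iff the scalar
// input is infinite.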
instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteF src));
  effect(TEMP ktmp, KILL cr);
  format %{ "float_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteD src));
  effect(TEMP ktmp, KILL cr);
  format %{ "double_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}

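// Saturating add/sub: byte/short lanes map directly onto the packed
// saturating instructions (signed vs. unsigned form chosen by the predicate);
// int/long lanes have no hardware equivalent and are emulated by the
// _evex/_avx helpers below.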
instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
            (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
  format %{ "vector_addsub_saturating_evex $dst, $src1, $src2\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                                        $src1$$XMMRegister, $src2$$XMMRegister,
                                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
                                        $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
            Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
  format %{ "vector_addsub_saturating_avx $dst, $src1, $src2\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
                                       $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
                                       $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
  match(Set dst (SaturatingAddV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
  format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2\t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
  match(Set dst (SaturatingAddV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
  format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                                             $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP ktmp);
  format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2\t! using $ktmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
                                              $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2\t! using $xtmp1 and $xtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                                             $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 (LoadVector src2)));
  match(Set dst (SaturatingSubV src1 (LoadVector src2)));
  format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 (LoadVector src2)));
  match(Set dst (SaturatingSubV src1 (LoadVector src2)));
  format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst src) mask));
  match(Set dst (SaturatingSubV (Binary dst src) mask));
  format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst src) mask));
  match(Set dst (SaturatingSubV (Binary dst src) mask));
  format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
  match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
  format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
  match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
  format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

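// SelectFromTwoVector: $index serves as both selector and result, matching
// the register usage of VPERMI2*-style two-table permutes, which overwrite
// the index operand.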
instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
%{
  match(Set index (SelectFromTwoVector (Binary index src1) src2));
  format %{ "select_from_two_vector $index, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}